diff --git a/.env.example b/.env.example index 1a670126..68cf257a 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,57 @@ -SYNTHESIZER_MODEL= +# Tokenizer +TOKENIZER_MODEL= + +# LLM +# Support different backends: http_api, openai_api, ollama_api, ollama, huggingface, tgi, sglang, tensorrt + +# http_api / openai_api +SYNTHESIZER_BACKEND=openai_api +SYNTHESIZER_MODEL=gpt-4o-mini SYNTHESIZER_BASE_URL= SYNTHESIZER_API_KEY= -TRAINEE_MODEL= +TRAINEE_BACKEND=openai_api +TRAINEE_MODEL=gpt-4o-mini TRAINEE_BASE_URL= TRAINEE_API_KEY= + +# azure_openai_api +# SYNTHESIZER_BACKEND=azure_openai_api +# The following is the same as your "Deployment name" in Azure +# SYNTHESIZER_MODEL= +# SYNTHESIZER_BASE_URL=https://.openai.azure.com/openai/deployments//chat/completions +# SYNTHESIZER_API_KEY= +# SYNTHESIZER_API_VERSION= + +# # ollama_api +# SYNTHESIZER_BACKEND=ollama_api +# SYNTHESIZER_MODEL=gemma3 +# SYNTHESIZER_BASE_URL=http://localhost:11434 +# +# Note: TRAINEE with ollama_api backend is not supported yet as ollama_api does not support logprobs. + +# # huggingface +# SYNTHESIZER_BACKEND=huggingface +# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct +# +# TRAINEE_BACKEND=huggingface +# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct + +# # sglang +# SYNTHESIZER_BACKEND=sglang +# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct +# SYNTHESIZER_TP_SIZE=1 +# SYNTHESIZER_NUM_GPUS=1 + +# TRAINEE_BACKEND=sglang +# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct +# SYNTHESIZER_TP_SIZE=1 +# SYNTHESIZER_NUM_GPUS=1 + +# # vllm +# SYNTHESIZER_BACKEND=vllm +# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct +# SYNTHESIZER_NUM_GPUS=1 + +# TRAINEE_BACKEND=vllm +# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct +# TRAINEE_NUM_GPUS=1 diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..74514987 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,24 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..bbcbbe7d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/contributing.md b/.github/contributing.md new file mode 100644 index 00000000..f0b95d32 --- /dev/null +++ b/.github/contributing.md @@ -0,0 +1,34 @@ +## Contribution Guide +Here are the steps to contribute to this project: + +1. Star this repository. +2. Fork this repository. 
+ + Type the following command on Git bash console: + ```bash + git clone https://github.com/open-sciencelab/GraphGen.git + ``` + +3. Create a new branch + + Now before making changes to the files, go to your terminal under the repo you just cloned, and type the following: + + ```bash + git checkout -b add-my-name + ``` + + By running the above command, you just created a new branch called add-my-name and checked it out, what this does is that it creates a new branch with the commit history of the master branch or the branch that you were on previously. + +4. Make your changes and push your code. + + ``` + git add . + git commit -m "xxx" + git push + ``` + + This will create a new commit with the changes you made. + +5. Now create a pull request and add the title. + + Sit back and relax while your pull request is being reviewed and merged. diff --git a/.github/sync-config.yml b/.github/sync-config.yml new file mode 100644 index 00000000..417ed83b --- /dev/null +++ b/.github/sync-config.yml @@ -0,0 +1,17 @@ +sync: + - source: graphgen/ + dest: graphgen/ + - source: resources/nltk_data/ + dest: resources/nltk_data/ + - source: resources/examples/ + dest: resources/examples/ + - source: resources/images/logo.png + dest: resources/images/logo.png + - source: webui/ + dest: webui/ + - source: webui/app.py + dest: app.py + - source: requirements.txt + dest: requirements.txt + - source: LICENSE + dest: LICENSE diff --git a/.github/workflows/push-to-hf.yml b/.github/workflows/push-to-hf.yml new file mode 100644 index 00000000..81f23c04 --- /dev/null +++ b/.github/workflows/push-to-hf.yml @@ -0,0 +1,51 @@ +name: Push demo branch to Hugging Face + +on: + workflow_call: + inputs: + ref: + required: false + default: demo + type: string + secrets: + HF_TOKEN: + required: true + +jobs: + push-hf: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref }} + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Configure Git identity + run: | + git config --global user.email "actions@github.com" + git config --global user.name "github-actions[bot]" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install huggingface_hub + + - name: Push to Hugging Face + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_REPO_TYPE: spaces + HF_REPO_ID: chenzihong/GraphGen + run: | + git config --global credential.helper store + echo "https://user:${HF_TOKEN}@huggingface.co" > ~/.git-credentials + + [[ -d hf-repo ]] && rm -rf hf-repo + git clone https://huggingface.co/${HF_REPO_TYPE}/${HF_REPO_ID} hf-repo + + rsync -a --delete --exclude='.git' --exclude='hf-repo' --exclude='README.md' ./ hf-repo/ + + cd hf-repo + git add . 
+ git diff-index --quiet HEAD || \ + (git commit -m "Auto-sync from ${{ inputs.ref }} at $(date -u)" && git push) \ No newline at end of file diff --git a/.github/workflows/push-to-ms.yml b/.github/workflows/push-to-ms.yml new file mode 100644 index 00000000..139da12e --- /dev/null +++ b/.github/workflows/push-to-ms.yml @@ -0,0 +1,50 @@ +name: Push demo branch to ModelScope + +on: + workflow_call: + inputs: + ref: + required: false + default: demo + type: string + secrets: + MS_TOKEN: + required: true + +jobs: + push-ms: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref }} + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Configure Git identity + run: | + git config --global user.email "actions@github.com" + git config --global user.name "github-actions[bot]" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + # ModelScope official SDK (optional, install only if you need to call the platform API) + pip install modelscope + + - name: Push to ModelScope + env: + MS_TOKEN: ${{ secrets.MS_TOKEN }} + MS_REPO_TYPE: studios + MS_REPO_ID: chenzihong/GraphGen + run: | + [[ -d ms-repo ]] && rm -rf ms-repo + git clone https://oauth2:${MS_TOKEN}@www.modelscope.cn/${MS_REPO_TYPE}/${MS_REPO_ID}.git ms-repo + + rsync -a --delete --exclude='.git' --exclude='ms-repo' --exclude='README.md' ./ ms-repo/ + + cd ms-repo + git add . + git diff-index --quiet HEAD || \ + (git commit -m "Auto-sync from ${{ inputs.ref }} at $(date -u)" && \ + git push "https://oauth2:${MS_TOKEN}@www.modelscope.cn/${MS_REPO_TYPE}/${MS_REPO_ID}.git") diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 1e00ab5b..0cc7dfe1 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.10", "3.11"] + python-version: ["3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/workflow.yml b/.github/workflows/pypi.yml similarity index 100% rename from .github/workflows/workflow.yml rename to .github/workflows/pypi.yml diff --git a/.github/workflows/sync-demo.yml b/.github/workflows/sync-demo.yml new file mode 100644 index 00000000..97ee1fed --- /dev/null +++ b/.github/workflows/sync-demo.yml @@ -0,0 +1,96 @@ +name: Sync Demo Branch + +on: + push: + branches: + - main + workflow_dispatch: + +jobs: + sync-demo: + runs-on: ubuntu-latest + + steps: + - name: Checkout main branch + uses: actions/checkout@v4 + with: + ref: main + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Create demo branch if it doesn't exist + run: | + if ! git ls-remote --heads origin demo | grep -q demo; then + echo "Creating demo branch..." + git checkout -b demo + git push origin demo + else + echo "Demo branch already exists" + fi + + - name: Checkout demo branch + uses: actions/checkout@v4 + with: + ref: demo + token: ${{ secrets.GITHUB_TOKEN }} + path: demo + + - name: Clean demo directory + run: | + cd demo + find . 
-mindepth 1 -path './.git' -prune -o -exec rm -rf {} + 2>/dev/null || true + + - name: Copy files using config + run: | + yq eval '.sync[] | .source + ":" + .dest' .github/sync-config.yml | while IFS=: read -r src dst; do + src=$(echo "$src" | xargs) + dst=$(echo "$dst" | xargs) + + [ -z "$src" ] && continue + + if [ -e "$src" ]; then + target_path="demo/$dst" + + # 处理目录(以/结尾或本身是目录) + if [[ "$dst" == */ ]] || [ -d "$src" ]; then + mkdir -p "$target_path" + # 复制目录里的内容,而不是目录本身 + cp -r "$src"/* "$target_path" + echo "Copied $src/* → $target_path" + else + mkdir -p "$(dirname "$target_path")" + cp "$src" "$target_path" + echo "Copied $src → $target_path" + fi + else + echo "Source not found: $src" + fi + done + + - name: Commit and push changes + run: | + cd demo + git config --global user.email "actions@github.com" + git config --global user.name "github-actions[bot]" + + # 检查是否有变化 + if [[ -n $(git status --porcelain) ]]; then + git add . + git commit -m "Auto-sync demo branch with main branch" + git push origin demo + echo "Changes pushed to demo branch" + else + echo "No changes to sync" + fi + + push-hf: + needs: sync-demo + uses: ./.github/workflows/push-to-hf.yml + secrets: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + push-ms: + needs: sync-demo + uses: ./.github/workflows/push-to-ms.yml + secrets: + MS_TOKEN: ${{ secrets.MS_TOKEN }} + with: + ref: demo diff --git a/.gitignore b/.gitignore index 678cdc50..c5ea805f 100644 --- a/.gitignore +++ b/.gitignore @@ -174,6 +174,36 @@ cython_debug/ .pypirc cache +cache_* +databases/ *.pyc *.html .gradio +*.sst +*.db/ +*.nhr +*.ndb +*.nin +*.njs +*.nog +*.nos +*.not +*.nsq +*.ntf +*.nto +*.phr +*.pin +*.pdb +*.pjs +*.pog +*.pos +*.psq +*.ptf +*.pto +*.wal +graph_kuzu* +resources/ + +# macOS +.DS_Store +**/.DS_Store diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..5f799fde --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +repos: + - repo: https://github.com/psf/black + rev: 22.10.0 + hooks: + - id: black + name: black + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + name: isort + - repo: https://github.com/PyCQA/pylint + rev: v2.15.5 + hooks: + - id: pylint + name: pylint + entry: pylint + language: system + types: [python] diff --git a/.pylintrc b/.pylintrc index 594a7f7f..094d8888 100644 --- a/.pylintrc +++ b/.pylintrc @@ -100,7 +100,7 @@ source-roots= # When enabled, pylint would attempt to guess common misconfiguration and emit # user-friendly hints instead of false-positive error messages. -suggestion-mode=yes +# suggestion-mode=yes # Allow loading of arbitrary C extensions. Extensions are imported into the # active Python interpreter and may run arbitrary code. @@ -308,7 +308,7 @@ max-public-methods=20 max-returns=6 # Maximum number of statements in function / method body. -max-statements=50 +max-statements=60 # Minimum number of public methods for a class (see R0903). min-public-methods=2 @@ -452,6 +452,7 @@ disable=raw-checker-failed, R0917, # Too many positional arguments (6/5) (too-many-positional-arguments) C0103, E0401, + W0718, # Catching too general exception Exception (broad-except) # Enable the message, report, category or checker with the given id(s). 
You can # either give multiple identifier separated by comma (,) or put this option diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..e7e6ecd1 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +chenzihong_gavin@foxmail.com. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. 
+ +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. 
diff --git a/MANIFEST.in b/MANIFEST.in index b7fd18c5..d1a9003b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ +include requirements.txt include graphgen/configs/*.yaml recursive-include graphgen/configs *.yaml diff --git a/README.md b/README.md index a30b4540..99288683 100644 --- a/README.md +++ b/README.md @@ -8,28 +8,34 @@ [![forks](https://img.shields.io/github/forks/open-sciencelab/GraphGen.svg)](https://github.com/open-sciencelab/GraphGen) [![open issues](https://img.shields.io/github/issues-raw/open-sciencelab/GraphGen)](https://github.com/open-sciencelab/GraphGen/issues) [![issue resolution](https://img.shields.io/github/issues-closed-raw/open-sciencelab/GraphGen)](https://github.com/open-sciencelab/GraphGen/issues) -[![documentation](https://img.shields.io/badge/docs-latest-blue)](https://graphgen-cookbook.readthedocs.io/en/latest/) +[![documentation](https://img.shields.io/badge/docs-latest-blue)](https://chenzihong.gitbook.io/graphgen-cookbook/) +[![pypi](https://img.shields.io/pypi/v/graphg.svg?style=flat&logo=pypi&logoColor=white)](https://pypi.org/project/graphg/) [![wechat](https://img.shields.io/badge/wechat-brightgreen?logo=wechat&logoColor=white)](https://cdn.vansin.top/internlm/dou.jpg) [![arXiv](https://img.shields.io/badge/Paper-arXiv-white)](https://arxiv.org/abs/2505.20416) -[![Hugging Face](https://img.shields.io/badge/Paper-on%20HF-blue?logo=huggingface&logoColor=yellow)](https://huggingface.co/papers/2505.20416) +[![Hugging Face](https://img.shields.io/badge/Paper-on%20HF-white?logo=huggingface&logoColor=yellow)](https://huggingface.co/papers/2505.20416) +[![Hugging Face](https://img.shields.io/badge/Demo-on%20HF-blue?logo=huggingface&logoColor=yellow)](https://huggingface.co/spaces/chenzihong/GraphGen) +[![Model Scope](https://img.shields.io/badge/%F0%9F%A4%96%20Demo-on%20MS-green)](https://modelscope.cn/studios/chenzihong/GraphGen) GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation -
+[English](README.md) | [中文](README_zh.md) + +
📚 Table of Contents - 📝 [What is GraphGen?](#-what-is-graphgen) -- 🚀 [Quick Start](#-quick-start) - 📌 [Latest Updates](#-latest-updates) +- ⚙️ [Support List](#-support-list) +- 🚀 [Quick Start](#-quick-start) - 🏗️ [System Architecture](#-system-architecture) - 🍀 [Acknowledgements](#-acknowledgements) - 📚 [Citation](#-citation) - 📜 [License](#-license) +- 📅 [Star History](#-star-history) [//]: # (- 🌟 [Key Features](#-key-features)) -[//]: # (- 📅 [Roadmap](#-roadmap)) [//]: # (- 💰 [Cost Analysis](#-cost-analysis)) [//]: # (- ⚙️ [Configurations](#-configurations)) @@ -37,30 +43,139 @@ GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthe ## 📝 What is GraphGen? -GraphGen is a framework for synthetic data generation guided by knowledge graphs. Here is our [**paper**](https://github.com/open-sciencelab/GraphGen/tree/main/resources/GraphGen.pdf) and [best practice](https://github.com/open-sciencelab/GraphGen/issues/17). +GraphGen is a framework for synthetic data generation guided by knowledge graphs. Please check the [**paper**](https://arxiv.org/abs/2505.20416) and [best practice](https://github.com/open-sciencelab/GraphGen/issues/17). + +Here is post-training result which **over 50% SFT data** comes from GraphGen and our data clean pipeline. + +| Domain | Dataset | Ours | Qwen2.5-7B-Instruct (baseline) | +|:---------:|:---------------------------------------------------------:|:--------:|:------------------------------:| +| Plant | [SeedBench](https://github.com/open-sciencelab/SeedBench) | **65.9** | 51.5 | +| Common | CMMLU | 73.6 | **75.8** | +| Knowledge | GPQA-Diamond | **40.0** | 33.3 | +| Math | AIME24 | **20.6** | 16.7 | +| | AIME25 | **22.7** | 7.2 | It begins by constructing a fine-grained knowledge graph from the source text,then identifies knowledge gaps in LLMs using the expected calibration error metric, prioritizing the generation of QA pairs that target high-value, long-tail knowledge. Furthermore, GraphGen incorporates multi-hop neighborhood sampling to capture complex relational information and employs style-controlled generation to diversify the resulting QA data. +After data generation, you can use [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) and [xtuner](https://github.com/InternLM/xtuner) to finetune your LLMs. + +## 📌 Latest Updates + +- **2025.12.16**: Added [rocksdb](https://github.com/facebook/rocksdb) for key-value storage backend and [kuzudb](https://github.com/kuzudb/kuzu) for graph database backend support. +- **2025.12.16**: Added [vllm](https://github.com/vllm-project/vllm) for local inference backend support. +- **2025.12.16**: Refactored the data generation pipeline using [ray](https://github.com/ray-project/ray) to improve the efficiency of distributed execution and resource management. + +
+History + +- **2025.12.1**: Added search support for [NCBI](https://www.ncbi.nlm.nih.gov/) and [RNAcentral](https://rnacentral.org/) databases, enabling extraction of DNA and RNA data from these bioinformatics databases. +- **2025.10.30**: We support several new LLM clients and inference backends including [Ollama_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/ollama_client.py), [http_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/http_client.py), [HuggingFace Transformers](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/hf_wrapper.py) and [SGLang](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/sglang_wrapper.py). +- **2025.10.23**: We support VQA(Visual Question Answering) data generation now. Run script: `bash scripts/generate/generate_vqa.sh`. +- **2025.10.21**: We support PDF as input format for data generation now via [MinerU](https://github.com/opendatalab/MinerU). +- **2025.09.29**: We auto-update gradio demo on [Hugging Face](https://huggingface.co/spaces/chenzihong/GraphGen) and [ModelScope](https://modelscope.cn/studios/chenzihong/GraphGen). +- **2025.08.14**: We have added support for community detection in knowledge graphs using the Leiden algorithm, enabling the synthesis of Chain-of-Thought (CoT) data. +- **2025.07.31**: We have added Google, Bing, Wikipedia, and UniProt as search back-ends. +- **2025.04.21**: We have released the initial version of GraphGen. + +
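
To make the expected calibration error (ECE) mentioned in the overview above concrete, the sketch below bins the trainee model's answer confidences and averages the gap between confidence and accuracy. It is illustrative only: the bin count and the idea of deriving confidences from trainee logprobs are assumptions for the example, not GraphGen's actual comprehension-loss implementation.

```python
# Minimal ECE sketch (illustrative; not GraphGen's internal code).
import numpy as np


def expected_calibration_error(confidences, correctness, n_bins=10):
    """Bin predictions by confidence and average |accuracy - confidence| per bin."""
    confidences = np.asarray(confidences, dtype=float)
    correctness = np.asarray(correctness, dtype=float)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        in_bin = (confidences > lo) & (confidences <= hi)
        if in_bin.any():
            gap = abs(correctness[in_bin].mean() - confidences[in_bin].mean())
            ece += in_bin.mean() * gap  # weight each bin by its share of samples
    return ece


# Toy example: four trainee answers with confidences and 0/1 correctness labels.
print(expected_calibration_error([0.9, 0.8, 0.6, 0.3], [1, 1, 0, 0]))
```

A large gap between confidence and accuracy on statements drawn from a region of the knowledge graph is the kind of signal used to prioritize QA generation for that region.
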
+ + +## ⚙️ Support List + +We support various LLM inference servers, API servers, inference clients, input file formats, data modalities, output data formats, and output data types. +Users can flexibly configure according to the needs of synthetic data. + + +| Inference Server | Api Server | Inference Client | Data Source | Data Modal | Data Type | +|--------------------------------------------------------------------------|--------------------------------------------------------------------------------|------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|-------------------------------------------------| +| [![hf-icon]HF][hf]
[![sg-icon]SGLang][sg]
[![vllm-icon]vllm][vllm] | [![sif-icon]Silicon][sif]
[![oai-icon]OpenAI][oai]
[![az-icon]Azure][az] | HTTP
[![ol-icon]Ollama][ol]
[![oai-icon]OpenAI][oai] | Files(CSV, JSON, PDF, TXT, etc.)
Databases([![uniprot-icon]UniProt][uniprot], [![ncbi-icon]NCBI][ncbi], [![rnacentral-icon]RNAcentral][rnacentral])
Search Engines([![bing-icon]Bing][bing], [![google-icon]Google][google])
Knowledge Graphs([![wiki-icon]Wikipedia][wiki]) | TEXT
IMAGE | Aggregated
Atomic
CoT
Multi-hop
VQA | + + +[hf]: https://huggingface.co/docs/transformers/index +[sg]: https://docs.sglang.ai +[vllm]: https://github.com/vllm-project/vllm +[sif]: https://siliconflow.cn +[oai]: https://openai.com +[az]: https://azure.microsoft.com/en-us/services/cognitive-services/openai-service/ +[ol]: https://ollama.com +[uniprot]: https://www.uniprot.org/ +[ncbi]: https://www.ncbi.nlm.nih.gov/ +[rnacentral]: https://rnacentral.org/ +[wiki]: https://www.wikipedia.org/ +[bing]: https://www.bing.com/ +[google]: https://www.google.com + + + +[hf-icon]: https://www.google.com/s2/favicons?domain=https://huggingface.co +[sg-icon]: https://www.google.com/s2/favicons?domain=https://docs.sglang.ai +[vllm-icon]: https://www.google.com/s2/favicons?domain=https://docs.vllm.ai +[sif-icon]: https://www.google.com/s2/favicons?domain=siliconflow.com +[oai-icon]: https://www.google.com/s2/favicons?domain=https://openai.com +[az-icon]: https://www.google.com/s2/favicons?domain=https://azure.microsoft.com +[ol-icon]: https://www.google.com/s2/favicons?domain=https://ollama.com + +[uniprot-icon]: https://www.google.com/s2/favicons?domain=https://www.uniprot.org +[ncbi-icon]: https://www.google.com/s2/favicons?domain=https://www.ncbi.nlm.nih.gov/ +[rnacentral-icon]: https://www.google.com/s2/favicons?domain=https://rnacentral.org/ +[wiki-icon]: https://www.google.com/s2/favicons?domain=https://www.wikipedia.org/ +[bing-icon]: https://www.google.com/s2/favicons?domain=https://www.bing.com/ +[google-icon]: https://www.google.com/s2/favicons?domain=https://www.google.com + + ## 🚀 Quick Start -Experience GraphGen through [Web](https://g-app-center-000704-6802-aerppvq.openxlab.space) or [Backup Web Entrance](https://openxlab.org.cn/apps/detail/tpoisonooo/GraphGen) +Experience GraphGen Demo through [Huggingface](https://huggingface.co/spaces/chenzihong/GraphGen) or [Modelscope](https://modelscope.cn/studios/chenzihong/GraphGen). For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGen/issues/10), open new [issue](https://github.com/open-sciencelab/GraphGen/issues) or join our [wechat group](https://cdn.vansin.top/internlm/dou.jpg) and ask. -### Gradio Demo +### Preparation + +1. Install [uv](https://docs.astral.sh/uv/reference/installer/) + + ```bash + # You could try pipx or pip to install uv when meet network issues, refer the uv doc for more details + curl -LsSf https://astral.sh/uv/install.sh | sh + ``` +2. Clone the repository + + ```bash + git clone --depth=1 https://github.com/open-sciencelab/GraphGen + cd GraphGen + ``` + +3. Create a new uv environment + + ```bash + uv venv --python 3.10 + ``` + +4. Configure the dependencies + ```bash + uv pip install -r requirements.txt + ``` + +### Run Gradio Demo + + ```bash + python -m webui.app + ``` + + For hot-reload during development, run ```bash - python webui/app.py + PYTHONPATH=. gradio webui/app.py ``` + ![ui](https://github.com/user-attachments/assets/3024e9bc-5d45-45f8-a4e6-b57bd2350d84) ### Run from PyPI 1. Install GraphGen ```bash - pip install graphg + uv pip install graphg ``` 2. Run in CLI @@ -76,44 +191,115 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe ### Run from Source -1. Install dependencies - ```bash - pip install -r requirements.txt - ``` -2. Configure the environment +1. 
Configure the environment - Create an `.env` file in the root directory ```bash cp .env.example .env ``` - Set the following environment variables: ```bash - # Synthesizer is the model used to construct KG and generate data - SYNTHESIZER_MODEL=your_synthesizer_model_name - SYNTHESIZER_BASE_URL=your_base_url_for_synthesizer_model - SYNTHESIZER_API_KEY=your_api_key_for_synthesizer_model - # Trainee is the model used to train with the generated data - TRAINEE_MODEL=your_trainee_model_name - TRAINEE_BASE_URL=your_base_url_for_trainee_model - TRAINEE_API_KEY=your_api_key_for_trainee_model + # Tokenizer + TOKENIZER_MODEL= + + # LLM + # Support different backends: http_api, openai_api, ollama_api, ollama, huggingface, tgi, sglang, tensorrt + # Synthesizer is the model used to construct KG and generate data + # Trainee is the model used to train with the generated data + + # http_api / openai_api + SYNTHESIZER_BACKEND=openai_api + SYNTHESIZER_MODEL=gpt-4o-mini + SYNTHESIZER_BASE_URL= + SYNTHESIZER_API_KEY= + TRAINEE_BACKEND=openai_api + TRAINEE_MODEL=gpt-4o-mini + TRAINEE_BASE_URL= + TRAINEE_API_KEY= + + # azure_openai_api + # SYNTHESIZER_BACKEND=azure_openai_api + # The following is the same as your "Deployment name" in Azure + # SYNTHESIZER_MODEL= + # SYNTHESIZER_BASE_URL=https://.openai.azure.com/openai/deployments//chat/completions + # SYNTHESIZER_API_KEY= + # SYNTHESIZER_API_VERSION= + + # # ollama_api + # SYNTHESIZER_BACKEND=ollama_api + # SYNTHESIZER_MODEL=gemma3 + # SYNTHESIZER_BASE_URL=http://localhost:11434 + # + # Note: TRAINEE with ollama_api backend is not supported yet as ollama_api does not support logprobs. + + # # huggingface + # SYNTHESIZER_BACKEND=huggingface + # SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # + # TRAINEE_BACKEND=huggingface + # TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct + + # # sglang + # SYNTHESIZER_BACKEND=sglang + # SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # SYNTHESIZER_TP_SIZE=1 + # SYNTHESIZER_NUM_GPUS=1 + + # TRAINEE_BACKEND=sglang + # TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # SYNTHESIZER_TP_SIZE=1 + # SYNTHESIZER_NUM_GPUS=1 + + # # vllm + # SYNTHESIZER_BACKEND=vllm + # SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # SYNTHESIZER_NUM_GPUS=1 + + # TRAINEE_BACKEND=vllm + # TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # TRAINEE_NUM_GPUS=1 ``` -3. (Optional) If you want to modify the default generated configuration, you can edit the content of the configs/graphgen_config.yaml file. +2. (Optional) Customize generation parameters in `config.yaml` . + + Edit the corresponding YAML file, e.g.: + ```yaml - # configs/graphgen_config.yaml - # Example configuration - data_type: "raw" - input_file: "resources/examples/raw_demo.jsonl" - # more configurations... + # examples/generate/generate_aggregated_qa/aggregated_config.yaml + global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + + nodes: + - id: read_files # id is unique in the pipeline, and can be referenced by other steps + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + # additional settings... ``` -4. Run the generation script - ```bash - bash scripts/generate.sh - ``` -5. Get the generated data + +3. 
Generate data + + Pick the desired format and run the matching script: + + | Format | Script to run | Notes | + | ------------ | ---------------------------------------------------------------------- | -------------------------------------------------------------------------- | + | `cot` | `bash examples/generate/generate_cot_qa/generate_cot.sh` | Chain-of-Thought Q\&A pairs | + | `atomic` | `bash examples/generate/generate_atomic_qa/generate_atomic.sh` | Atomic Q\&A pairs covering basic knowledge | + | `aggregated` | `bash examples/generate/generate_aggregated_qa/generate_aggregated.sh` | Aggregated Q\&A pairs incorporating complex, integrated knowledge | + | `multi-hop` | `examples/generate/generate_multi_hop_qa/generate_multi_hop.sh` | Multi-hop reasoning Q\&A pairs | + | `vqa` | `bash examples/generate/generate_vqa/generate_vqa.sh` | Visual Question Answering pairs combining visual and textual understanding | + + +4. Get the generated data ```bash - ls cache/data/graphgen + ls cache/output ``` -## Run with Docker +### Run with Docker 1. Build the Docker image ```bash docker build -t graphgen . @@ -124,10 +310,6 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe ``` -## 📌 Latest Updates - -- **2025.04.21**: We have released the initial version of GraphGen. - ## 🏗️ System Architecture See [analysis](https://deepwiki.com/open-sciencelab/GraphGen) by deepwiki for a technical overview of the GraphGen system, its architecture, and core functionalities. @@ -138,9 +320,10 @@ See [analysis](https://deepwiki.com/open-sciencelab/GraphGen) by deepwiki for a ## 🍀 Acknowledgements -- [SiliconCloud](https://siliconflow.cn) Abundant LLM API, some models are free +- [SiliconFlow](https://siliconflow.cn) Abundant LLM API, some models are free - [LightRAG](https://github.com/HKUDS/LightRAG) Simple and efficient graph retrieval solution -- [ROGRAG](https://github.com/tpoisonooo/ROGRAG) ROGRAG: A Robustly Optimized GraphRAG Framework +- [ROGRAG](https://github.com/tpoisonooo/ROGRAG) A robustly optimized GraphRAG framework +- [DB-GPT](https://github.com/eosphoros-ai/DB-GPT) An AI native data app development framework ## 📚 Citation @@ -159,3 +342,7 @@ If you find this repository useful, please consider citing our work: ## 📜 License This project is licensed under the [Apache License 2.0](LICENSE). + +## 📅 Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=InternScience/GraphGen&type=Date)](https://www.star-history.com/#InternScience/GraphGen&Date) diff --git a/README_zh.md b/README_zh.md new file mode 100644 index 00000000..f15f5523 --- /dev/null +++ b/README_zh.md @@ -0,0 +1,346 @@ +
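
As a companion to the `nodes` / `dependencies` pipeline config shown in the Quick Start above, here is a small, self-contained sketch of how such a declaration implies an execution order. It is illustrative only: GraphGen's actual ray-based scheduler is not part of this diff, and the two-node config embedded below (the `chunk_documents` step in particular) is invented for the example.

```python
# Illustrative only: order pipeline nodes by their declared dependencies.
# Mimics the aggregated_config.yaml structure shown in the Quick Start.
from graphlib import TopologicalSorter  # stdlib, Python 3.9+

import yaml  # PyYAML

CONFIG = """
nodes:
  - id: read_files
    op_name: read
    type: source
    dependencies: []
  - id: chunk_documents          # made-up follow-on node for demonstration
    op_name: chunk
    type: transform
    dependencies: [read_files]
"""

nodes = yaml.safe_load(CONFIG)["nodes"]
graph = {n["id"]: set(n["dependencies"]) for n in nodes}  # node -> predecessors
order = list(TopologicalSorter(graph).static_order())
print(order)  # ['read_files', 'chunk_documents']
```

Each node's `dependencies` list names the `id`s whose outputs it consumes, so a config like this can be treated as a DAG and ordered topologically before execution.
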

+ +

+ + + +[![stars](https://img.shields.io/github/stars/open-sciencelab/GraphGen.svg)](https://github.com/open-sciencelab/GraphGen) +[![forks](https://img.shields.io/github/forks/open-sciencelab/GraphGen.svg)](https://github.com/open-sciencelab/GraphGen) +[![open issues](https://img.shields.io/github/issues-raw/open-sciencelab/GraphGen)](https://github.com/open-sciencelab/GraphGen/issues) +[![issue resolution](https://img.shields.io/github/issues-closed-raw/open-sciencelab/GraphGen)](https://github.com/open-sciencelab/GraphGen/issues) +[![documentation](https://img.shields.io/badge/docs-latest-blue)](https://graphgen-cookbook.readthedocs.io/en/latest/) +[![pypi](https://img.shields.io/pypi/v/graphg.svg?style=flat&logo=pypi&logoColor=white)](https://pypi.org/project/graphg/) +[![wechat](https://img.shields.io/badge/wechat-brightgreen?logo=wechat&logoColor=white)](https://cdn.vansin.top/internlm/dou.jpg) +[![arXiv](https://img.shields.io/badge/Paper-arXiv-white)](https://arxiv.org/abs/2505.20416) +[![Hugging Face](https://img.shields.io/badge/Paper-on%20HF-white?logo=huggingface&logoColor=yellow)](https://huggingface.co/papers/2505.20416) + +[![Hugging Face](https://img.shields.io/badge/Demo-on%20HF-blue?logo=huggingface&logoColor=yellow)](https://huggingface.co/spaces/chenzihong/GraphGen) +[![Model Scope](https://img.shields.io/badge/%F0%9F%A4%96%20Demo-on%20MS-green)](https://modelscope.cn/studios/chenzihong/GraphGen) + +GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation + +[English](README.md) | [中文](README_zh.md) + +
+📚 目录 + +- 📝 [什么是 GraphGen?](#-什么是-graphgen) +- 📌 [最新更新](#-最新更新) +- ⚙️ [支持列表](#-支持列表) +- 🚀 [快速开始](#-快速开始) +- 🏗️ [系统架构](#-系统架构) +- 🍀 [致谢](#-致谢) +- 📚 [引用](#-引用) +- 📜 [许可证](#-许可证) +- 📅 [星标历史](#-星标历史) + + +[//]: # (- 🌟 [主要特性](#主要特性)) +[//]: # (- 💰 [成本分析](#成本分析)) +[//]: # (- ⚙️ [配置说明](#配置说明)) + +
+ + +## 📝 什么是 GraphGen? + +GraphGen 是一个基于知识图谱的数据合成框架。请查看[**论文**](https://arxiv.org/abs/2505.20416)和[最佳实践](https://github.com/open-sciencelab/GraphGen/issues/17)。 + +以下是在超过 50 % 的 SFT 数据来自 GraphGen 及我们的数据清洗流程时的训练后结果: + +| 领域 | 数据集 | 我们的方案 | Qwen2.5-7B-Instruct(基线) | +|:--:|:---------------------------------------------------------:|:--------:|:-----------------------:| +| 植物 | [SeedBench](https://github.com/open-sciencelab/SeedBench) | **65.9** | 51.5 | +| 常识 | CMMLU | 73.6 | **75.8** | +| 知识 | GPQA-Diamond | **40.0** | 33.3 | +| 数学 | AIME24 | **20.6** | 16.7 | +| | AIME25 | **22.7** | 7.2 | + +GraphGen 首先根据源文本构建细粒度的知识图谱,然后利用期望校准误差指标识别大语言模型中的知识缺口,优先生成针对高价值长尾知识的问答对。 +此外,GraphGen 采用多跳邻域采样捕获复杂关系信息,并使用风格控制生成来丰富问答数据的多样性。 + +在数据生成后,您可以使用[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) 和 [xtuner](https://github.com/InternLM/xtuner)对大语言模型进行微调。 + +## 📌 最新更新 +- **2025.12.16**:新增 [rocksdb](https://github.com/facebook/rocksdb) 作为键值存储后端, [kuzudb](https://github.com/kuzudb/kuzu) 作为图数据库后端的支持。 +- **2025.12.16**:新增 [vllm](https://github.com/vllm-project/vllm) 作为本地推理后端的支持。 +- **2025.12.16**:使用 [ray](https://github.com/ray-project/ray) 重构了数据生成 pipeline,提升了分布式执行和资源管理的效率。 + +
+历史更新 + +- **2025.12.1**:新增对 [NCBI](https://www.ncbi.nlm.nih.gov/) 和 [RNAcentral](https://rnacentral.org/) 数据库的检索支持,现在可以从这些生物信息学数据库中提取DNA和RNA数据。 +- **2025.10.30**:我们支持多种新的 LLM 客户端和推理后端,包括 [Ollama_client]([Ollama_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/ollama_client.py), [http_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/http_client.py), [HuggingFace Transformers](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/hf_wrapper.py) 和 [SGLang](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/sglang_wrapper.py)。 +- **2025.10.23**:我们现在支持视觉问答(VQA)数据生成。运行脚本:`bash scripts/generate/generate_vqa.sh`。 +- **2025.10.21**:我们现在通过 [MinerU](https://github.com/opendatalab/MinerU) 支持 PDF 作为数据生成的输入格式。 +- **2025.09.29**:我们在 [Hugging Face](https://huggingface.co/spaces/chenzihong/GraphGen) 和 [ModelScope](https://modelscope.cn/studios/chenzihong/GraphGen) 上自动更新 Gradio 应用。 +- **2025.08.14**:支持利用 Leiden 社区发现算法对知识图谱进行社区划分,合成 CoT 数据。 +- **2025.07.31**:新增 Google、Bing、Wikipedia 和 UniProt 作为搜索后端,帮助填补数据缺口。 +- **2025.04.21**:发布 GraphGen 初始版本。 + +
+ +## ⚙️ 支持列表 + +我们支持多种 LLM 推理服务器、API 服务器、推理客户端、输入文件格式、数据模态、输出数据格式和输出数据类型。 +可以根据合成数据的需求进行灵活配置。 + +| 推理服务器 | API 服务器 | 推理客户端 | 输入文件格式 | 数据模态 | 输出数据类型 | +|--------------------------------------------------------------------------|--------------------------------------------------------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|-------------------------------------------------| +| [![hf-icon]HF][hf]
[![sg-icon]SGLang][sg]
[![vllm-icon]vllm][vllm] | [![sif-icon]Silicon][sif]
[![oai-icon]OpenAI][oai]
[![az-icon]Azure][az] | HTTP
[![ol-icon]Ollama][ol]
[![oai-icon]OpenAI][oai] | 文件(CSV, JSON, JSONL, PDF, TXT等)
数据库([![uniprot-icon]UniProt][uniprot], [![ncbi-icon]NCBI][ncbi], [![rnacentral-icon]RNAcentral][rnacentral])
搜索引擎([![bing-icon]Bing][bing], [![google-icon]Google][google])
知识图谱([![wiki-icon]Wikipedia][wiki]) | TEXT
IMAGE | Aggregated
Atomic
CoT
Multi-hop
VQA | + + +[hf]: https://huggingface.co/docs/transformers/index +[sg]: https://docs.sglang.ai +[vllm]: https://github.com/vllm-project/vllm +[sif]: https://siliconflow.cn +[oai]: https://openai.com +[az]: https://azure.microsoft.com/en-us/services/cognitive-services/openai-service/ +[ol]: https://ollama.com +[uniprot]: https://www.uniprot.org/ +[ncbi]: https://www.ncbi.nlm.nih.gov/ +[rnacentral]: https://rnacentral.org/ +[wiki]: https://www.wikipedia.org/ +[bing]: https://www.bing.com/ +[google]: https://www.google.com + + + +[hf-icon]: https://www.google.com/s2/favicons?domain=https://huggingface.co +[sg-icon]: https://www.google.com/s2/favicons?domain=https://docs.sglang.ai +[vllm-icon]: https://www.google.com/s2/favicons?domain=https://docs.vllm.ai +[sif-icon]: https://www.google.com/s2/favicons?domain=siliconflow.com +[oai-icon]: https://www.google.com/s2/favicons?domain=https://openai.com +[az-icon]: https://www.google.com/s2/favicons?domain=https://azure.microsoft.com +[ol-icon]: https://www.google.com/s2/favicons?domain=https://ollama.com + +[uniprot-icon]: https://www.google.com/s2/favicons?domain=https://www.uniprot.org +[ncbi-icon]: https://www.google.com/s2/favicons?domain=https://www.ncbi.nlm.nih.gov/ +[rnacentral-icon]: https://www.google.com/s2/favicons?domain=https://rnacentral.org/ +[wiki-icon]: https://www.google.com/s2/favicons?domain=https://www.wikipedia.org/ +[bing-icon]: https://www.google.com/s2/favicons?domain=https://www.bing.com/ +[google-icon]: https://www.google.com/s2/favicons?domain=https://www.google.com + + +## 🚀 快速开始 + +通过 [Huggingface](https://huggingface.co/spaces/chenzihong/GraphGen) 或 [Modelscope](https://modelscope.cn/studios/chenzihong/GraphGen) 体验 GraphGen。 + +如有任何问题,请查看 [FAQ](https://github.com/open-sciencelab/GraphGen/issues/10)、提交新的 [issue](https://github.com/open-sciencelab/GraphGen/issues) 或加入我们的[微信群](https://cdn.vansin.top/internlm/dou.jpg)咨询。 + +### 准备工作 + +1. 安装 [uv](https://docs.astral.sh/uv/reference/installer/) + + ```bash + # 若遇到网络问题,可尝试使用 pipx 或 pip 安装 uv,详见 uv 文档 + curl -LsSf https://astral.sh/uv/install.sh | sh + ``` +2. 克隆仓库 + + ```bash + git clone --depth=1 https://github.com/open-sciencelab/GraphGen + cd GraphGen + ``` +3. 创建新的 uv 环境 + + ```bash + uv venv --python 3.10 + ``` +4. 安装依赖 + + ```bash + uv pip install -r requirements.txt + ``` + +### 运行 Gradio 演示 + + ```bash + python -m webui.app + ``` + + 如果在开发过程中需要热重载,请运行 + + ```bash + PYTHONPATH=. gradio webui/app.py + ``` + + +![ui](https://github.com/user-attachments/assets/3024e9bc-5d45-45f8-a4e6-b57bd2350d84) + +### 从 PyPI 运行 + +1. 安装 GraphGen + ```bash + uv pip install graphg + ``` + +2. CLI 运行 + ```bash + SYNTHESIZER_MODEL=your_synthesizer_model_name \ + SYNTHESIZER_BASE_URL=your_base_url_for_synthesizer_model \ + SYNTHESIZER_API_KEY=your_api_key_for_synthesizer_model \ + TRAINEE_MODEL=your_trainee_model_name \ + TRAINEE_BASE_URL=your_base_url_for_trainee_model \ + TRAINEE_API_KEY=your_api_key_for_trainee_model \ + graphg --output_dir cache + ``` + +### 源码运行 + +1. 
配置环境 + - 在项目根目录创建 `.env` 文件 + ```bash + cp .env.example .env + ``` + - 设置以下环境变量: + ```bash + # Tokenizer + TOKENIZER_MODEL= + + # LLM + # 支持不同的后端:http_api、openai_api、ollama_api、ollama、huggingface、tgi、sglang、tensorrt + # Synthesizer 用于构建知识图谱并生成数据 + # Trainee 用于使用生成数据进行训练 + + # http_api / openai_api + SYNTHESIZER_BACKEND=openai_api + SYNTHESIZER_MODEL=gpt-4o-mini + SYNTHESIZER_BASE_URL= + SYNTHESIZER_API_KEY= + TRAINEE_BACKEND=openai_api + TRAINEE_MODEL=gpt-4o-mini + TRAINEE_BASE_URL= + TRAINEE_API_KEY= + + # azure_openai_api + # SYNTHESIZER_BACKEND=azure_openai_api + # The following is the same as your "Deployment name" in Azure + # SYNTHESIZER_MODEL= + # SYNTHESIZER_BASE_URL=https://.openai.azure.com/openai/deployments//chat/completions + # SYNTHESIZER_API_KEY= + # SYNTHESIZER_API_VERSION= + + # # ollama_api + # SYNTHESIZER_BACKEND=ollama_api + # SYNTHESIZER_MODEL=gemma3 + # SYNTHESIZER_BASE_URL=http://localhost:11434 + # + # Note: TRAINEE with ollama_api backend is not supported yet as ollama_api does not support logprobs. + + # # huggingface + # SYNTHESIZER_BACKEND=huggingface + # SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # + # TRAINEE_BACKEND=huggingface + # TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct + + # # sglang + # SYNTHESIZER_BACKEND=sglang + # SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # SYNTHESIZER_TP_SIZE=1 + # SYNTHESIZER_NUM_GPUS=1 + + # TRAINEE_BACKEND=sglang + # TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # SYNTHESIZER_TP_SIZE=1 + # SYNTHESIZER_NUM_GPUS=1 + + # # vllm + # SYNTHESIZER_BACKEND=vllm + # SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # SYNTHESIZER_NUM_GPUS=1 + + # TRAINEE_BACKEND=vllm + # TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct + # TRAINEE_NUM_GPUS=1 + ``` +2. (可选)如需修改默认生成配置,可编辑 `graphgen/configs/` 文件夹中的 YAML 文件. + + 例如: + + ```yaml + # examples/generate/generate_aggregated_qa/aggregated_config.yaml + global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + + nodes: + - id: read_files # id is unique in the pipeline, and can be referenced by other steps + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + # 其他设置... + ``` + +3. 生成数据 + + 选择所需格式并运行对应脚本: + + | 格式 | 运行脚本 | 说明 | + | ------------ | ---------------------------------------------------------------------- | --------------- | + | `cot` | `bash examples/generate/generate_cot_qa/generate_cot.sh` | 思维链问答对 | + | `atomic` | `bash examples/generate/generate_atomic_qa/generate_atomic.sh` | 覆盖基础知识的原子问答对 | + | `aggregated` | `bash examples/generate/generate_aggregated_qa/generate_aggregated.sh` | 整合复杂知识的聚合问答对 | + | `multi-hop` | `bash examples/generate/generate_multi_hop_qa/generate_multi_hop.sh` | 多跳推理问答对 | + | `vqa` | `bash examples/generate/generate_vqa/generate_vqa.sh` | 视觉问答对,结合视觉和文本理解 | + + + +4. 查看生成结果 + ```bash + ls cache/output + ``` + +### 使用 Docker 运行 +1. 构建镜像 + ```bash + docker build -t graphgen . + ``` +2. 
启动容器 + ```bash + docker run -p 7860:7860 graphgen + ``` + + +## 🏗️ 系统架构 +参阅 deepwiki 的[分析](https://deepwiki.com/open-sciencelab/GraphGen)了解 GraphGen 系统、架构与核心功能的技术概览。 + + +### 工作流程 +![workflow](resources/images/flow.png) + + +## 🍀 致谢 +- [SiliconFlow](https://siliconflow.cn) 提供丰富的 LLM API,部分模型免费 +- [LightRAG](https://github.com/HKUDS/LightRAG) 简单高效的图检索方案 +- [ROGRAG](https://github.com/tpoisonooo/ROGRAG) 鲁棒优化版 GraphRAG 框架 +- [DB-GPT](https://github.com/eosphoros-ai/DB-GPT) AI 原生数据应用开发框架 + + +## 📚 引用 +如果本项目对你有帮助,请考虑引用我们的工作: +```bibtex +@misc{chen2025graphgenenhancingsupervisedfinetuning, + title={GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation}, + author={Zihong Chen and Wanli Jiang and Jinzhe Li and Zhonghang Yuan and Huanjun Kong and Wanli Ouyang and Nanqing Dong}, + year={2025}, + eprint={2505.20416}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2505.20416}, +} +``` + +## 📜 许可证 +本项目采用 [Apache License 2.0](LICENSE) 许可证。 + +## 📅 星标历史 + +[![Star History Chart](https://api.star-history.com/svg?repos=InternScience/GraphGen&type=Date)](https://www.star-history.com/#InternScience/GraphGen&Date) + diff --git a/baselines/BDS/README.md b/baselines/BDS/README.md new file mode 100644 index 00000000..9ef5abaa --- /dev/null +++ b/baselines/BDS/README.md @@ -0,0 +1,2 @@ +# Bottom-up Domain-specific Superintelligence: A Reliable Knowledge Graph is What We Need +https://arxiv.org/pdf/2507.13966 \ No newline at end of file diff --git a/graphgen/models/embed/__init__.py b/baselines/BDS/__init__.py similarity index 100% rename from graphgen/models/embed/__init__.py rename to baselines/BDS/__init__.py diff --git a/baselines/BDS/bds.py b/baselines/BDS/bds.py new file mode 100644 index 00000000..ea8bf45f --- /dev/null +++ b/baselines/BDS/bds.py @@ -0,0 +1,152 @@ +import argparse +import asyncio +import json +from typing import List + +import networkx as nx +from dotenv import load_dotenv +from tqdm.asyncio import tqdm as tqdm_async + +from graphgen.bases import BaseLLMWrapper +from graphgen.common import init_llm +from graphgen.models import NetworkXStorage +from graphgen.utils import create_event_loop + +QA_GENERATION_PROMPT = """ +Create an agriculture examination question for advanced agricultural students that tests the relationship between {src} and {tgt}. The relationship is: {path}. The question should: + 1. Be in multiple choice format (4 options) + 2. Require agriculture reasoning along the relationship + 3. Include a brief farm or field scenario + 4. Not directly mention the relationship in the question stem + 5. Have one clearly correct answer +Format: + + [Farm or Field Scenario] + + + A. [Option] + B. [Option] + C. [Option] + D. 
[Option] + + : + [Correct Option Letter] + +""" + + +def _post_process(text: str) -> dict: + try: + q = text.split("")[1].split("")[0].strip() + opts = text.split("")[1].split("")[0].strip().splitlines() + opts = [o.strip() for o in opts if o.strip()] + ans = text.split(":")[1].strip()[0].upper() + return { + "question": q, + "options": opts, + "answer": ord(ans) - ord("A"), + "raw": text, + } + except Exception as e: # pylint: disable=broad-except + print(f"Error in post-processing: {e}") + return {} + + +class BDS: + def __init__(self, llm_client: BaseLLMWrapper = None, max_concurrent: int = 1000): + self.llm_client: BaseLLMWrapper = llm_client or init_llm("synthesizer") + self.max_concurrent: int = max_concurrent + + def generate(self, tasks: List[dict]) -> List[dict]: + loop = create_event_loop() + return loop.run_until_complete(self._async_generate(tasks)) + + async def _async_generate(self, tasks: List[dict]) -> List[dict]: + sem = asyncio.Semaphore(self.max_concurrent) + + async def job(item): + async with sem: + path_str = " -> ".join([f"({h},{r},{t})" for h, r, t in item["path"]]) + prompt = QA_GENERATION_PROMPT.format( + src=item["src"], tgt=item["tgt"], path=path_str + ) + resp = await self.llm_client.generate_answer(prompt) + return _post_process(resp) + + tasks = [job(it) for it in tasks] + results = [] + for coro in tqdm_async(asyncio.as_completed(tasks), total=len(tasks)): + try: + if r := await coro: + results.append(r) + except Exception as e: # pylint: disable=broad-except + print("Error:", e) + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_file", + help="GraphML input file path.", + default="resources/input_examples/graphml_demo.graphml", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/bds_qa.jsonl", + type=str, + ) + args = parser.parse_args() + + load_dotenv() + + bds = BDS() + + graph = NetworkXStorage.load_nx_graph(args.input_file) + + MAX_PATH = 20000 + all_paths = [] + + G = graph.to_directed() if not graph.is_directed() else graph + print(G) + + source_nodes = [n for n in G.nodes if G.out_degree(n) > 0][:1000] + + for src in source_nodes: + for path in nx.all_simple_paths(G, source=src, target=list(G.nodes), cutoff=3): + if len(path) == 4: + all_paths.append(path) + if len(all_paths) >= MAX_PATH: + break + if len(all_paths) >= MAX_PATH: + break + if len(all_paths) >= MAX_PATH: + break + + print(f"Found {len(all_paths)} 4-node paths") + + items = [] + for path in all_paths: + path_edges = [] + for i in range(len(path) - 1): + edge_data = G.get_edge_data(path[i], path[i + 1]) + if edge_data is None: + edge_data = G.get_edge_data(path[i + 1], path[i]) + if edge_data is None: + print(f"Warning: No edge data between {path[i]} and {path[i+1]}") + relation = "related_to" + else: + relation = edge_data.get("relation", "related_to") + path_edges.append((path[i], relation, path[i + 1])) + items.append({"src": path[0], "tgt": path[-1], "path": path_edges}) + + print(f"Prepared {len(items)} items for question generation") + + qa_pairs = bds.generate(items) + print(f"Generated {len(qa_pairs)} QA pairs") + + # Save results + with open(args.output_file, "w", encoding="utf-8") as f: + json.dump(qa_pairs, f, indent=4, ensure_ascii=False) diff --git a/baselines/EntiGraph/README.md b/baselines/EntiGraph/README.md new file mode 100644 index 00000000..ae1a53b2 --- /dev/null +++ b/baselines/EntiGraph/README.md @@ -0,0 +1,3 @@ +# EntiGraph 
+https://arxiv.org/abs/2409.07431 +https://github.com/zitongyang/synthetic_continued_pretraining \ No newline at end of file diff --git a/baselines/EntiGraph/entigraph.py b/baselines/EntiGraph/entigraph.py index 01e22dbc..d04546ef 100644 --- a/baselines/EntiGraph/entigraph.py +++ b/baselines/EntiGraph/entigraph.py @@ -1,11 +1,8 @@ -# https://arxiv.org/abs/2409.07431 -# https://github.com/zitongyang/synthetic_continued_pretraining - -import os +import argparse +import asyncio import json +import os import random -import asyncio -import argparse from hashlib import md5 from tqdm.asyncio import tqdm as tqdm_async @@ -18,9 +15,9 @@ def compute_content_hash(content, prefix: str = ""): return prefix + md5(content.encode()).hexdigest() -async def generate_entities(document_content: str, - system_message: str, - openai_model: str): +async def generate_entities( + document_content: str, system_message: str, openai_model: str +): prompt = f""" ### Document Content: {document_content} @@ -30,23 +27,25 @@ async def generate_entities(document_content: str, max_tries = 5 while not can_read_entities and max_tries > 0: try: - completion = await gptqa(prompt, - openai_model, - system_message, - json_format=False) - completion = completion[completion.find("{"): completion.rfind("}") + 1] + completion = await gptqa( + prompt, openai_model, system_message, json_format=False + ) + completion = completion[completion.find("{") : completion.rfind("}") + 1] response = json.loads(completion) - can_read_entities = response['entities'] + can_read_entities = response["entities"] return response - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Failed to generate entities: {str(e)}") max_tries -= 1 -async def generate_two_entity_relations(document_content: str, - entity1: str, - entity2: str, - system_message: str, - openai_model: str): + +async def generate_two_entity_relations( + document_content: str, + entity1: str, + entity2: str, + system_message: str, + openai_model: str, +): prompt = f""" ### Document Content: {document_content} @@ -54,17 +53,18 @@ async def generate_two_entity_relations(document_content: str, - {entity1} - {entity2} """ - completion = await gptqa(prompt, - openai_model, - system_message) + completion = await gptqa(prompt, openai_model, system_message) return completion -async def generate_three_entity_relations(document_content: str, - entity1: str, - entity2: str, - entity3: str, - system_message: str, - openai_model: str): + +async def generate_three_entity_relations( + document_content: str, + entity1: str, + entity2: str, + entity3: str, + system_message: str, + openai_model: str, +): prompt = f""" ### Document Content: {document_content} @@ -73,11 +73,10 @@ async def generate_three_entity_relations(document_content: str, - {entity2} - {entity3} """ - completion = await gptqa(prompt, - openai_model, - system_message) + completion = await gptqa(prompt, openai_model, system_message) return completion + def _post_process_synthetic_data(data): block = data.split("\n\n") qas = {} @@ -87,7 +86,7 @@ def _post_process_synthetic_data(data): answer = line.split("Answer: ")[1] qas[compute_content_hash(question)] = { "question": question, - "answer": answer + "answer": answer, } break return qas @@ -105,25 +104,26 @@ async def generate_document_entities(doc): async with semaphore: try: entities = await generate_entities( - doc.text, - task.openai_system_generate_entities, - model_name) + doc.text, 
task.openai_system_generate_entities, model_name + ) if not entities: return None return { - 'document': doc.text, - 'entities': entities['entities'], - 'summary': entities['summary'] + "document": doc.text, + "entities": entities["entities"], + "summary": entities["summary"], } - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return None entities_list = [] for result in tqdm_async( - asyncio.as_completed([generate_document_entities(doc) for doc in task.documents]), - total=len(task.documents), - desc="Generating entities" + asyncio.as_completed( + [generate_document_entities(doc) for doc in task.documents] + ), + total=len(task.documents), + desc="Generating entities", ): result = await result if result: @@ -132,38 +132,42 @@ async def generate_document_entities(doc): # iterate over triples of entities and generate relations pair_list = [] for doc in entities_list: - entities = doc['entities'] + entities = doc["entities"] temp = [] for i, entity_i in enumerate(entities): if i == len(entities) - 1: break for j in range(i + 1, len(entities)): entity_j = entities[j] - pair = (doc['document'], entity_i, entity_j) + pair = (doc["document"], entity_i, entity_j) temp.append(pair) # Compute all possible combinations of entities is impractical, so we randomly sample 10 pairs pair_list.extend(random.sample(temp, min(len(temp), 10))) - async def process_two_entity_relations(pair): async with semaphore: try: document, entity1, entity2 = pair response = await generate_two_entity_relations( - document, entity1, entity2, + document, + entity1, + entity2, task.openai_system_generate_two_entity_relations, - model_name) + model_name, + ) return response - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return None - corpus= [] + corpus = [] for result in tqdm_async( - asyncio.as_completed([process_two_entity_relations(pair) for pair in pair_list]), - total=len(pair_list), - desc="Generating two entity relations" + asyncio.as_completed( + [process_two_entity_relations(pair) for pair in pair_list] + ), + total=len(pair_list), + desc="Generating two entity relations", ): result = await result if result: @@ -194,51 +198,60 @@ async def process_two_entity_relations(pair): # ): # corpus.append(await result) - corpus = [doc['summary'] for doc in entities_list] + corpus + corpus = [doc["summary"] for doc in entities_list] + corpus qa_sft_results = {} async def generate_qa_sft(content): async with semaphore: - completion = await gptqa(content, model_name, task.openai_system_quality_qa_sft) + completion = await gptqa( + content, model_name, task.openai_system_quality_qa_sft + ) return completion - for result in tqdm_async( - asyncio.as_completed([generate_qa_sft(content) for content in corpus]), - total=len(corpus), - desc="Generating QA SFT" + asyncio.as_completed([generate_qa_sft(content) for content in corpus]), + total=len(corpus), + desc="Generating QA SFT", ): try: result = await result if result: qa_sft_results.update(_post_process_synthetic_data(result)) - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return qa_sft_results -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - 
parser.add_argument('--data_type', - help='Data type of input file. (Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/entigraph.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/json_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. (Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/entigraph.json", + type=str, + ) args = parser.parse_args() - results = asyncio.run(generate_synthetic_data_for_document(args.input_file, args.data_type)) + results = asyncio.run( + generate_synthetic_data_for_document(args.input_file, args.data_type) + ) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/baselines/Genie/README.md b/baselines/Genie/README.md new file mode 100644 index 00000000..b28f6428 --- /dev/null +++ b/baselines/Genie/README.md @@ -0,0 +1,2 @@ +# Genie +https://arxiv.org/pdf/2401.14367 \ No newline at end of file diff --git a/baselines/Genie/genie.py b/baselines/Genie/genie.py index 4e740251..972b20fe 100644 --- a/baselines/Genie/genie.py +++ b/baselines/Genie/genie.py @@ -1,18 +1,17 @@ -# https://arxiv.org/pdf/2401.14367 - -import os -import json import argparse import asyncio -from typing import List +import json +import os from dataclasses import dataclass -from tqdm.asyncio import tqdm as tqdm_async +from typing import List + from dotenv import load_dotenv +from tqdm.asyncio import tqdm as tqdm_async -from graphgen.models import OpenAIModel -from graphgen.utils import create_event_loop, compute_content_hash +from graphgen.models import OpenAIClient +from graphgen.utils import compute_content_hash, create_event_loop -PROMPT_TEMPLATE = '''Instruction: Given the next [document], create a [question] and [answer] pair that are grounded \ +PROMPT_TEMPLATE = """Instruction: Given the next [document], create a [question] and [answer] pair that are grounded \ in the main point of the document, don't add any additional information that is not in the document. The [question] is \ by an information-seeking user and the [answer] is provided by a helping AI Agent. 
@@ -45,20 +44,20 @@ [document]: {doc} -### Response:''' +### Response:""" def _post_process(content: str) -> tuple: if "[question]:" in content and "[answer]:" in content: - question = content.split('[question]: ')[1].split('[answer]: ')[0] - answer = content.split('[answer]: ')[1] + question = content.split("[question]: ")[1].split("[answer]: ")[0] + answer = content.split("[answer]: ")[1] return question, answer return None, None @dataclass class Genie: - llm_client: OpenAIModel = None + llm_client: OpenAIClient = None max_concurrent: int = 1000 def generate(self, docs: List[List[dict]]) -> List[dict]: @@ -77,58 +76,67 @@ async def process_chunk(content: str): tasks = [] for doc in docs: for chunk in doc: - tasks.append(process_chunk(chunk['content'])) + tasks.append(process_chunk(chunk["content"])) - for result in tqdm_async(asyncio.as_completed(tasks), total=len(tasks), desc="Generating using Genie"): + for result in tqdm_async( + asyncio.as_completed(tasks), total=len(tasks), desc="Generating using Genie" + ): try: question, answer = _post_process(await result) if question and answer: final_results[compute_content_hash(question)] = { - 'question': question, - 'answer': answer + "question": question, + "answer": answer, } - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return final_results + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - parser.add_argument('--data_type', - help='Data type of input file. (Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/genie.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/json_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. 
(Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/genie.json", + type=str, + ) args = parser.parse_args() load_dotenv() - llm_client = OpenAIModel( - model_name=os.getenv("SYNTHESIZER_MODEL"), + llm_client = OpenAIClient( + model=os.getenv("SYNTHESIZER_MODEL"), api_key=os.getenv("SYNTHESIZER_API_KEY"), - base_url=os.getenv("SYNTHESIZER_BASE_URL") + base_url=os.getenv("SYNTHESIZER_BASE_URL"), ) genie = Genie(llm_client=llm_client) - if args.data_type == 'raw': - with open(args.input_file, "r", encoding='utf-8') as f: + if args.data_type == "raw": + with open(args.input_file, "r", encoding="utf-8") as f: data = [json.loads(line) for line in f] data = [[chunk] for chunk in data] - elif args.data_type == 'chunked': - with open(args.input_file, "r", encoding='utf-8') as f: + elif args.data_type == "chunked": + with open(args.input_file, "r", encoding="utf-8") as f: data = json.load(f) results = genie.generate(data) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/baselines/LongForm/README.md b/baselines/LongForm/README.md new file mode 100644 index 00000000..3bc7bbf0 --- /dev/null +++ b/baselines/LongForm/README.md @@ -0,0 +1,3 @@ +# LongForm +https://arxiv.org/pdf/2304.08460 +https://github.com/akoksal/LongForm/tree/main \ No newline at end of file diff --git a/baselines/LongForm/longform.py b/baselines/LongForm/longform.py index c37f1e8f..abf4fd3e 100644 --- a/baselines/LongForm/longform.py +++ b/baselines/LongForm/longform.py @@ -1,28 +1,26 @@ -# https://arxiv.org/pdf/2304.08460 -# https://github.com/akoksal/LongForm/tree/main - -import os -import json -from dataclasses import dataclass import argparse import asyncio +import json +import os +from dataclasses import dataclass from typing import List -from tqdm.asyncio import tqdm as tqdm_async -from dotenv import load_dotenv -from graphgen.models import OpenAIModel -from graphgen.utils import create_event_loop, compute_content_hash +from dotenv import load_dotenv +from tqdm.asyncio import tqdm as tqdm_async +from graphgen.models import OpenAIClient +from graphgen.utils import compute_content_hash, create_event_loop -PROMPT_TEMPLATE = '''Instruction: X +PROMPT_TEMPLATE = """Instruction: X Output:{doc} What kind of instruction could this be the answer to? 
-X:''' +X:""" + @dataclass class LongForm: - llm_client: OpenAIModel = None + llm_client: OpenAIClient = None max_concurrent: int = 1000 def generate(self, docs: List[List[dict]]) -> List[dict]: @@ -38,62 +36,73 @@ async def process_chunk(content: str): question = await self.llm_client.generate_answer(content) return { compute_content_hash(question): { - 'question': question, - 'answer': content + "question": question, + "answer": content, } } tasks = [] for doc in docs: for chunk in doc: - tasks.append(process_chunk(chunk['content'])) + tasks.append(process_chunk(chunk["content"])) - for result in tqdm_async(asyncio.as_completed(tasks), total=len(tasks), desc="Generating using LongForm"): + for result in tqdm_async( + asyncio.as_completed(tasks), + total=len(tasks), + desc="Generating using LongForm", + ): try: qa = await result final_results.update(qa) - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return final_results + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - parser.add_argument('--data_type', - help='Data type of input file. (Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/longform.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/json_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. (Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/longform.json", + type=str, + ) args = parser.parse_args() load_dotenv() - llm_client = OpenAIModel( - model_name=os.getenv("SYNTHESIZER_MODEL"), + llm_client = OpenAIClient( + model=os.getenv("SYNTHESIZER_MODEL"), api_key=os.getenv("SYNTHESIZER_API_KEY"), - base_url=os.getenv("SYNTHESIZER_BASE_URL") + base_url=os.getenv("SYNTHESIZER_BASE_URL"), ) longform = LongForm(llm_client=llm_client) - if args.data_type == 'raw': - with open(args.input_file, "r", encoding='utf-8') as f: + if args.data_type == "raw": + with open(args.input_file, "r", encoding="utf-8") as f: data = [json.loads(line) for line in f] data = [[chunk] for chunk in data] - elif args.data_type == 'chunked': - with open(args.input_file, "r", encoding='utf-8') as f: + elif args.data_type == "chunked": + with open(args.input_file, "r", encoding="utf-8") as f: data = json.load(f) results = longform.generate(data) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/baselines/SELF-QA/README.md b/baselines/SELF-QA/README.md new file mode 100644 index 00000000..d8966888 --- /dev/null +++ b/baselines/SELF-QA/README.md @@ -0,0 +1,2 @@ +# SELF-QA +https://arxiv.org/abs/2305.11952 \ No newline at end of file diff --git a/baselines/SELF-QA/self-qa.py b/baselines/SELF-QA/self-qa.py index ba16b15c..b222d970 100644 --- a/baselines/SELF-QA/self-qa.py +++ b/baselines/SELF-QA/self-qa.py @@ -1,18 +1,17 @@ -# https://arxiv.org/abs/2305.11952 - -import os +import argparse +import asyncio import json +import os from dataclasses import 
dataclass from typing import List -import argparse -import asyncio -from tqdm.asyncio import tqdm as tqdm_async + from dotenv import load_dotenv +from tqdm.asyncio import tqdm as tqdm_async -from graphgen.models import OpenAIModel -from graphgen.utils import create_event_loop, compute_content_hash +from graphgen.models import OpenAIClient +from graphgen.utils import compute_content_hash, create_event_loop -INSTRUCTION_GENERATION_PROMPT = '''The background knowledge is: +INSTRUCTION_GENERATION_PROMPT = """The background knowledge is: {doc} Please generate ten instruction questions as diverse as possible based on the content of the above article. @@ -22,9 +21,9 @@ Please generate questions in the following format: 1. Question: ... 2. Question: ... -''' +""" -READING_COMPREHENSION_PROMPT = '''The background knowledge is: +READING_COMPREHENSION_PROMPT = """The background knowledge is: {doc} Please answer the following question based on the content of the article above: {question} @@ -34,27 +33,30 @@ Please generate the corresponding answer in the following format: Question: ... Answer: ... -''' +""" + def _post_process_instructions(content: str) -> list: - lines = content.split('\n') + lines = content.split("\n") questions = [] for line in lines: if "Question:" in line: - question = line.split('Question:')[1].strip() + question = line.split("Question:")[1].strip() questions.append(question) return questions + def _post_process_answers(content: str) -> tuple: if "Question:" in content and "Answer:" in content: - question = content.split('Question:')[1].split('Answer:')[0].strip() - answer = content.split('Answer:')[1].strip() + question = content.split("Question:")[1].split("Answer:")[0].strip() + answer = content.split("Answer:")[1].strip() return question, answer return None, None + @dataclass class SelfQA: - llm_client: OpenAIModel = None + llm_client: OpenAIClient = None max_concurrent: int = 100 def generate(self, docs: List[List[dict]]) -> List[dict]: @@ -73,81 +75,102 @@ async def process_chunk(content: str): instruction_questions = _post_process_instructions(response) qas = [] - for qa in tqdm_async(asyncio.as_completed([ - self.llm_client.generate_answer(READING_COMPREHENSION_PROMPT.format( - doc=content, - question=question - )) for question in instruction_questions]), - total=len(instruction_questions), desc="Generating QAs"): + for qa in tqdm_async( + asyncio.as_completed( + [ + self.llm_client.generate_answer( + READING_COMPREHENSION_PROMPT.format( + doc=content, question=question + ) + ) + for question in instruction_questions + ] + ), + total=len(instruction_questions), + desc="Generating QAs", + ): try: question, answer = _post_process_answers(await qa) if question and answer: - qas.append({ - compute_content_hash(question): { - 'question': question, - 'answer': answer + qas.append( + { + compute_content_hash(question): { + "question": question, + "answer": answer, + } } - }) - except Exception as e: # pylint: disable=broad-except + ) + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") continue return qas - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return [] tasks = [] for doc in docs: for chunk in doc: - tasks.append(process_chunk(chunk['content'])) + tasks.append(process_chunk(chunk["content"])) - for result in tqdm_async(asyncio.as_completed(tasks), total=len(tasks), desc="Generating using SelfQA"): + for result in tqdm_async( + asyncio.as_completed(tasks), + 
total=len(tasks), + desc="Generating using SelfQA", + ): try: qas = await result for qa in qas: final_results.update(qa) - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return final_results + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - parser.add_argument('--data_type', - help='Data type of input file. (Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/self-qa.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/json_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. (Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/self-qa.json", + type=str, + ) args = parser.parse_args() load_dotenv() - llm_client = OpenAIModel( - model_name=os.getenv("SYNTHESIZER_MODEL"), + llm_client = OpenAIClient( + model=os.getenv("SYNTHESIZER_MODEL"), api_key=os.getenv("SYNTHESIZER_API_KEY"), - base_url=os.getenv("SYNTHESIZER_BASE_URL") + base_url=os.getenv("SYNTHESIZER_BASE_URL"), ) self_qa = SelfQA(llm_client=llm_client) - if args.data_type == 'raw': - with open(args.input_file, "r", encoding='utf-8') as f: + if args.data_type == "raw": + with open(args.input_file, "r", encoding="utf-8") as f: data = [json.loads(line) for line in f] data = [[chunk] for chunk in data] - elif args.data_type == 'chunked': - with open(args.input_file, "r", encoding='utf-8') as f: + elif args.data_type == "chunked": + with open(args.input_file, "r", encoding="utf-8") as f: data = json.load(f) results = self_qa.generate(data) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/baselines/Wrap/README.md b/baselines/Wrap/README.md new file mode 100644 index 00000000..f2002627 --- /dev/null +++ b/baselines/Wrap/README.md @@ -0,0 +1,2 @@ +# Wrap +https://arxiv.org/abs/2401.16380 \ No newline at end of file diff --git a/baselines/Wrap/wrap.py b/baselines/Wrap/wrap.py index 1bfdcf4a..90ce2a46 100644 --- a/baselines/Wrap/wrap.py +++ b/baselines/Wrap/wrap.py @@ -1,19 +1,17 @@ -# https://arxiv.org/abs/2401.16380 - -import os -import json import argparse import asyncio +import json +import os from dataclasses import dataclass from typing import List + from dotenv import load_dotenv from tqdm.asyncio import tqdm as tqdm_async -from graphgen.models import OpenAIModel -from graphgen.utils import create_event_loop, compute_content_hash +from graphgen.models import OpenAIClient +from graphgen.utils import compute_content_hash, create_event_loop - -PROMPT_TEMPLATE = '''A chat between a curious user and an artificial intelligence assistant. +PROMPT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the questions. USER: Convert the following paragraph into a conversational format with multiple tags of "Question:" followed by "Answer:":{doc}. 
@@ -26,18 +24,19 @@ --- Question: What was the revenue drop in the first quarter compared to the same period last year? Answer: The revenue dropped 15 percent. --- -''' +""" + def _post_process(content: str) -> list: - raw_qas = content.split('---') + raw_qas = content.split("---") qas = [] for item in raw_qas: try: if "Question:" in item and "Answer:" in item: - question = item.split('Question:')[1].split('Answer:')[0].strip() - answer = item.split('Answer:')[1].strip() + question = item.split("Question:")[1].split("Answer:")[0].strip() + answer = item.split("Answer:")[1].strip() qas.append((question, answer)) - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") continue return qas @@ -45,7 +44,7 @@ def _post_process(content: str) -> list: @dataclass class Wrap: - llm_client: OpenAIModel = None + llm_client: OpenAIClient = None max_concurrent: int = 1000 def generate(self, docs: List[List[dict]]) -> List[dict]: @@ -64,59 +63,67 @@ async def process_chunk(content: str): tasks = [] for doc in docs: for chunk in doc: - tasks.append(process_chunk(chunk['content'])) + tasks.append(process_chunk(chunk["content"])) - for result in tqdm_async(asyncio.as_completed(tasks), total=len(tasks), desc="Generating using Wrap"): + for result in tqdm_async( + asyncio.as_completed(tasks), total=len(tasks), desc="Generating using Wrap" + ): try: qas = _post_process(await result) for qa in qas: final_results[compute_content_hash(qa[0])] = { - 'question': qa[0], - 'answer': qa[1] + "question": qa[0], + "answer": qa[1], } - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return final_results if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - parser.add_argument('--data_type', - help='Data type of input file. (Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/wrap.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/json_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. 
(Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/wrap.json", + type=str, + ) args = parser.parse_args() load_dotenv() - llm_client = OpenAIModel( - model_name=os.getenv("SYNTHESIZER_MODEL"), + llm_client = OpenAIClient( + model=os.getenv("SYNTHESIZER_MODEL"), api_key=os.getenv("SYNTHESIZER_API_KEY"), - base_url=os.getenv("SYNTHESIZER_BASE_URL") + base_url=os.getenv("SYNTHESIZER_BASE_URL"), ) wrap = Wrap(llm_client=llm_client) - if args.data_type == 'raw': - with open(args.input_file, "r", encoding='utf-8') as f: + if args.data_type == "raw": + with open(args.input_file, "r", encoding="utf-8") as f: data = [json.loads(line) for line in f] data = [[chunk] for chunk in data] - elif args.data_type == 'chunked': - with open(args.input_file, "r", encoding='utf-8') as f: + elif args.data_type == "chunked": + with open(args.input_file, "r", encoding="utf-8") as f: data = json.load(f) results = wrap.generate(data) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/scripts/baselines/generate_all_baselines.sh b/examples/baselines/generate_all_baselines.sh similarity index 100% rename from scripts/baselines/generate_all_baselines.sh rename to examples/baselines/generate_all_baselines.sh diff --git a/examples/baselines/generate_bds.sh b/examples/baselines/generate_bds.sh new file mode 100644 index 00000000..d4bd9e8c --- /dev/null +++ b/examples/baselines/generate_bds.sh @@ -0,0 +1,2 @@ +python3 -m baselines.BDS.bds --input_file resources/input_examples/graphml_demo.graphml \ + --output_file cache/data/bds.json \ diff --git a/scripts/baselines/generate_entigraph.sh b/examples/baselines/generate_entigraph.sh similarity index 85% rename from scripts/baselines/generate_entigraph.sh rename to examples/baselines/generate_entigraph.sh index ce9cc991..8474c96f 100644 --- a/scripts/baselines/generate_entigraph.sh +++ b/examples/baselines/generate_entigraph.sh @@ -1,3 +1,3 @@ -python3 -m baselines.EntiGraph.entigraph --input_file resources/examples/raw_demo.jsonl \ +python3 -m baselines.EntiGraph.entigraph --input_file resources/input_examples/raw_demo.jsonl \ --data_type raw \ --output_file cache/data/entigraph.json \ diff --git a/scripts/baselines/generate_genie.sh b/examples/baselines/generate_genie.sh similarity index 57% rename from scripts/baselines/generate_genie.sh rename to examples/baselines/generate_genie.sh index 0119930d..3a06de10 100644 --- a/scripts/baselines/generate_genie.sh +++ b/examples/baselines/generate_genie.sh @@ -1,3 +1,3 @@ -python3 -m baselines.Genie.genie --input_file resources/examples/raw_demo.jsonl \ +python3 -m baselines.Genie.genie --input_file resources/input_examples/raw_demo.jsonl \ --data_type raw \ --output_file cache/data/genie.json \ diff --git a/scripts/baselines/generate_longform.sh b/examples/baselines/generate_longform.sh similarity index 56% rename from scripts/baselines/generate_longform.sh rename to examples/baselines/generate_longform.sh index d7ed70c5..62de848c 100644 --- a/scripts/baselines/generate_longform.sh +++ b/examples/baselines/generate_longform.sh @@ -1,3 +1,3 @@ -python3 -m baselines.LongForm.longform --input_file resources/examples/raw_demo.jsonl \ +python3 -m baselines.LongForm.longform --input_file resources/input_examples/raw_demo.jsonl \ --data_type raw \ 
--output_file cache/data/longform.json \ diff --git a/scripts/baselines/generate_selfqa.sh b/examples/baselines/generate_selfqa.sh similarity index 57% rename from scripts/baselines/generate_selfqa.sh rename to examples/baselines/generate_selfqa.sh index 18eb7b1f..ef13e721 100644 --- a/scripts/baselines/generate_selfqa.sh +++ b/examples/baselines/generate_selfqa.sh @@ -1,3 +1,3 @@ -python3 -m baselines.SELF-QA.self-qa --input_file resources/examples/raw_demo.jsonl \ +python3 -m baselines.SELF-QA.self-qa --input_file resources/input_examples/raw_demo.jsonl \ --data_type raw \ --output_file cache/data/self-qa.json \ diff --git a/scripts/baselines/generate_wrap.sh b/examples/baselines/generate_wrap.sh similarity index 58% rename from scripts/baselines/generate_wrap.sh rename to examples/baselines/generate_wrap.sh index f10857a5..fcaf3933 100644 --- a/scripts/baselines/generate_wrap.sh +++ b/examples/baselines/generate_wrap.sh @@ -1,3 +1,3 @@ -python3 -m baselines.Wrap.wrap --input_file resources/examples/raw_demo.jsonl \ +python3 -m baselines.Wrap.wrap --input_file resources/input_examples/raw_demo.jsonl \ --data_type raw \ --output_file cache/data/wrap.json \ diff --git a/scripts/evaluate.sh b/examples/evaluate/evaluate.sh similarity index 82% rename from scripts/evaluate.sh rename to examples/evaluate/evaluate.sh index 25706d48..2b352669 100644 --- a/scripts/evaluate.sh +++ b/examples/evaluate/evaluate.sh @@ -1,4 +1,3 @@ python3 -m graphgen.evaluate --folder cache/data \ - --output cache/output \ --reward "OpenAssistant/reward-model-deberta-v3-large-v2,BAAI/IndustryCorpus2_DataRater" \ --uni MingZhong/unieval-sum \ diff --git a/examples/extract/extract_schema_guided/README.md b/examples/extract/extract_schema_guided/README.md new file mode 100644 index 00000000..ab117c0f --- /dev/null +++ b/examples/extract/extract_schema_guided/README.md @@ -0,0 +1 @@ +# Extract Schema-Guided Information from Documents diff --git a/examples/extract/extract_schema_guided/extract_schema_guided.sh b/examples/extract/extract_schema_guided/extract_schema_guided.sh new file mode 100644 index 00000000..d9890567 --- /dev/null +++ b/examples/extract/extract_schema_guided/extract_schema_guided.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml diff --git a/examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml b/examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml new file mode 100644 index 00000000..1a25e196 --- /dev/null +++ b/examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml @@ -0,0 +1,35 @@ +global_params: + working_dir: cache + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/extract_demo.txt + + - id: chunk + op_name: chunk + type: map_batch + dependencies: + - read + execution_params: + replicas: 4 + params: + chunk_size: 20480 # larger chunk size for better context + chunk_overlap: 2000 + + - id: extract + op_name: extract + type: map_batch + dependencies: + - chunk + execution_params: + replicas: 1 + batch_size: 128 + params: + method: schema_guided + schema_path: graphgen/templates/extraction/schemas/legal_contract.json diff --git a/examples/generate/generate_aggregated_qa/README.md b/examples/generate/generate_aggregated_qa/README.md new file mode 100644 index 00000000..ab08693b --- /dev/null +++ 
b/examples/generate/generate_aggregated_qa/README.md @@ -0,0 +1,3 @@ +# Generate Aggregated QAs + +Aggregated mode is one of three question-answering scenarios in GraphGen (alongside atomic and multi-hop) designed to generate synthetic training data that incorporates complex, integrated knowledge from multiple sources. \ No newline at end of file diff --git a/examples/generate/generate_aggregated_qa/aggregated_config.yaml b/examples/generate/generate_aggregated_qa/aggregated_config.yaml new file mode 100644 index 00000000..5957dff0 --- /dev/null +++ b/examples/generate/generate_aggregated_qa/aggregated_config.yaml @@ -0,0 +1,79 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read_files # id is unique in the pipeline, and can be referenced by other steps + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + - id: chunk_documents + op_name: chunk + type: map_batch + dependencies: + - read_files + execution_params: + replicas: 4 + params: + chunk_size: 1024 # chunk size for text splitting + chunk_overlap: 100 # chunk overlap for text splitting + + - id: build_kg + op_name: build_kg + type: map_batch + dependencies: + - chunk_documents + execution_params: + replicas: 1 + batch_size: 128 + + - id: quiz + op_name: quiz + type: aggregate + dependencies: + - build_kg + execution_params: + replicas: 1 + batch_size: 128 + params: + quiz_samples: 2 # number of quiz samples to generate + concurrency_limit: 200 + + - id: judge + op_name: judge + type: map_batch + dependencies: + - quiz + execution_params: + replicas: 1 + batch_size: 128 + + - id: partition + op_name: partition + type: aggregate + dependencies: + - judge + params: + method: ece # ece is a custom partition method based on comprehension loss + method_params: + max_units_per_community: 20 # max nodes and edges per community + min_units_per_community: 5 # min nodes and edges per community + max_tokens_per_community: 10240 # max tokens per community + unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: aggregated # atomic, aggregated, multi_hop, cot, vqa + data_format: ChatML # Alpaca, Sharegpt, ChatML diff --git a/examples/generate/generate_aggregated_qa/generate_aggregated.sh b/examples/generate/generate_aggregated_qa/generate_aggregated.sh new file mode 100644 index 00000000..9169896b --- /dev/null +++ b/examples/generate/generate_aggregated_qa/generate_aggregated.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/generate/generate_aggregated_qa/aggregated_config.yaml diff --git a/examples/generate/generate_atomic_qa/README.md b/examples/generate/generate_atomic_qa/README.md new file mode 100644 index 00000000..e979b182 --- /dev/null +++ b/examples/generate/generate_atomic_qa/README.md @@ -0,0 +1,3 @@ +# Generate Atomic QAs + +Atomic mode generates question-answer pairs that test basic, isolated knowledge from individual facts or relationships in the knowledge graph. 
\ No newline at end of file diff --git a/examples/generate/generate_atomic_qa/atomic_config.yaml b/examples/generate/generate_atomic_qa/atomic_config.yaml new file mode 100644 index 00000000..826302d9 --- /dev/null +++ b/examples/generate/generate_atomic_qa/atomic_config.yaml @@ -0,0 +1,55 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/json_demo.json + + - id: chunk + op_name: chunk + type: map_batch + dependencies: + - read + execution_params: + replicas: 4 + params: + chunk_size: 1024 + chunk_overlap: 100 + + - id: build_kg + op_name: build_kg + type: map_batch + execution_params: + replicas: 1 + batch_size: 128 + dependencies: + - chunk + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_kg + params: + method: dfs + method_params: + max_units_per_community: 1 + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: atomic + data_format: Alpaca diff --git a/examples/generate/generate_atomic_qa/generate_atomic.sh b/examples/generate/generate_atomic_qa/generate_atomic.sh new file mode 100644 index 00000000..7f5339f5 --- /dev/null +++ b/examples/generate/generate_atomic_qa/generate_atomic.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/generate/generate_atomic_qa/atomic_config.yaml diff --git a/examples/generate/generate_cot_qa/README.md b/examples/generate/generate_cot_qa/README.md new file mode 100644 index 00000000..37afe9c7 --- /dev/null +++ b/examples/generate/generate_cot_qa/README.md @@ -0,0 +1 @@ +# Generate CoT QAs diff --git a/examples/generate/generate_cot_qa/cot_config.yaml b/examples/generate/generate_cot_qa/cot_config.yaml new file mode 100644 index 00000000..bb9b49c7 --- /dev/null +++ b/examples/generate/generate_cot_qa/cot_config.yaml @@ -0,0 +1,57 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/txt_demo.txt + + - id: chunk + op_name: chunk + type: map_batch + dependencies: + - read + execution_params: + replicas: 4 + params: + chunk_size: 1024 + chunk_overlap: 100 + + - id: build_kg + op_name: build_kg + type: map_batch + execution_params: + replicas: 1 + batch_size: 128 + dependencies: + - chunk + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_kg + params: + method: leiden + method_params: + max_size: 20 + use_lcc: false + random_seed: 42 + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: cot + data_format: Sharegpt diff --git a/examples/generate/generate_cot_qa/generate_cot.sh b/examples/generate/generate_cot_qa/generate_cot.sh new file mode 100644 index 00000000..78d17491 --- /dev/null +++ b/examples/generate/generate_cot_qa/generate_cot.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/generate/generate_cot_qa/cot_config.yaml diff --git a/examples/generate/generate_multi_hop_qa/README.md 
b/examples/generate/generate_multi_hop_qa/README.md new file mode 100644 index 00000000..dcee73be --- /dev/null +++ b/examples/generate/generate_multi_hop_qa/README.md @@ -0,0 +1 @@ +# Generate Multi-hop QAs diff --git a/examples/generate/generate_multi_hop_qa/generate_multi_hop.sh b/examples/generate/generate_multi_hop_qa/generate_multi_hop.sh new file mode 100644 index 00000000..3730ce9e --- /dev/null +++ b/examples/generate/generate_multi_hop_qa/generate_multi_hop.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/generate/generate_multi_hop_qa/multi_hop_config.yaml diff --git a/examples/generate/generate_multi_hop_qa/multi_hop_config.yaml b/examples/generate/generate_multi_hop_qa/multi_hop_config.yaml new file mode 100644 index 00000000..a5f42b40 --- /dev/null +++ b/examples/generate/generate_multi_hop_qa/multi_hop_config.yaml @@ -0,0 +1,58 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/csv_demo.csv + + - id: chunk + op_name: chunk + type: map_batch + dependencies: + - read + execution_params: + replicas: 4 + params: + chunk_size: 1024 + chunk_overlap: 100 + + - id: build_kg + op_name: build_kg + type: map_batch + dependencies: + - chunk + execution_params: + replicas: 1 + batch_size: 128 + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_kg + params: + method: ece + method_params: + max_units_per_community: 3 + min_units_per_community: 3 + max_tokens_per_community: 10240 + unit_sampling: random + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: multi_hop + data_format: ChatML diff --git a/examples/generate/generate_omics_qa/README.md b/examples/generate/generate_omics_qa/README.md new file mode 100644 index 00000000..9aca9561 --- /dev/null +++ b/examples/generate/generate_omics_qa/README.md @@ -0,0 +1,216 @@ +# Multi-omics Knowledge Graph QA Generation + +This example demonstrates how to build knowledge graphs from multi-omics data (DNA, RNA, protein) and generate question-answer pairs using the unified `omics_qa` method. + +## Pipeline Overview + +The pipeline includes the following steps: + +1. **read**: Read input files (JSON/JSONL format with sequence queries or protein data) +2. **search**: Search biological databases (NCBI for DNA, RNAcentral for RNA, UniProt for protein) - *optional if input already contains search results* +3. **chunk**: Chunk sequences and metadata +4. **build_kg**: Extract entities and relationships to build knowledge graph +5. **partition**: Partition the knowledge graph into communities using anchor-based BFS +6. **generate**: Generate QA pairs from partitioned communities with automatic molecule caption extraction + +## Key Features + +- **Unified QA Generation**: Single `omics_qa` method supports DNA, RNA, and Protein +- **Automatic Caption Extraction**: Automatically extracts and attaches molecule-specific information (dna/rna/protein captions) to each QA pair +- **Flexible Configuration**: Easy to switch between DNA, RNA, and Protein by changing input file and data source +- **Anchor-based Partitioning**: Uses molecule type as anchor for BFS partitioning (dna/rna/protein) + +## Quick Start + +### 1. 
Configure Input Data + +Edit `omics_qa_config.yaml` to set the input file path: + +**For DNA:** +```yaml +input_path: + - examples/input_examples/search_dna_demo.jsonl +``` + +**For RNA:** +```yaml +input_path: + - examples/input_examples/search_rna_demo.jsonl +``` + +**For Protein:** +```yaml +input_path: + - examples/input_examples/search_protein_demo.jsonl +``` + +### 2. Configure Data Source + +Set the appropriate data source and parameters in the `search_data` node: + +**For DNA (NCBI):** +```yaml +data_sources: [ncbi] +ncbi_params: + email: your_email@example.com # Required! + tool: GraphGen + use_local_blast: true + local_blast_db: refseq_release/refseq_release + blast_num_threads: 2 + max_concurrent: 5 +``` + +**For RNA (RNAcentral):** +```yaml +data_sources: [rnacentral] +rnacentral_params: + use_local_blast: true + local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD + blast_num_threads: 2 + max_concurrent: 5 +``` + +**For Protein (UniProt):** +```yaml +data_sources: [uniprot] +uniprot_params: + use_local_blast: true + local_blast_db: ${RELEASE}/uniprot_sprot + blast_num_threads: 2 + max_concurrent: 5 +``` + +### 3. Configure Anchor Type + +Set the `anchor_type` in the `partition` node to match your molecule type: + +```yaml +partition: + params: + method: anchor_bfs + method_params: + anchor_type: protein # Change to "dna" or "rna" as needed + max_units_per_community: 10 +``` + +### 4. Run the Pipeline + +```bash +./generate_omics_qa.sh +``` + +Or run directly with Python: + +```bash +python3 -m graphgen.run \ + --config_file examples/generate/generate_omics_qa/omics_qa_config.yaml \ + --output_dir cache/ +``` + +## Input Format + +### For DNA/RNA (JSONL format): +```jsonl +{"type": "text", "content": "BRCA1"} +{"type": "text", "content": ">query\nATGCGATCG..."} +{"type": "text", "content": "ATGCGATCG..."} +``` + +### For Protein (JSONL format): +```jsonl +{"type": "text", "content": "P01308"} +{"type": "text", "content": "insulin"} +{"type": "text", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +``` + +## Output Format + +The `omics_qa` method automatically extracts and attaches molecule-specific captions to QA pairs: + +### Alpaca Format: +```json +{ + "instruction": "What is the function of this protein?", + "input": "", + "output": "The protein functions as...", + "dna": {...}, # DNA caption (if molecule_type is DNA) + "rna": {...}, # RNA caption (if molecule_type is RNA) + "protein": {...} # Protein caption (if molecule_type is protein) +} +``` + +### ChatML Format: +```json +{ + "messages": [ + { + "role": "user", + "content": [ + { + "text": "What is the function of this protein?", + "dna": {...}, + "rna": {...}, + "protein": {...} + } + ] + }, + { + "role": "assistant", + "content": "The protein functions as..." + } + ] +} +``` + +## Caption Information + +The generator automatically extracts relevant caption information based on molecule type: + +- **DNA**: gene_name, gene_description, organism, chromosome, genomic_location, function, gene_type, etc. +- **RNA**: rna_type, description, organism, related_genes, gene_name, so_term, modifications, etc. +- **Protein**: protein_name, gene_names, organism, function, sequence, entry_name, etc. 
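+
+For illustration only, a minimal sketch of what an attached protein caption might look like is shown below. The field names come from the list above; the values are assumptions drawn from the public UniProt entry P01308 (human insulin), which is also used in the protein demo input, and the exact contents will depend on your search results:
+
+```json
+{
+  "protein": {
+    "protein_name": "Insulin",
+    "gene_names": "INS",
+    "organism": "Homo sapiens",
+    "entry_name": "INS_HUMAN",
+    "function": "Regulates blood glucose by promoting cellular glucose uptake"
+  }
+}
+```
+
+In ChatML format the same object is placed alongside the `text` field inside the user message content, as shown in the Output Format section above.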
+ +## Configuration Options + +### Chunking Parameters +- `chunk_size`: Size for text metadata chunks (default: 1024) +- `chunk_overlap`: Overlap for text chunks (default: 100) +- `sequence_chunk_size`: Size for sequence chunks (default: 1000) +- `sequence_chunk_overlap`: Overlap for sequence chunks (default: 100) + +### Partition Parameters +- `method`: `anchor_bfs` (recommended for omics data) +- `anchor_type`: `dna`, `rna`, or `protein` (must match your data type) +- `max_units_per_community`: Maximum nodes and edges per community (default: 10) + +### Generation Parameters +- `method`: `omics_qa` (unified method for DNA/RNA/Protein) +- `data_format`: `Alpaca`, `ChatML`, or `Sharegpt` + +## Notes + +- **NCBI requires an email address** - Make sure to set `email` in `ncbi_params` +- **Anchor type must match molecule type** - Set `anchor_type` to match your data (dna/rna/protein) +- **Local BLAST** can be enabled if you have local databases set up (see `examples/search/build_db/`) +- **Caption extraction** is automatic - The generator detects molecule type and extracts relevant caption information +- Adjust `max_concurrent` based on your system resources and API rate limits + +## Examples + +### Generate QA for Protein Data +1. Set `input_path` to `examples/input_examples/search_protein_demo.jsonl` +2. Set `data_sources: [uniprot]` +3. Set `anchor_type: protein` +4. Run `./generate_omics_qa.sh` + +### Generate QA for DNA Data +1. Set `input_path` to `examples/input_examples/search_dna_demo.jsonl` +2. Set `data_sources: [ncbi]` +3. Set `anchor_type: dna` +4. Run `./generate_omics_qa.sh` + +### Generate QA for RNA Data +1. Set `input_path` to `examples/input_examples/search_rna_demo.jsonl` +2. Set `data_sources: [rnacentral]` +3. Set `anchor_type: rna` +4. 
Run `./generate_omics_qa.sh` diff --git a/examples/generate/generate_omics_qa/generate_omics_qa.sh b/examples/generate/generate_omics_qa/generate_omics_qa.sh new file mode 100755 index 00000000..3afb129e --- /dev/null +++ b/examples/generate/generate_omics_qa/generate_omics_qa.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.run \ + --config_file examples/generate/generate_omics_qa/omics_qa_config.yaml \ + --output_dir cache/ diff --git a/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh b/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh new file mode 100755 index 00000000..20b0b533 --- /dev/null +++ b/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.run \ + --config_file examples/generate/generate_omics_qa/omics_qa_config_searched.yaml \ + --output_dir cache/ diff --git a/examples/generate/generate_omics_qa/omics_qa_config.yaml b/examples/generate/generate_omics_qa/omics_qa_config.yaml new file mode 100644 index 00000000..22b9b26c --- /dev/null +++ b/examples/generate/generate_omics_qa/omics_qa_config.yaml @@ -0,0 +1,93 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read_files + op_name: read + type: source + dependencies: [] + params: + input_path: + # three input files to generate DNA, RNA, and Protein data together + - examples/input_examples/search_dna_demo.jsonl + - examples/input_examples/search_rna_demo.jsonl + - examples/input_examples/search_protein_demo.jsonl + + - id: search_data + op_name: search + type: map_batch + dependencies: + - read_files + execution_params: + replicas: 1 + batch_size: 10 + params: + data_sources: [ncbi, rnacentral, uniprot] # Multi-omics: use all three data sources + # DNA search parameters + ncbi_params: + email: your_email@example.com # Required for NCBI + tool: GraphGen + use_local_blast: true + local_blast_db: databases/refseq_232_old/refseq_232 + blast_num_threads: 2 + max_concurrent: 5 + # RNA search parameters + rnacentral_params: + use_local_blast: true + local_blast_db: databases/rnacentral_merged_20251213/rnacentral_merged_20251213 + blast_num_threads: 2 + max_concurrent: 5 + # Protein search parameters + uniprot_params: + use_local_blast: true + # local_blast_db: ${RELEASE}/uniprot_sprot + local_blast_db: databases/2025_04/uniprot_sprot + blast_num_threads: 2 + max_concurrent: 5 + + - id: chunk_documents + op_name: chunk + type: map_batch + dependencies: + - search_data + execution_params: + replicas: 4 + params: + chunk_size: 1024 # chunk size for text splitting + chunk_overlap: 100 # chunk overlap for text splitting + sequence_chunk_size: 1000 # For sequence chunks (bp for DNA/RNA, aa for protein) + sequence_chunk_overlap: 100 + + - id: build_kg + op_name: build_kg + type: map_batch + dependencies: + - chunk_documents + execution_params: + replicas: 1 + batch_size: 128 + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_kg + params: + method: anchor_bfs # partition method + method_params: + anchor_type: [dna, rna, protein] # Multi-omics: support multiple anchor types (list or single string) + max_units_per_community: 10 # max nodes and edges per community + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: omics_qa # unified QA generation method for DNA/RNA/Protein + 
data_format: ChatML # Alpaca, Sharegpt, ChatML diff --git a/examples/generate/generate_omics_qa/omics_qa_config_searched.yaml b/examples/generate/generate_omics_qa/omics_qa_config_searched.yaml new file mode 100644 index 00000000..cf01bc65 --- /dev/null +++ b/examples/generate/generate_omics_qa/omics_qa_config_searched.yaml @@ -0,0 +1,73 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read_files + op_name: read + type: source + dependencies: [] + params: + input_path: + # Use pre-searched data files (skip search step) + # The search_service will automatically detect and skip search if data already contains search results + - examples/input_examples/searched_dna_demo.jsonl + - examples/input_examples/searched_rna_demo.jsonl + - examples/input_examples/searched_protein_demo.jsonl + + - id: search_data + op_name: search + type: map_batch + dependencies: + - read_files + execution_params: + replicas: 1 + batch_size: 10 + # Note: search_service will automatically detect pre-searched data and skip search, + # but it will still normalize the data format (ensure _doc_id, content, data_source fields exist) + + - id: chunk_documents + op_name: chunk + type: map_batch + dependencies: + - search_data + execution_params: + replicas: 4 + params: + chunk_size: 1024 # chunk size for text splitting + chunk_overlap: 100 # chunk overlap for text splitting + sequence_chunk_size: 1000 # For sequence chunks (bp for DNA/RNA, aa for protein) + sequence_chunk_overlap: 100 + + - id: build_kg + op_name: build_kg + type: map_batch + dependencies: + - chunk_documents + execution_params: + replicas: 1 + batch_size: 128 + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_kg + params: + method: anchor_bfs # partition method + method_params: + anchor_type: [dna, rna, protein] # Multi-omics: support multiple anchor types (list or single string) + max_units_per_community: 10 # max nodes and edges per community + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: omics_qa # unified QA generation method for DNA/RNA/Protein + data_format: ChatML # Alpaca, Sharegpt, ChatML diff --git a/examples/generate/generate_vqa/README.md b/examples/generate/generate_vqa/README.md new file mode 100644 index 00000000..42b13865 --- /dev/null +++ b/examples/generate/generate_vqa/README.md @@ -0,0 +1 @@ +# Generate VQAs \ No newline at end of file diff --git a/examples/generate/generate_vqa/generate_vqa.sh b/examples/generate/generate_vqa/generate_vqa.sh new file mode 100644 index 00000000..9cbb3d40 --- /dev/null +++ b/examples/generate/generate_vqa/generate_vqa.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/generate/generate_vqa/vqa_config.yaml diff --git a/examples/generate/generate_vqa/vqa_config.yaml b/examples/generate/generate_vqa/vqa_config.yaml new file mode 100644 index 00000000..7a869fe5 --- /dev/null +++ b/examples/generate/generate_vqa/vqa_config.yaml @@ -0,0 +1,59 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/vqa_demo.json + modalities: + - text + - image + 
+ - id: chunk + op_name: chunk + type: map_batch + dependencies: + - read + execution_params: + replicas: 4 + params: + chunk_size: 1024 + chunk_overlap: 100 + + - id: build_kg + op_name: build_kg + type: map_batch + dependencies: + - chunk + execution_params: + replicas: 1 + batch_size: 128 + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_kg + params: + method: anchor_bfs + method_params: + anchor_type: image + max_units_per_community: 10 + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: vqa + data_format: ChatML \ No newline at end of file diff --git a/examples/input_examples/csv_demo.csv b/examples/input_examples/csv_demo.csv new file mode 100644 index 00000000..e6c1b521 --- /dev/null +++ b/examples/input_examples/csv_demo.csv @@ -0,0 +1,5 @@ +type,content +text,云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。 +text,隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。 +text,"Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity." +text,"Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. 
Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture." diff --git a/examples/input_examples/extract_demo.txt b/examples/input_examples/extract_demo.txt new file mode 100644 index 00000000..7d399227 --- /dev/null +++ b/examples/input_examples/extract_demo.txt @@ -0,0 +1,16 @@ +LEASE AGREEMENT +This Lease Agreement (“Agreement”) is entered into on 15 March 2024 (“Signing Date”) by and between: +Lessor: +Riverside Commercial LLC, a Delaware limited-liability company with principal offices at 1800 Riverfront Drive, Wilmington, Delaware 19801, USA. +Lessee: +BrightWave Technologies Inc., a California corporation having its registered address at 2500 Innovation Way, San Jose, California 95110, USA. +Leased Space: +Approximately 5,200 rentable square feet on the entire third (3rd) floor of the Building known as Riverfront Plaza, 1800 Riverfront Drive, Wilmington, Delaware 19801, consisting of open-plan office areas, three private offices, one conference room, a kitchenette, and associated common-area restrooms. +Designated Use: +General office, research & development, and ancillary administrative purposes related to software and hardware technology businesses, and for no other purpose without prior written consent of Lessor. +Term: +The lease shall commence on 1 May 2024 (“Start Date”) and shall expire at 11:59 p.m. on 30 April 2029 (This is “End Date” or “Expiration Date of Lease”), unless sooner terminated in accordance with the terms hereof. +Term of Payment: +Base rent in the amount of Twenty-Five Thousand and 00/100 United States Dollars (US $25,000.00) per calendar month, due in advance on the first (1st) day of each month without demand, deduction, or set-off. Rent shall be paid by ACH transfer to the account designated in writing by Lessor. In addition, Lessee shall pay its proportionate share of Operating Expenses and Real Estate Taxes as defined in Section 5 of this Agreement. +Extension Period: +Provided Lessee is not in default beyond any applicable cure period, Lessee may extend this Lease for one (1) additional period of five (5) years by giving Lessor written notice at least twelve (12) months but not more than fifteen (15) months prior to the Expiration Date of Lease (i.e., 30 April 2029). The rent during the extension period shall be adjusted to the then-prevailing market rate as determined under Section 18.2 hereof. 
\ No newline at end of file diff --git a/examples/input_examples/graphml_demo.graphml b/examples/input_examples/graphml_demo.graphml new file mode 100644 index 00000000..9d90c84f --- /dev/null +++ b/examples/input_examples/graphml_demo.graphml @@ -0,0 +1,362 @@ + + + + + + + + + + + + "TECHNOLOGY" + "农业技术是指用于提高作物产量和质量的科学方法,云粳26号的推广依赖于相关的农业技术。" + chunk-59c892b0b8d999d3057866424ac38c5f + 3.277478752446756e-07 + 50 + + + "LOCATION" + "云南省是中国的一个省份,云粳26号在该省的农业研究和推广中发挥了重要作用。" + chunk-59c892b0b8d999d3057866424ac38c5f + 3.128163225886591e-07 + 44 + + + "ORGANIZATION" + "云南省农业科学院粮食作物研究所是一个科研机构,负责育成和研究粮食作物,包括早熟品种云粳26号。" + chunk-59c892b0b8d999d3057866424ac38c5f + 8.336807513843113e-05 + 69 + + + "MISSION" + "农业推广是指将新技术和新品种引入生产实践的过程,云粳26号被列为主导品种以促进其推广。" + chunk-59c892b0b8d999d3057866424ac38c5f + 7.628287841247848e-05 + 51 + + + "CONCEPT" + "食用价值是指食品在营养和口感方面的综合评价,云粳26号因其食味品质好而具有较高的食用价值。" + chunk-59c892b0b8d999d3057866424ac38c5f + 2.997547934514262 + 58 + + + "ORGANIZATION" + "农业部是中国的一个政府机构,负责农业政策的制定和推广,云粳26号被其列为主导品种。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.08804169791274698 + 48 + + + "CONCEPT" + "推广主导品种是指在特定区域内被推荐种植的主要作物品种,云粳26号在西南稻区被列为此类品种。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.24357749228919026 + 55 + + + "LOCATION" + "中海拔稻区是指海拔在1500至1800米之间的稻田区域,适合云粳26号的生长。" + chunk-59c892b0b8d999d3057866424ac38c5f + 5.926949838261465 + 45 + + + "KEYWORD" + "谷壳黄色是云粳26号的外观特征之一,表明其成熟状态。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.4537467972212248 + 35 + + + "KEYWORD" + "高抗稻瘟病是云粳26号的抗病性特征,增强了其种植的可行性。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.010406265234302301 + 45 + + + "KEYWORD" + "落粒性适中是云粳26号的特性之一,影响其收割和加工过程。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.39751120253379707 + 36 + + + "KEYWORD" + "有香味是云粳26号的食味品质之一,增加了其市场吸引力。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.1572458000758473 + 36 + + + "NATURE" + "稻瘟病是一种影响水稻的病害,云粳26号具有高抗稻瘟病的特性,增强了其种植的可行性。" + chunk-59c892b0b8d999d3057866424ac38c5f + 2.880133332981227e-07 + 63 + + + "CONCEPT" + "外观特点是指云粳26号的视觉特征,包括颖尖、谷壳颜色等,影响其市场接受度。" + chunk-59c892b0b8d999d3057866424ac38c5f + 1.668849735265512 + 51 + + + "KEYWORD" + "食味品质好是云粳26号的一个重要特性,影响消费者的选择。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.11790494848439569 + 33 + + + "WORK" + "云粳26号是由云南省农业科学院粮食作物研究所于2005年育成的早熟稻品种,具有良好的外观和食味品质。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.4054990190854595 + 68 + + + "NATURE" + "香味是指米粒在烹饪后散发的气味,云粳26号的米粒具有良好的香味,增加了其市场吸引力。" + chunk-59c892b0b8d999d3057866424ac38c5f + 1.704282376131091 + 60 + + + "DATE" + "2012年是云粳26号被农业部列为西南稻区农业推广主导品种的年份。" + chunk-59c892b0b8d999d3057866424ac38c5f + 1.6109373199087365 + 38 + + + "DATE" + "2005年是云粳26号品种育成的年份,标志着该品种的研发开始。" + chunk-59c892b0b8d999d3057866424ac38c5f + 2.5795454440481884 + 35 + + + "KEYWORD" + "米粒大是云粳26号的一个重要特征,通常与食用价值相关。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.8984304647517591 + 31 + + + "NATURE" + "落粒性是指稻谷在成熟后从稻穗上脱落的特性,云粳26号的落粒性适中,影响其收割和加工的便利性。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.32474459712132514 + 66 + + + "CONCEPT" + "早熟品种是指生长周期较短的作物品种,云粳26号属于这一类,适合在特定气候条件下种植。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.0018348240895041633 + 55 + + + "LOCATION" + "云南中海拔 1 500∼1 800 m 稻区是云粳26号适宜种植的区域,提供了特定的生长环境。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.03589417758929121 + 55 + + + "KEYWORD" + "颖尖无色、无芒是云粳26号的外观特点之一,影响其市场接受度。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.7739797890026089 + 39 + + + "NATURE" + "米粒是稻谷经过加工后得到的食用部分,云粳26号的米粒大且有香味,提升了其食用价值。" + chunk-59c892b0b8d999d3057866424ac38c5f + 7.555004360466345e-05 + 54 + + + "CONCEPT" + "适宜种植区域是指云粳26号推荐的种植环境,主要为云南中海拔 1 500∼1 800 m 稻区。" + 
chunk-59c892b0b8d999d3057866424ac38c5f + 0.4086719057062533 + 58 + + + "LOCATION" + "西南稻区是指中国西南地区,云粳26号在此区域被推广。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.004633279566562749 + 31 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的推广依赖于相关的农业技术,以提高其种植效率和产量。" + 4.469515500104365e-07 + 39 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号是云南省育成的品种,适合在该省的特定环境中种植。" + 0.0005762292913028245 + 38 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云南省农业科学院粮食作物研究所位于云南省,专注于该省的粮食作物研究和育种工作。" + 0.0004066592991875774 + 55 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号是由云南省农业科学院粮食作物研究所育成的早熟稻品种,体现了该机构的科研成果。" + 8.201402306440362e-05 + 60 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号被列为主导品种,旨在通过农业推广提高其种植和消费。" + 0.0016328476835250948 + 37 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号因其食味品质好而具有较高的食用价值,适合市场需求。" + 9.018137544091494e-05 + 39 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号在2012年被农业部列为西南稻区农业推广主导品种,显示了其在农业政策中的重要性。" + 0.0660654296723789 + 51 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号在西南稻区被列为推广主导品种,显示其在该地区的重要性。" + 0.004850300392718063 + 36 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "中海拔稻区是云粳26号推荐的种植区域,提供了适合其生长的气候条件。" + 0.4154816437684379 + 44 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "谷壳黄色是云粳26号的外观特征之一,影响其消费者的接受度。" + 0.055662130155681604 + 39 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "高抗稻瘟病是云粳26号的抗病性特征,增强了其种植的可行性。" + 0.007417909388497926 + 45 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "落粒性适中是云粳26号的特性之一,影响其收割和加工的便利性。" + 0.04371501902711867 + 39 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "有香味是云粳26号的食味品质之一,增加了其市场吸引力。" + 0.238045756594794 + 36 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号具有高抗稻瘟病的特性,表明其在种植过程中能够抵御这一病害。" + 0.006100528949900764 + 51 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的外观特点包括颖尖无色、无芒等,影响其市场表现和消费者选择。" + 0.4906831717572373 + 43 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "食味品质好是云粳26号的一个重要特性,影响消费者的选择。" + 0.013361831643390068 + 33 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号适宜在云南中海拔 1 500∼1 800 m 稻区种植,表明其对环境的适应性。" + 0.39592594481115617 + 52 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号适宜在云南中海拔 1 500∼1 800 m 稻区种植,表明其生长环境的要求。" + 0.5030086991360689 + 51 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "米粒大是云粳26号的重要特征,通常与其食用价值相关。" + 0.14715079462621358 + 31 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号是一种早熟品种,适合在特定的气候条件下种植,具有较短的生长周期。" + 0.008795430480896508 + 49 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号于2005年育成,标志着该品种的研发历程。" + 2.4413040461122195 + 31 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的香味是其食用品质的重要组成部分,吸引了消费者的关注。" + 2.379013142252285e-06 + 40 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "颖尖无色、无芒是云粳26号的外观特点,影响其市场表现。" + 2.96823331740096 + 35 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的落粒性适中,影响其在收割时的表现和加工效率。" + 0.11254188650424042 + 36 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "2012年云粳26号被农业部列为西南稻区农业推广主导品种,标志着其推广的重要性。" + 0.26297679025 + 46 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的米粒大且有香味,提升了其在市场上的竞争力。" + 0.0008942391715159215 + 37 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "2012年云粳26号在西南稻区被推广,表明其在该地区的适应性和重要性。" + 0.001021465373999705 + 39 + + diff --git a/examples/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg b/examples/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg new file mode 100644 index 00000000..2d9e1e8e Binary files /dev/null and b/examples/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg 
differ diff --git a/examples/input_examples/images/390516e39e77030092027ded523ee99e96ffa8b6df4476c9b12d7bb1dd20d635.jpg b/examples/input_examples/images/390516e39e77030092027ded523ee99e96ffa8b6df4476c9b12d7bb1dd20d635.jpg new file mode 100644 index 00000000..d1872ed4 Binary files /dev/null and b/examples/input_examples/images/390516e39e77030092027ded523ee99e96ffa8b6df4476c9b12d7bb1dd20d635.jpg differ diff --git a/examples/input_examples/images/4abc534d1dea2b706e44aaac26fe2ae309fee014082db00bc2d87187a6bb5dca.jpg b/examples/input_examples/images/4abc534d1dea2b706e44aaac26fe2ae309fee014082db00bc2d87187a6bb5dca.jpg new file mode 100644 index 00000000..66e790df Binary files /dev/null and b/examples/input_examples/images/4abc534d1dea2b706e44aaac26fe2ae309fee014082db00bc2d87187a6bb5dca.jpg differ diff --git a/examples/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg b/examples/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg new file mode 100644 index 00000000..755a0bd7 Binary files /dev/null and b/examples/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg differ diff --git a/examples/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg b/examples/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg new file mode 100644 index 00000000..59c133a6 Binary files /dev/null and b/examples/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg differ diff --git a/examples/input_examples/images/eda01885ec54011f15e7a4a56bea0129a0475b2ab5b920a4cff20a4fb623517d.jpg b/examples/input_examples/images/eda01885ec54011f15e7a4a56bea0129a0475b2ab5b920a4cff20a4fb623517d.jpg new file mode 100644 index 00000000..f2de8c08 Binary files /dev/null and b/examples/input_examples/images/eda01885ec54011f15e7a4a56bea0129a0475b2ab5b920a4cff20a4fb623517d.jpg differ diff --git a/examples/input_examples/json_demo.json b/examples/input_examples/json_demo.json new file mode 100644 index 00000000..a53101da --- /dev/null +++ b/examples/input_examples/json_demo.json @@ -0,0 +1,6 @@ +[ + {"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}, + {"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}, + {"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. 
Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}, + {"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} +] diff --git a/examples/input_examples/jsonl_demo.jsonl b/examples/input_examples/jsonl_demo.jsonl new file mode 100644 index 00000000..2b743578 --- /dev/null +++ b/examples/input_examples/jsonl_demo.jsonl @@ -0,0 +1,4 @@ +{"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} +{"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} +{"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. 
Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} +{"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} diff --git a/examples/input_examples/pdf_demo.pdf b/examples/input_examples/pdf_demo.pdf new file mode 100644 index 00000000..5a328e4c Binary files /dev/null and b/examples/input_examples/pdf_demo.pdf differ diff --git a/examples/input_examples/search_dna_demo.jsonl b/examples/input_examples/search_dna_demo.jsonl new file mode 100644 index 00000000..f423e1c1 --- /dev/null +++ b/examples/input_examples/search_dna_demo.jsonl @@ -0,0 +1,4 @@ +{"type": "text", "content": "NG_033923"} +{"type": "text", "content": "NG_056118"} +{"type": "text", "content": ">query\nACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} +{"type": "text", "content": "ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} diff --git a/examples/input_examples/search_protein_demo.jsonl b/examples/input_examples/search_protein_demo.jsonl new file mode 100644 index 00000000..e119cec8 --- /dev/null +++ b/examples/input_examples/search_protein_demo.jsonl @@ -0,0 +1,14 @@ +{"type": "text", "content": "P01308"} +{"type": "text", "content": "P68871"} +{"type": "text", "content": "P02768"} +{"type": "text", "content": "P04637"} +{"type": "text", "content": "insulin"} +{"type": "text", "content": "hemoglobin"} +{"type": "text", "content": "p53"} +{"type": "text", "content": "BRCA1"} +{"type": "text", "content": "albumin"} +{"type": "text", "content": 
"MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +{"type": "text", "content": "MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} +{"type": "text", "content": "MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} +{"type": "text", "content": "MARVTVQDAVEKIGNRFDLVLVAARRARQMQVGGKDPLVPEENDKTTVIALREIEEGLINNQILDVRERQEQQEQEAAELQAVTAIAEGRR"} +{"type": "text", "content": "GSHMLCAISGKVPRRPVLSPKSRTIFEKSLLEQYVKDTGNDPITNEPLSIEEIVEIVPSAQ"} diff --git a/examples/input_examples/search_rna_demo.jsonl b/examples/input_examples/search_rna_demo.jsonl new file mode 100644 index 00000000..896473e2 --- /dev/null +++ b/examples/input_examples/search_rna_demo.jsonl @@ -0,0 +1,8 @@ +{"type": "text", "content": "hsa-let-7a-1"} +{"type": "text", "content": "XIST regulator"} +{"type": "text", "content": "URS0000123456"} +{"type": "text", "content": "URS0000000001"} +{"type": "text", "content": "URS0000000787"} +{"type": "text", "content": "GCAGTTCTCAGCCATGACAGATGGGAGTTTCGGCCCAATTGACCAGTATTCCTTACTGATAAGAGACACTGACCATGGAGTGGTTCTGGTGAGATGACATGACCCTCGTGAAGGGGCCTGAAGCTTCATTGTGTTTGTGTATGTTTCTCTCTTCAAAAATATTCATGACTTCTCCTGTAGCTTGATAAATATGTATATTTACACACTGCA"} +{"type": "text", "content": ">query\nCUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} +{"type": "text", "content": "CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} diff --git a/examples/input_examples/searched_dna_demo.jsonl b/examples/input_examples/searched_dna_demo.jsonl new file mode 100644 index 00000000..05778743 --- /dev/null +++ b/examples/input_examples/searched_dna_demo.jsonl @@ -0,0 +1,3 @@ +{"_doc_id":"doc-NG_011079","type":"dna","content":"Title: Homo sapiens ribosomal protein L35a pseudogene 6 (RPL35AP6) on chromosome 1\nSequence: ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_011079","gene_name":"RPL35AP6","gene_description":"ribosomal 
protein L35a pseudogene 6","organism":"Homo sapiens","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_011079","gene_synonyms":["RPL35A_3_191"],"gene_type":"other","chromosome":"1","genomic_location":"1-522","function":null,"title":"Homo sapiens ribosomal protein L35a pseudogene 6 (RPL35AP6) on chromosome 1","sequence":"ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG","sequence_length":522,"gene_id":"100271312","molecule_type_detail":"genomic region","_search_query":"ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} +{"_doc_id":"doc-NG_033923","type":"dna","content":"Title: Callithrix jacchus immunity-related GTPase family, M, pseudogene (IRGMP) on chromosome 2\nSequence: GAACTCCTGACCTCAGGTGATCCACCTGCTTTGGCCTCCCAAAATGCCAGGATTACAGGTATGAGCCACCACGCCCAGCCAGCATTGGGGTATATCGAAGGCAGAGGTCATGAATGTTGAGAGAGCCTCAGCAGATGGGGACTTGCCAGAGGTGGTCTCTGCCATCAAGGAGAGTTTGAAGATAGTGTTCAGGACACCAGTCAACATCGCTATGGCAGGGGACTCTGGCAATAGCATATCCACCTTCATCAGTGCACTTCAAATCGCAGGGCATGAGGCGAAGGCCTCACCTCCTACTGGGCTGGTAAAAGCTACCCAAAGATGTGCCTCCTATTTCTCTTCCCGCTTTCCAAATGTGGTGCTGTGGGATCTGCCTGGAGCAGGGTCTGCCACCAAAACTCTGGAGAACTACCTGATGGAAATGTAGTTCAACCAATATGACTTCATCATGGTTGCATCTGCACAATTCAGCATGAATCATGTGATCCTTGCCAAAACCATTGAGGACATGGGAAAGAAGTTCTACATTGTCTGGACCAAGCTGGACATGGATCTCAGCACAGGTGCCCTCCCAGAAGTGCAGCTACTGTAAATCAGAGAAAATGTCCTGGAAAGTCTCCAGAGGGAGCAGGTATGTGAACTCCCCATATTTATGGCCTCCAGCCTTGAACCTTTATTGCATGACTTCCCAAAGCTTAGAGACACATTGCAAAAGACTCATCCAAATTAGGTGCCATGGCCCTCTTCAAAACCTGTCCCACACCTGTGAGATGATCACGAATGACAAAGCAATCTCCCTGCAGAAGAAAACAACCATACAGTCTTTCCAG","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_033923","gene_name":"IRGMP","gene_description":"immunity-related GTPase family, M, pseudogene","organism":"Callithrix jacchus","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_033923","gene_synonyms":null,"gene_type":"other","chromosome":"2","genomic_location":"1-830","function":null,"title":"Callithrix jacchus immunity-related GTPase family, M, pseudogene (IRGMP) on chromosome 
2","sequence":"GAACTCCTGACCTCAGGTGATCCACCTGCTTTGGCCTCCCAAAATGCCAGGATTACAGGTATGAGCCACCACGCCCAGCCAGCATTGGGGTATATCGAAGGCAGAGGTCATGAATGTTGAGAGAGCCTCAGCAGATGGGGACTTGCCAGAGGTGGTCTCTGCCATCAAGGAGAGTTTGAAGATAGTGTTCAGGACACCAGTCAACATCGCTATGGCAGGGGACTCTGGCAATAGCATATCCACCTTCATCAGTGCACTTCAAATCGCAGGGCATGAGGCGAAGGCCTCACCTCCTACTGGGCTGGTAAAAGCTACCCAAAGATGTGCCTCCTATTTCTCTTCCCGCTTTCCAAATGTGGTGCTGTGGGATCTGCCTGGAGCAGGGTCTGCCACCAAAACTCTGGAGAACTACCTGATGGAAATGTAGTTCAACCAATATGACTTCATCATGGTTGCATCTGCACAATTCAGCATGAATCATGTGATCCTTGCCAAAACCATTGAGGACATGGGAAAGAAGTTCTACATTGTCTGGACCAAGCTGGACATGGATCTCAGCACAGGTGCCCTCCCAGAAGTGCAGCTACTGTAAATCAGAGAAAATGTCCTGGAAAGTCTCCAGAGGGAGCAGGTATGTGAACTCCCCATATTTATGGCCTCCAGCCTTGAACCTTTATTGCATGACTTCCCAAAGCTTAGAGACACATTGCAAAAGACTCATCCAAATTAGGTGCCATGGCCCTCTTCAAAACCTGTCCCACACCTGTGAGATGATCACGAATGACAAAGCAATCTCCCTGCAGAAGAAAACAACCATACAGTCTTTCCAG","sequence_length":830,"gene_id":"100409682","molecule_type_detail":"genomic region","_search_query":"NG_033923"} +{"_doc_id":"doc-NG_056118","type":"dna","content":"Title: Homo sapiens major histocompatibility complex, class II, DR beta 8 (pseudogene) (HLA-DRB8) on chromosome 6\nSequence: GCCAGAGCCTAGGTTTACAGAGAAGCAGACAAACAAAACAGCCAAACAAGGAGACTTACTCTGTCTTCATGACTCATTCCCTCTACATTTTTTCTTCTAGTCCATCCTAAGGTGACTGTGTATCCTTTAAAGACCCAGCCCCTGCAGCACCACAACCTCCTGGTCTGCTCTGTGAGTGGTTTCTGTCCAGCCAGCATTGAAGTCAGGTGGTTCCGGAACGGCCAGGAAGAGAAGGCTGGGGTGGTGTCCACAGGCCTGATCCAGAATGGAGACTGGACCTTCCAGACACTGATGATGCTGGAAACAGTTCCTCAGAGTGGAGAGGTTTACACCTGCCAAGTGGAGCATCCAAGCATGATGAGCCCTCTCACGGTGCAATGGAGTTAGCAGCTTTCTGACTTCATAAATTTTTCACCCAGTAAGTACAGGACTGTGCTAATCCCTGAGTGTCAGGTTTCTCCTCTCCCACATCCTATTTTCATTTGCTCCATATTCTCATCTCCATCAGCACAGGTCACTGGGGATAGCCCTGTAATCATTTCTAAAAGCACCTGTACCCCATGGTAAAGCAGTCATGCCTGCCAGGCGGGAGAGGCTGTCTCTCTTTTGAACCTCCCCATGATGGCACAGGTCAGGGTCACCCACTCTCCCTGGCTCCAGGCCCTGCCTCTGGGTCTGAGATTGTATTTCTGCTGCTGTTGCTCTGGGTTGTTTGTTGTGATCTGAGAAGAGGAGAACTGTAGGGGTCTTCCTGGCATGAGGGGAGTCCAATCCCAGCTCTGCCTTTTATTAGCTCTGTCACTCTAGACAAACTACTAAACCTCTTTGAGTCTCAGGATTTCTGTGGATCAGATGTCAAAGTCATGCCTTACATCAAGGCTGTAATATTTGAATGAGTTTGAGGCCTAACCTTGTAACTGTTCAGTGTGATCTGAAAACCTTTTTTCCCCAGAAATAGCTAGTTATTTTAGTTCTTGCAGGGCAGCCTTCTTCCCCATTTTCAAAGCTCTGAATCTCAGTATCTCAATTACAGAGGTTCAATTTGGGATAAAAATCACTAAACCTGGCTTCCACTCTCAGGAGCATGGTCTGAATCTGCACAGAGCAAGATGCTGAGTGGAGTCGGGGGCTTTGTGCTGGGCCTGCTCTTCCTTGGGGCCGGGCTGTTTCTCTACTTCAGGAATCAGAAAGGTGAGGAACCTTTCGTAGCTGGCTCTCTCCATAGACTTTTCTGGAGGAGGAAATATGGCTTTGCAGAGGTTAGTTCTCAGTATATGAGTGGCCCTGGATAAAGCCTTTCTTTCCCAAAACGACCTCCAATGTCCCGCTAATCCAGAAATCATCAGTGCATGGTTACTATGTCAAAGCATAATAGCTTATGGCCTGCAGAGAGAAAAGAAAGGCTAACAAGTAGGGATCCTTTGGTTGGAGATCCTGGAGCAAATTAAGGAAGAGCCACTAAGGTTAATACAATTACACTGGATCCTATGACAGACACTTCACGCTTCAGGGGTCACGTGGTGAGTTTCTGCTCCTCTCTGCCCTGGTTCATGTAAGTTGTGGTGTTAGAGAAATCTCAGGTGGGAGATCTGGGGCTGGGATATTGTGTTGGAGGACAGATTTGCTTCCATATCTTTTTTCTTTTTTCTTTTTTTTGAGACGGAGTCTCGCTCTGTCCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAACCTCCTTCTCCCGGATTCAAGTGATTCTCCTGCCTCAACCTCCCGAGTAGCTGGGACTATAGGCACCTGCCACCACGCCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAAGATGGTCTCGATCTCTTGACCTTGTGATCCACCCAACTTGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCACCCGGCCTGCTTCCATATCTTTTAAATGTGTATCTTTTCCCCTTTTTCCCAGGACACTCTGGACTTCAGCCAACAGGTAATACCTTTTCATTCTCTTTTAGAAACAGATTCGCTTTCCTAGAATGATGGTAGAGGTGATAAGGGATGAGACAGAAATAATAGGAAAGACTTTGGATCCAAATTTCTGATCAGGCAATTTACGCCAAAACTCCTCTCTACTTAGAAAAGGCCTGTGCTTGGCCAGGCGCAGTAGCTCATGCCTGTAATCTCAGCACTTTGGGAGGCTGAGGCGGGTGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACAAGGAGAAACCTTGTCTCTACTAAAAATACAAAAAAAATTAGCCATGCGTGGTGGCGCATGCCTGTAATTCCAGCTACTGAGGAGGCTGAGGTAGGAGAATGGTTTGAAGCTGGGAGGCAGAGGTTGTGGTAAGCGCACCACTGCACTCCAGCC
TGGGCAACAAGAGTGAAACTCCATCTGAAAAAATGAATAAATAAAAAATAAAAGGCCAGTGCTCTGCAGTAGTATTGGCTCAGGGAGACTTAGCAACTTGTTTTTCTTCTTCCTGTACTGCTTTCATCTGAGTCCCTGAAAGAGGGGGAAAGAAGCTGTTAGTAGAGCCATGTCTGAAAACAACACTCTCCTGTGTCTTCTGCAGGACTCCTGAACTGAAGTGAAGATGACCACATTCAAGGAGGAAACTTCTGCCCCAGCTTTGCAGGAGGAAAAGCTTTTCCGCTTGGCTCTTTTTTTTTTTTTTAGTTTTATTTAT","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_056118","gene_name":"HLA-DRB8","gene_description":"major histocompatibility complex, class II, DR beta 8 (pseudogene)","organism":"Homo sapiens","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_056118","gene_synonyms":null,"gene_type":"other","chromosome":"6","genomic_location":"1-2737","function":null,"title":"Homo sapiens major histocompatibility complex, class II, DR beta 8 (pseudogene) (HLA-DRB8) on chromosome 6","sequence":"GCCAGAGCCTAGGTTTACAGAGAAGCAGACAAACAAAACAGCCAAACAAGGAGACTTACTCTGTCTTCATGACTCATTCCCTCTACATTTTTTCTTCTAGTCCATCCTAAGGTGACTGTGTATCCTTTAAAGACCCAGCCCCTGCAGCACCACAACCTCCTGGTCTGCTCTGTGAGTGGTTTCTGTCCAGCCAGCATTGAAGTCAGGTGGTTCCGGAACGGCCAGGAAGAGAAGGCTGGGGTGGTGTCCACAGGCCTGATCCAGAATGGAGACTGGACCTTCCAGACACTGATGATGCTGGAAACAGTTCCTCAGAGTGGAGAGGTTTACACCTGCCAAGTGGAGCATCCAAGCATGATGAGCCCTCTCACGGTGCAATGGAGTTAGCAGCTTTCTGACTTCATAAATTTTTCACCCAGTAAGTACAGGACTGTGCTAATCCCTGAGTGTCAGGTTTCTCCTCTCCCACATCCTATTTTCATTTGCTCCATATTCTCATCTCCATCAGCACAGGTCACTGGGGATAGCCCTGTAATCATTTCTAAAAGCACCTGTACCCCATGGTAAAGCAGTCATGCCTGCCAGGCGGGAGAGGCTGTCTCTCTTTTGAACCTCCCCATGATGGCACAGGTCAGGGTCACCCACTCTCCCTGGCTCCAGGCCCTGCCTCTGGGTCTGAGATTGTATTTCTGCTGCTGTTGCTCTGGGTTGTTTGTTGTGATCTGAGAAGAGGAGAACTGTAGGGGTCTTCCTGGCATGAGGGGAGTCCAATCCCAGCTCTGCCTTTTATTAGCTCTGTCACTCTAGACAAACTACTAAACCTCTTTGAGTCTCAGGATTTCTGTGGATCAGATGTCAAAGTCATGCCTTACATCAAGGCTGTAATATTTGAATGAGTTTGAGGCCTAACCTTGTAACTGTTCAGTGTGATCTGAAAACCTTTTTTCCCCAGAAATAGCTAGTTATTTTAGTTCTTGCAGGGCAGCCTTCTTCCCCATTTTCAAAGCTCTGAATCTCAGTATCTCAATTACAGAGGTTCAATTTGGGATAAAAATCACTAAACCTGGCTTCCACTCTCAGGAGCATGGTCTGAATCTGCACAGAGCAAGATGCTGAGTGGAGTCGGGGGCTTTGTGCTGGGCCTGCTCTTCCTTGGGGCCGGGCTGTTTCTCTACTTCAGGAATCAGAAAGGTGAGGAACCTTTCGTAGCTGGCTCTCTCCATAGACTTTTCTGGAGGAGGAAATATGGCTTTGCAGAGGTTAGTTCTCAGTATATGAGTGGCCCTGGATAAAGCCTTTCTTTCCCAAAACGACCTCCAATGTCCCGCTAATCCAGAAATCATCAGTGCATGGTTACTATGTCAAAGCATAATAGCTTATGGCCTGCAGAGAGAAAAGAAAGGCTAACAAGTAGGGATCCTTTGGTTGGAGATCCTGGAGCAAATTAAGGAAGAGCCACTAAGGTTAATACAATTACACTGGATCCTATGACAGACACTTCACGCTTCAGGGGTCACGTGGTGAGTTTCTGCTCCTCTCTGCCCTGGTTCATGTAAGTTGTGGTGTTAGAGAAATCTCAGGTGGGAGATCTGGGGCTGGGATATTGTGTTGGAGGACAGATTTGCTTCCATATCTTTTTTCTTTTTTCTTTTTTTTGAGACGGAGTCTCGCTCTGTCCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAACCTCCTTCTCCCGGATTCAAGTGATTCTCCTGCCTCAACCTCCCGAGTAGCTGGGACTATAGGCACCTGCCACCACGCCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAAGATGGTCTCGATCTCTTGACCTTGTGATCCACCCAACTTGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCACCCGGCCTGCTTCCATATCTTTTAAATGTGTATCTTTTCCCCTTTTTCCCAGGACACTCTGGACTTCAGCCAACAGGTAATACCTTTTCATTCTCTTTTAGAAACAGATTCGCTTTCCTAGAATGATGGTAGAGGTGATAAGGGATGAGACAGAAATAATAGGAAAGACTTTGGATCCAAATTTCTGATCAGGCAATTTACGCCAAAACTCCTCTCTACTTAGAAAAGGCCTGTGCTTGGCCAGGCGCAGTAGCTCATGCCTGTAATCTCAGCACTTTGGGAGGCTGAGGCGGGTGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACAAGGAGAAACCTTGTCTCTACTAAAAATACAAAAAAAATTAGCCATGCGTGGTGGCGCATGCCTGTAATTCCAGCTACTGAGGAGGCTGAGGTAGGAGAATGGTTTGAAGCTGGGAGGCAGAGGTTGTGGTAAGCGCACCACTGCACTCCAGCCTGGGCAACAAGAGTGAAACTCCATCTGAAAAAATGAATAAATAAAAAATAAAAGGCCAGTGCTCTGCAGTAGTATTGGCTCAGGGAGACTTAGCAACTTGTTTTTCTTCTTCCTGTACTGCTTTCATCTGAGTCCCTGAAAGAGGGGGAAAGAAGCTGTTAGTAGAGCCATGTCTGAAAACAACACTCTCCTGTGTCTTCTGCAGGACTCCTGAACTGAAGTGAAGATGACCACATTCAAGGAGGAAACTTCTGCCCCAGCTTTGCAGGAGGAAAAGCTTTTCCGCTTGGCTCTTTTTTTTTTTTTTAGTT
TTATTTAT","sequence_length":2737,"gene_id":"3130","molecule_type_detail":"genomic region","_search_query":"NG_056118"} diff --git a/examples/input_examples/searched_protein_demo.jsonl b/examples/input_examples/searched_protein_demo.jsonl new file mode 100644 index 00000000..47ab02ad --- /dev/null +++ b/examples/input_examples/searched_protein_demo.jsonl @@ -0,0 +1,8 @@ +{"_doc_id":"doc-P01308","type":"protein","content":"Function: ['Insulin decreases blood glucose concentration. It increases cell permeability to monosaccharides, amino acids and fatty acids. It accelerates glycolysis, the pentose phosphate cycle, and glycogen synthesis in liver.']\nSequence: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P01308","entry_name":"INS_HUMAN","gene_names":[{"Name":"INS"}],"protein_name":"Insulin","organism":"Homo sapiens","sequence":"MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN","function":["Insulin decreases blood glucose concentration. It increases cell permeability to monosaccharides, amino acids and fatty acids. It accelerates glycolysis, the pentose phosphate cycle, and glycogen synthesis in liver."],"url":"https:\/\/www.uniprot.org\/uniprot\/P01308","_search_query":"P01308"} +{"_doc_id":"doc-Q6UWZ7","type":"protein","content":"Function: [\"Involved in DNA damage response and double-strand break (DSB) repair. Component of the BRCA1-A complex, acting as a central scaffold protein that assembles the various components of the complex and mediates the recruitment of BRCA1. The BRCA1-A complex specifically recognizes 'Lys-63'-linked ubiquitinated histones H2A and H2AX at DNA lesion sites, leading to target the BRCA1-BARD1 heterodimer to sites of DNA damage at DSBs. This complex also possesses deubiquitinase activity that specifically removes 'Lys-63'-linked ubiquitin on histones H2A and H2AX. 
{ECO:0000269|PubMed:17525340, ECO:0000269|PubMed:17643121, ECO:0000269|PubMed:17643122, ECO:0000269|PubMed:18077395, ECO:0000269|PubMed:19261748, ECO:0000269|PubMed:22357538, ECO:0000269|PubMed:26778126}.\"]\nSequence: MEGESTSAVLSGFVLGALAFQHLNTDSDTEGFLLGEVKGEAKNSITDSQMDDVEVVYTIDIQKYIPCYQLFSFYNSSGEVNEQALKKILSNVKKNVVGWYKFRRHSDQIMTFRERLLHKNLQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPLVVANLGMSEQLGYKTVSGSCMSTGFSRAVQTHSSKFFEEDGSLKEVHKINEMYASLQEELKSICKKVEDSEQAVDKLVKDVNRLKREIEKRRGAQIQAAREKNIQKDPQENIFLCQALRTFFPNSEFLHSCVMSLKNRHVSKSSCNYNHHLDVVDNLTLMVEHTDIPEASPASTPQIIKHKALDLDDRWQFKRSRLLDTQDKRSKADTGSSNQDKASKMSSPETDEEIEKMKGFGEYSRSPTF","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q6UWZ7","entry_name":"ABRX1_HUMAN","gene_names":[{"Name":"ABRAXAS1 {ECO:0000312|HGNC:HGNC:25829}","Synonyms":["ABRA1 {ECO:0000312|HGNC:HGNC:25829}","CCDC98","FAM175A {ECO:0000312|HGNC:HGNC:25829}"],"ORFNames":["UNQ496\/PRO1013"]}],"protein_name":"BRCA1-A complex subunit Abraxas 1 {ECO:0000312|HGNC:HGNC:25829}","organism":"Homo sapiens","sequence":"MEGESTSAVLSGFVLGALAFQHLNTDSDTEGFLLGEVKGEAKNSITDSQMDDVEVVYTIDIQKYIPCYQLFSFYNSSGEVNEQALKKILSNVKKNVVGWYKFRRHSDQIMTFRERLLHKNLQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPLVVANLGMSEQLGYKTVSGSCMSTGFSRAVQTHSSKFFEEDGSLKEVHKINEMYASLQEELKSICKKVEDSEQAVDKLVKDVNRLKREIEKRRGAQIQAAREKNIQKDPQENIFLCQALRTFFPNSEFLHSCVMSLKNRHVSKSSCNYNHHLDVVDNLTLMVEHTDIPEASPASTPQIIKHKALDLDDRWQFKRSRLLDTQDKRSKADTGSSNQDKASKMSSPETDEEIEKMKGFGEYSRSPTF","function":["Involved in DNA damage response and double-strand break (DSB) repair. Component of the BRCA1-A complex, acting as a central scaffold protein that assembles the various components of the complex and mediates the recruitment of BRCA1. The BRCA1-A complex specifically recognizes 'Lys-63'-linked ubiquitinated histones H2A and H2AX at DNA lesion sites, leading to target the BRCA1-BARD1 heterodimer to sites of DNA damage at DSBs. This complex also possesses deubiquitinase activity that specifically removes 'Lys-63'-linked ubiquitin on histones H2A and H2AX. {ECO:0000269|PubMed:17525340, ECO:0000269|PubMed:17643121, ECO:0000269|PubMed:17643122, ECO:0000269|PubMed:18077395, ECO:0000269|PubMed:19261748, ECO:0000269|PubMed:22357538, ECO:0000269|PubMed:26778126}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q6UWZ7","_search_query":"BRCA1"} +{"_doc_id":"doc-P27355","type":"protein","content":"Function: ['Responsible for the initial oxygenation of methane to methanol in methanotrophs. It also catalyzes the monohydroxylation of a variety of unactivated alkenes, alicyclic, aromatic and heterocyclic compounds.']\nSequence: MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDRAAVEATWIAKIKAAKSKYEADGIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKEPGVKVLHLQA","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P27355","entry_name":"MEMG_METTR","gene_names":[{"Name":"mmoZ"}],"protein_name":"Methane monooxygenase component A gamma chain","organism":"Methylosinus trichosporium.","sequence":"MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDRAAVEATWIAKIKAAKSKYEADGIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKEPGVKVLHLQA","function":["Responsible for the initial oxygenation of methane to methanol in methanotrophs. 
It also catalyzes the monohydroxylation of a variety of unactivated alkenes, alicyclic, aromatic and heterocyclic compounds."],"url":"https:\/\/www.uniprot.org\/uniprot\/P27355","_search_query":"MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} +{"_doc_id":"doc-Q96GG9","type":"protein","content":"Function: ['Part of an E3 ubiquitin ligase complex for neddylation (PubMed:18826954). Promotes neddylation of cullin components of E3 cullin-RING ubiquitin ligase complexes (PubMed:19617556, PubMed:23201271, PubMed:23401859, PubMed:26906416). Acts by binding to cullin-RBX1 complexes in the cytoplasm and promoting their nuclear translocation, enhancing recruitment of E2-NEDD8 (UBE2M-NEDD8) thioester to the complex, and optimizing the orientation of proteins in the complex to allow efficient transfer of NEDD8 from the E2 to the cullin substrates. Involved in the release of inhibitory effets of CAND1 on cullin-RING ligase E3 complex assembly and activity (PubMed:25349211, PubMed:28581483). Also acts as an oncogene facilitating malignant transformation and carcinogenic progression (By similarity). {ECO:0000250|UniProtKB:Q9QZ73, ECO:0000269|PubMed:18826954, ECO:0000269|PubMed:19617556, ECO:0000269|PubMed:23201271, ECO:0000269|PubMed:23401859, ECO:0000269|PubMed:25349211, ECO:0000269|PubMed:26906416, ECO:0000269|PubMed:28581483}.']\nSequence: MNKLKSSQKDKVRQFMIFTQSSEKTAVSCLSQNDWKLDVATDNFFQNPELYIRESVKGSLDRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q96GG9","entry_name":"DCNL1_HUMAN","gene_names":[{"Name":"DCUN1D1 {ECO:0000312|HGNC:HGNC:18184}","Synonyms":["DCN1 {ECO:0000303|PubMed:28581483}","DCUN1L1","RP42","SCCRO"]}],"protein_name":"DCN1-like protein 1 {ECO:0000305}","organism":"Homo sapiens","sequence":"MNKLKSSQKDKVRQFMIFTQSSEKTAVSCLSQNDWKLDVATDNFFQNPELYIRESVKGSLDRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV","function":["Part of an E3 ubiquitin ligase complex for neddylation (PubMed:18826954). Promotes neddylation of cullin components of E3 cullin-RING ubiquitin ligase complexes (PubMed:19617556, PubMed:23201271, PubMed:23401859, PubMed:26906416). Acts by binding to cullin-RBX1 complexes in the cytoplasm and promoting their nuclear translocation, enhancing recruitment of E2-NEDD8 (UBE2M-NEDD8) thioester to the complex, and optimizing the orientation of proteins in the complex to allow efficient transfer of NEDD8 from the E2 to the cullin substrates. Involved in the release of inhibitory effets of CAND1 on cullin-RING ligase E3 complex assembly and activity (PubMed:25349211, PubMed:28581483). Also acts as an oncogene facilitating malignant transformation and carcinogenic progression (By similarity). 
{ECO:0000250|UniProtKB:Q9QZ73, ECO:0000269|PubMed:18826954, ECO:0000269|PubMed:19617556, ECO:0000269|PubMed:23201271, ECO:0000269|PubMed:23401859, ECO:0000269|PubMed:25349211, ECO:0000269|PubMed:26906416, ECO:0000269|PubMed:28581483}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q96GG9","_search_query":"MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} +{"_doc_id":"doc-P68871","type":"protein","content":"Function: ['Involved in oxygen transport from the lung to the various peripheral tissues. {ECO:0000269|PubMed:28066926}.', 'LVV-hemorphin-7 potentiates the activity of bradykinin, causing a decrease in blood pressure.', '[Spinorphin]: Functions as an endogenous inhibitor of enkephalin-degrading enzymes such as DPP3, and as a selective antagonist of the P2RX3 receptor which is involved in pain signaling, these properties implicate it as a regulator of pain and inflammation.']\nSequence: MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P68871","entry_name":"HBB_HUMAN","gene_names":[{"Name":"HBB"}],"protein_name":"Hemoglobin subunit beta","organism":"Homo sapiens","sequence":"MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH","function":["Involved in oxygen transport from the lung to the various peripheral tissues. 
{ECO:0000269|PubMed:28066926}.","LVV-hemorphin-7 potentiates the activity of bradykinin, causing a decrease in blood pressure.","[Spinorphin]: Functions as an endogenous inhibitor of enkephalin-degrading enzymes such as DPP3, and as a selective antagonist of the P2RX3 receptor which is involved in pain signaling, these properties implicate it as a regulator of pain and inflammation."],"url":"https:\/\/www.uniprot.org\/uniprot\/P68871","_search_query":"P68871"} +{"_doc_id":"doc-P22939","type":"protein","content":"Sequence: MDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDADMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQSLKQLAEQSLDTSALEALADYIIQRNK","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P22939","entry_name":"ISPA_ECOLI","gene_names":[{"Name":"ispA","OrderedLocusNames":["b0421","JW0411"]}],"protein_name":"Farnesyl diphosphate synthase","organism":"Escherichia coli","sequence":"MDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDADMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQSLKQLAEQSLDTSALEALADYIIQRNK","function":[],"url":"https:\/\/www.uniprot.org\/uniprot\/P22939","_search_query":"MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +{"_doc_id":"doc-Q8I8V0","type":"protein","content":"Function: ['Component of several Gcn5-containing histone acetyltransferase complexes that regulate nucleosome organization; involved in acetylation of histone H3, particularly on Lys-10 (H3K9ac) and Lys-15 (H3K14ac) (PubMed:12482983, PubMed:12697829, PubMed:15340070, PubMed:19740772, PubMed:22796493). Regulates the transcription of a subset of genes during development; affects recruitment of RNA polymerase II (PubMed:19740772, PubMed:23336284). May be involved in the function of some acidic activation domains, which activate transcription at distant sites (PubMed:12697829). Involved in the p53-dependent apoptosis pathway response to DNA damage by genotoxic agents (PubMed:15340070, PubMed:16135810). {ECO:0000269|PubMed:12482983, ECO:0000269|PubMed:12697829, ECO:0000269|PubMed:15340070, ECO:0000269|PubMed:16135810, ECO:0000269|PubMed:19740772, ECO:0000269|PubMed:22796493, ECO:0000269|PubMed:23336284}.', '[Isoform B]: Component of the SAGA histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}.', '[Isoform A]: Component of the CHAT histone acetyltransferase complex, which predominantly acetylates histone H3. 
{ECO:0000269|PubMed:30559249}.']\nSequence: MTTIADLFTKYNCTNCQDDIQGIRVHCAECENFDLCLQCFAAGAEIGAHQNNHSYQFMDTGTSILSVFRGKGAWTAREEIRLLDAIEQYGFGNWEDISKHIETKSAEDAKEEYVNKFVNGTIGRATWTPAQSQRPRLIDHTGDDDAGPLGTNALSTLPPLEINSDEAMQLGYMPNRDSFEREYDPTAEQLISNISLSSEDTEVDVMLKLAHVDIYTRRLRERARRKRMVRDYQLVSNFFRNRNYAQQQGLTKEQREFRDRFRVYAQFYTCNEYERLLGSLEREKELRIRQSELYRYRYNGLTKIAECTHFEQHAATATHRSTGPYGHGKTDHTHTSNGSHRPPSSSLHSPQPNLRKVEMSSGGEASSNSIAPRNTLHIADPTCSGALLPSKNYLDSCRGSSAATMLQTTGMVMGVTVDSGATTGVTSTATTMANLPTNSAKGSQQHLQPLQQHPQLLQSGNQHKMQNEAAGGGSDQVPSMSLKLRTQLEELKHLPQPPGSELLSHNELDLCKKHNITPTTYLSVKTVCLSGAPSLGSPMETSLRKFFIKCGWLSH","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q8I8V0","entry_name":"TAD2B_DROME","gene_names":[{"Name":"Ada2b {ECO:0000312|FlyBase:FBgn0037555}","Synonyms":["Ada2S {ECO:0000303|PubMed:12697829}"],"ORFNames":["CG9638 {ECO:0000312|FlyBase:FBgn0037555}"]}],"protein_name":"Transcriptional adapter 2b {ECO:0000312|FlyBase:FBgn0037555}","organism":"Drosophila melanogaster","sequence":"MTTIADLFTKYNCTNCQDDIQGIRVHCAECENFDLCLQCFAAGAEIGAHQNNHSYQFMDTGTSILSVFRGKGAWTAREEIRLLDAIEQYGFGNWEDISKHIETKSAEDAKEEYVNKFVNGTIGRATWTPAQSQRPRLIDHTGDDDAGPLGTNALSTLPPLEINSDEAMQLGYMPNRDSFEREYDPTAEQLISNISLSSEDTEVDVMLKLAHVDIYTRRLRERARRKRMVRDYQLVSNFFRNRNYAQQQGLTKEQREFRDRFRVYAQFYTCNEYERLLGSLEREKELRIRQSELYRYRYNGLTKIAECTHFEQHAATATHRSTGPYGHGKTDHTHTSNGSHRPPSSSLHSPQPNLRKVEMSSGGEASSNSIAPRNTLHIADPTCSGALLPSKNYLDSCRGSSAATMLQTTGMVMGVTVDSGATTGVTSTATTMANLPTNSAKGSQQHLQPLQQHPQLLQSGNQHKMQNEAAGGGSDQVPSMSLKLRTQLEELKHLPQPPGSELLSHNELDLCKKHNITPTTYLSVKTVCLSGAPSLGSPMETSLRKFFIKCGWLSH","function":["Component of several Gcn5-containing histone acetyltransferase complexes that regulate nucleosome organization; involved in acetylation of histone H3, particularly on Lys-10 (H3K9ac) and Lys-15 (H3K14ac) (PubMed:12482983, PubMed:12697829, PubMed:15340070, PubMed:19740772, PubMed:22796493). Regulates the transcription of a subset of genes during development; affects recruitment of RNA polymerase II (PubMed:19740772, PubMed:23336284). May be involved in the function of some acidic activation domains, which activate transcription at distant sites (PubMed:12697829). Involved in the p53-dependent apoptosis pathway response to DNA damage by genotoxic agents (PubMed:15340070, PubMed:16135810). {ECO:0000269|PubMed:12482983, ECO:0000269|PubMed:12697829, ECO:0000269|PubMed:15340070, ECO:0000269|PubMed:16135810, ECO:0000269|PubMed:19740772, ECO:0000269|PubMed:22796493, ECO:0000269|PubMed:23336284}.","[Isoform B]: Component of the SAGA histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}.","[Isoform A]: Component of the CHAT histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q8I8V0","_search_query":"p53"} +{"_doc_id":"doc-P04637","type":"protein","content":"Function: ['Multifunctional transcription factor that induces cell cycle arrest, DNA repair or apoptosis upon binding to its target DNA sequence (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:35618207, PubMed:36634798, PubMed:38653238, PubMed:9840937). 
Acts as a tumor suppressor in many tumor types; induces growth arrest or apoptosis depending on the physiological circumstances and cell type (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17189187, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:38653238, PubMed:9840937). Negatively regulates cell division by controlling expression of a set of genes required for this process (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:9840937). One of the activated genes is an inhibitor of cyclin-dependent kinases. Apoptosis induction seems to be mediated either by stimulation of BAX and FAS antigen expression, or by repression of Bcl-2 expression (PubMed:12524540, PubMed:17189187). Its pro-apoptotic activity is activated via its interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 (PubMed:12524540). However, this activity is inhibited when the interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 is displaced by PPP1R13L\/iASPP (PubMed:12524540). In cooperation with mitochondrial PPIF is involved in activating oxidative stress-induced necrosis; the function is largely independent of transcription. Induces the transcription of long intergenic non-coding RNA p21 (lincRNA-p21) and lincRNA-Mkln1. LincRNA-p21 participates in TP53-dependent transcriptional repression leading to apoptosis and seems to have an effect on cell-cycle regulation. Implicated in Notch signaling cross-over. Prevents CDK7 kinase activity when associated to CAK complex in response to DNA damage, thus stopping cell cycle progression. Isoform 2 enhances the transactivation activity of isoform 1 from some but not all TP53-inducible promoters. Isoform 4 suppresses transactivation activity and impairs growth suppression mediated by isoform 1. Isoform 7 inhibits isoform 1-mediated apoptosis. Regulates the circadian clock by repressing CLOCK-BMAL1-mediated transcriptional activation of PER2 (PubMed:24051492). 
{ECO:0000269|PubMed:11025664, ECO:0000269|PubMed:12524540, ECO:0000269|PubMed:12810724, ECO:0000269|PubMed:15186775, ECO:0000269|PubMed:15340061, ECO:0000269|PubMed:17189187, ECO:0000269|PubMed:17317671, ECO:0000269|PubMed:17349958, ECO:0000269|PubMed:19556538, ECO:0000269|PubMed:20673990, ECO:0000269|PubMed:20959462, ECO:0000269|PubMed:22726440, ECO:0000269|PubMed:24051492, ECO:0000269|PubMed:24652652, ECO:0000269|PubMed:35618207, ECO:0000269|PubMed:36634798, ECO:0000269|PubMed:38653238, ECO:0000269|PubMed:9840937}.']\nSequence: MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P04637","entry_name":"P53_HUMAN","gene_names":[{"Name":"TP53","Synonyms":["P53"]}],"protein_name":"Cellular tumor antigen p53","organism":"Homo sapiens","sequence":"MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD","function":["Multifunctional transcription factor that induces cell cycle arrest, DNA repair or apoptosis upon binding to its target DNA sequence (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:35618207, PubMed:36634798, PubMed:38653238, PubMed:9840937). Acts as a tumor suppressor in many tumor types; induces growth arrest or apoptosis depending on the physiological circumstances and cell type (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17189187, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:38653238, PubMed:9840937). Negatively regulates cell division by controlling expression of a set of genes required for this process (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:9840937). One of the activated genes is an inhibitor of cyclin-dependent kinases. Apoptosis induction seems to be mediated either by stimulation of BAX and FAS antigen expression, or by repression of Bcl-2 expression (PubMed:12524540, PubMed:17189187). Its pro-apoptotic activity is activated via its interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 (PubMed:12524540). However, this activity is inhibited when the interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 is displaced by PPP1R13L\/iASPP (PubMed:12524540). In cooperation with mitochondrial PPIF is involved in activating oxidative stress-induced necrosis; the function is largely independent of transcription. Induces the transcription of long intergenic non-coding RNA p21 (lincRNA-p21) and lincRNA-Mkln1. 
LincRNA-p21 participates in TP53-dependent transcriptional repression leading to apoptosis and seems to have an effect on cell-cycle regulation. Implicated in Notch signaling cross-over. Prevents CDK7 kinase activity when associated to CAK complex in response to DNA damage, thus stopping cell cycle progression. Isoform 2 enhances the transactivation activity of isoform 1 from some but not all TP53-inducible promoters. Isoform 4 suppresses transactivation activity and impairs growth suppression mediated by isoform 1. Isoform 7 inhibits isoform 1-mediated apoptosis. Regulates the circadian clock by repressing CLOCK-BMAL1-mediated transcriptional activation of PER2 (PubMed:24051492). {ECO:0000269|PubMed:11025664, ECO:0000269|PubMed:12524540, ECO:0000269|PubMed:12810724, ECO:0000269|PubMed:15186775, ECO:0000269|PubMed:15340061, ECO:0000269|PubMed:17189187, ECO:0000269|PubMed:17317671, ECO:0000269|PubMed:17349958, ECO:0000269|PubMed:19556538, ECO:0000269|PubMed:20673990, ECO:0000269|PubMed:20959462, ECO:0000269|PubMed:22726440, ECO:0000269|PubMed:24051492, ECO:0000269|PubMed:24652652, ECO:0000269|PubMed:35618207, ECO:0000269|PubMed:36634798, ECO:0000269|PubMed:38653238, ECO:0000269|PubMed:9840937}."],"url":"https:\/\/www.uniprot.org\/uniprot\/P04637","_search_query":"P04637"} diff --git a/examples/input_examples/searched_rna_demo.jsonl b/examples/input_examples/searched_rna_demo.jsonl new file mode 100644 index 00000000..9ad088c0 --- /dev/null +++ b/examples/input_examples/searched_rna_demo.jsonl @@ -0,0 +1,6 @@ +{"_doc_id":"doc-URS0000123456","type":"rna","content":"Description: rRNA from 1 species\nSequence: CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000123456","rnacentral_id":"URS0000123456","sequence":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","sequence_length":282,"rna_type":"rRNA","description":"rRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000123456","organism":"uncultured Staphylococcus sp.","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"URS0000123456"} +{"_doc_id":"doc-URS00000088CC","type":"rna","content":"Description: lncRNA from 1 species\nSequence: GCAGUUCUCAGCCAUGACAGAUGGGAGUUUCGGCCCAAUUGACCAGUAUUCCUUACUGAUAAGAGACACUGACCAUGGAGUGGUUCUGGUGAGAUGACAUGACCCUCGUGAAGGGGCCUGAAGCUUCAUUGUGUUUGUGUAUGUUUCUCUCUUCAAAAAUAUUCAUGACUUCUCCUGUAGCUUGAUAAAUAUGUAUAUUUACACACUGCA","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS00000088CC","rnacentral_id":"URS00000088CC","sequence":"GCAGUUCUCAGCCAUGACAGAUGGGAGUUUCGGCCCAAUUGACCAGUAUUCCUUACUGAUAAGAGACACUGACCAUGGAGUGGUUCUGGUGAGAUGACAUGACCCUCGUGAAGGGGCCUGAAGCUUCAUUGUGUUUGUGUAUGUUUCUCUCUUCAAAAAUAUUCAUGACUUCUCCUGUAGCUUGAUAAAUAUGUAUAUUUACACACUGCA","sequence_length":210,"rna_type":"lncRNA","description":"lncRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS00000088CC","organism":"Homo 
sapiens","related_genes":["ENSG00000265458.1","lnc-C17orf62-1","ENSG00000265458","NONHSAG023099","HSALNG0119438","NONHSAG023099.2","ENSG00000265458.4","RP13-20L14.6","NARF-AS2"],"gene_name":"ENSG00000265458, ENSG00000265458.1, ENSG00000265458.4, HSALNG0119438, NARF-AS2, NONHSAG023099, NONHSAG023099.2, RP13-20L14.6, lnc-C17orf62-1","so_term":"antisense, ncRNA","modifications":null,"_search_query":"GCAGTTCTCAGCCATGACAGATGGGAGTTTCGGCCCAATTGACCAGTATTCCTTACTGATAAGAGACACTGACCATGGAGTGGTTCTGGTGAGATGACATGACCCTCGTGAAGGGGCCTGAAGCTTCATTGTGTTTGTGTATGTTTCTCTCTTCAAAAATATTCATGACTTCTCCTGTAGCTTGATAAATATGTATATTTACACACTGCA"} +{"_doc_id":"doc-URS000342178E","type":"rna","content":"Description: None misc RNA\nSequence: GGUUUUCGUAUAUCCUUAAUGAUAUGGUUUAAGGGCAAUACAUAGAAACCACAAAUUUCUUACUGCGAAAAUC","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS000342178E","rnacentral_id":"URS000342178E","sequence":"GGUUUUCGUAUAUCCUUAAUGAUAUGGUUUAAGGGCAAUACAUAGAAACCACAAAUUUCUUACUGCGAAAAUC","sequence_length":73,"rna_type":"misc_RNA","description":"None misc RNA","url":"https:\/\/rnacentral.org\/rna\/URS000342178E","organism":null,"related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"XIST regulator"} +{"_doc_id":"doc-URS0000123456","type":"rna","content":"Description: rRNA from 1 species\nSequence: CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000123456","rnacentral_id":"URS0000123456","sequence":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","sequence_length":282,"rna_type":"rRNA","description":"rRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000123456","organism":"uncultured Staphylococcus sp.","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} +{"_doc_id":"doc-URS0000000787","type":"rna","content":"Description: lncRNA from 1 species\nSequence: 
AGGGAUCUUCUGCCCUUGGUCCUAAGUGCCACUAUCUGUGCUGAGUUUUUCAAAGGUCAGAGCAGAUUGAACCAUUGUGGUUUCAUUUUCCCUGAUUUUGAUUUUUCUUAUGGGGAACCUGUGUGGCUGCAUUCAAGGUGACUCGAAGAAGCCUUCCAAAAAGCAUGUGAAAAGGAAGCCCUACUCUACUACCAAGGUGACUUCAGGGAGCACAUUCAAUGGUACGUAUUCUGGAAUCACUCACUGGUUGUUAGAAAAGGAUUCUACAGGAAAUCUGGAGCUUAACUGCUGGCUUUUGUCUGGAGAGCCUCCAUGAUCCAAGACAUCUGGUGGGAAUGAGGAUGUAGGGUAUAGUAAAAGAAACUGGUUUUCCUGGUGACAUACUCUUUUUAUCUAUGUAUAGUUUCUGGGAACAUGUUCACAUUAGGUUGUGUGUGGGUAUGUGUGUAUUAGGGCGGGGGUGGGGUGAGGUGGUCUGUGUGCAAGUCUGCAUGAUUUGCUUGUGAAUGUGUGUCUAUGUGUGUUUCCCCUAGGAAAAAAAUGUUGUGUUUACCCAGCACAACUCUCAGUGCCAUU","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000000787","rnacentral_id":"URS0000000787","sequence":"AGGGAUCUUCUGCCCUUGGUCCUAAGUGCCACUAUCUGUGCUGAGUUUUUCAAAGGUCAGAGCAGAUUGAACCAUUGUGGUUUCAUUUUCCCUGAUUUUGAUUUUUCUUAUGGGGAACCUGUGUGGCUGCAUUCAAGGUGACUCGAAGAAGCCUUCCAAAAAGCAUGUGAAAAGGAAGCCCUACUCUACUACCAAGGUGACUUCAGGGAGCACAUUCAAUGGUACGUAUUCUGGAAUCACUCACUGGUUGUUAGAAAAGGAUUCUACAGGAAAUCUGGAGCUUAACUGCUGGCUUUUGUCUGGAGAGCCUCCAUGAUCCAAGACAUCUGGUGGGAAUGAGGAUGUAGGGUAUAGUAAAAGAAACUGGUUUUCCUGGUGACAUACUCUUUUUAUCUAUGUAUAGUUUCUGGGAACAUGUUCACAUUAGGUUGUGUGUGGGUAUGUGUGUAUUAGGGCGGGGGUGGGGUGAGGUGGUCUGUGUGCAAGUCUGCAUGAUUUGCUUGUGAAUGUGUGUCUAUGUGUGUUUCCCCUAGGAAAAAAAUGUUGUGUUUACCCAGCACAACUCUCAGUGCCAUU","sequence_length":576,"rna_type":"lncRNA","description":"lncRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000000787","organism":"Homo sapiens","related_genes":["KB-1183D5.13","lnc-GGT2-26","ENSG00000206142.10","ENSG00000206142.9","NONHSAG033362.2","FAM230H","NONHSAG033362","lnc-GGT2-4","ENSG00000206142","lnc-GGT2-2","HSALNG0134219"],"gene_name":"ENSG00000206142, ENSG00000206142.10, ENSG00000206142.9, FAM230H, HSALNG0134219, KB-1183D5.13, NONHSAG033362, NONHSAG033362.2, lnc-GGT2-2, lnc-GGT2-26, lnc-GGT2-4","so_term":"lincRNA, ncRNA","modifications":null,"_search_query":"URS0000000787"} +{"_doc_id":"doc-URS0000000001","type":"rna","content":"Description: rRNA from 1 species\nSequence: AUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGUAGAGAGAAGCUUGCUUCUCUUGAGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUAAUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGC","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000000001","rnacentral_id":"URS0000000001","sequence":"AUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGUAGAGAGAAGCUUGCUUCUCUUGAGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUAAUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGC","sequence_length":200,"rna_type":"rRNA","description":"rRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000000001","organism":"uncultured bacterium","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"URS0000000001"} diff --git a/resources/examples/txt_demo.txt b/examples/input_examples/txt_demo.txt similarity index 100% rename from resources/examples/txt_demo.txt rename to examples/input_examples/txt_demo.txt diff --git a/examples/input_examples/vqa_demo.json b/examples/input_examples/vqa_demo.json new file mode 100644 index 00000000..d3aed723 --- /dev/null +++ b/examples/input_examples/vqa_demo.json @@ -0,0 +1,93 @@ +[ + { + "type": "text", + "content": "The $4 4 - \\mathbf { k D }$ protein, named harpin, was electroeluted from a preparative SDS-polyacrylamide gel (12). 
At concentrations ${ \\ge } 5 0 0 \\mathbf { n } \\mathbf { M }$ $( \\geq 2 5 ~ | \\mathbf { \\mu } \\mathbf { g } / \\mathbf { m l } )$ , harpin elicited HR in leaves of tobacco (Fig. 2, sectors 6 and " + }, + { + "type": "text", + "content": "Because supernatants from E. amylovora Ea321(pCPP430) or E. coli DH5α (pCPP430) did not elicit HR, we postulated that harpin was not secreted but rather was present in or on the bacteria. Whole bacteria treated with protease failed to elicit HR, whereas bacteria incubated with protease together with $0 . 5 ~ \\mathrm { m M }$ phenylmethylsulfonyl fluoride (PMSF, a protease inhibitor) did (Table 1). Treatment of bacteria with increasing amounts of protease resulted in a decreased ability to elicit HR that correlated with the disappearance of harpin detectable in SDS-polyacrylamide gels (Table 1). After centrifugation of CFEP at $_ { 1 0 5 , 0 0 0 g }$ for 1 hour, most HR-eliciting activity was found in the supernatant. However, when the cell suspension was brought to $3 0 \\mathrm { \\ m M \\ M g C l } _ { 2 }$ ,before sonication, most activity was associated with the sedimented membrane fraction. Gel-permeation chromatography of unheated CFEP also indicated association of the elicitor with a high molecular weight $( > 1 0 ^ { 6 }$ daltons) fraction, probably membrane vesicles (14). Only the membrane fraction of E. amylovora Ea321(pCPP430) reacted with an antiserum raised in response to harpin (15), further supporting the cell-envelope location of harpin (Fig. 4). " + }, + { + "type": "image", + "content":{ + "img_path": "examples/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg", + "image_caption": [ + "Fig. 1. (A) Physical map of the hrp gene cluster of E. amylovora (4, 18, 29), showing restriction sites: B, Bam HI; E, Eco RI; H, Hind II. Gene hrpN, encoding harpin, is contained in the 1.3 kb Hind II fragment indicated by the solid bar. The shaded region (including hrpN) contains that part of the hrp gene cluster in which most transposon insertions, exemplified by K49, a Tn10 mini-kan (30) insertion, abolish the HR and pathogenicity phenotypes. Most " + ] + } + }, + { + "type": "text", + "content": "HR-eliciting activity, harpin was not detected. However, when the protease inhibitor PMSF $( 0 . 5 \\mathrm { \\ m M } )$ was included, the bacteria retained HR-eliciting activity and possessed detectable harpin for more than 2 hours. More protease was required per cell to destroy harpin produced by E. coli $\\mathsf { D H S } \\alpha ( \\mathsf { p C P P } 4 3 0 )$ than by Ea321(pCPP430), suggesting that E. coli $\\mathsf { D H } 5 \\alpha ( \\mathsf { p C P P } 4 3 0 )$ produces more harpin or degrades it more slowly, or both. " + }, + { + "type": "text", + "content": "The ability of bacterial strains to elicit the HR in intact tobacco leaves is related genetically to their ability to elicit a $\\mathbf { K } ^ { + } / \\mathbf { H } ^ { + }$ exchange reaction (XR) in tobacco cell suspension cultures (TCSCs) (16); both reactions require the hrp gene cluster (17). " + }, + { + "type": "image", + "content": { + "img_path": "examples/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg", + "image_caption": [ + "Fig. 2. Tobacco leaf showing responses 24 hours after infitration of sectors (7) with the following preparations: 1,, living E. coli DH5α (pCPP9) $( 1 \\times 1 0 ^ { 8 } / \\mathrm { m l } )$ ; 2, E. 
coli DH5α (pCPP430) $( 1 \\ \\times \\ 1 0 ^ { 8 } / \\mathrm { m l } )$ ; 3, E. coli DH5α (pCPP430K49) $( 1 \\times 1 0 ^ { 8 } / \\mathrm { m } )$ ; 4, E. amylovora Ea321 $( 1 \\times 1 0 ^ { 8 } / \\mathsf { m l } )$ ; 5, Ea321K49, an hrp mutant $( 1 \\times 1 0 ^ { 8 } / \\mathsf { m } )$ , 8, heat-treated CFEP from $\\pmb { \\varepsilon }$ coli ${ \\mathsf { D } } { \\mathsf { H } } { \\mathsf { S } } { \\mathsf { { \\alpha } } } ( { \\mathsf { P } } { \\mathsf { C } } { \\mathsf { P } } { \\mathsf { P } } { \\mathsf { 9 } } )$ ; 9,heat-treated CFEP from E. coli DH5α(pCPP430); 10, heat-treated CFEP from E. coli DH5α(pCPP430K49); 11, heattreated CFEP from $\\boldsymbol { \\varepsilon }$ amylovora Ea321; 12, heat-treated CFEP from Ea321K49; 6, harpin $( 1 . 1 \\mu M )$ from E. coli DH5α(pCPP430) eluted from SDS-polyacrylamide gel; 7, same preparation as 6, but protease treated for 2 hours then heated for io min to inactivate protease; 13, harpin $( 1 \\pmb { \\mu } \\pmb { M } )$ from E. amylovora Ea321 eluted from SDS-polyacrylamide gel; 14, same preparation as 13 but with protease treatment as sample 7. Harpin solutions $< - 0 . 3 \\mu \\mathsf { m }$ do not cause collapse of infitrated tissue; spotty and incomplete collapse is caused by harpin between 0.3 and $0 . 5 ~ { \\mu } \\mathsf { m }$ . " + ] + } + }, + { + "type": "text", + "content": "We tested the ability of harpin to raise the pH of TCSC bathing solution, an indicator of the XR (Fig. 5). Cells of E. amylovora, grown in rich medium and added to TCSCs caused an increase in pH of the bathing solution after 2 to 3 hours. Addition of purified harpin caused an increase in pH within 1 hour. Erwinia amylovora mutant Ea321K49, which did not produce harpin in culture, and strains of E. coli containing mutated hrp gene clusters failed to elicit the XR. " + }, + { + "type": "text", + "content": "Table 1. Protease sensitivity of the HR-eliciting activity of whole cells of E. amylovora Ea321(pCPP430). Cells were grown in LB medium, harvested'by centrifugation, and resuspended in 0.1 volume of $5 m M$ potassium phosphate $( \\mathsf { p H } \\thinspace 6 . 5 )$ containing tetracycline (40 $\\mu { \\sf g } / { \\sf m } 1 )$ . After incubation with protease (Sigma P5147), as indicated, at $\\mathfrak { s } 7 ^ { \\circ } \\mathfrak { C }$ for 5 min, $1 0 0 ~ \\mu !$ of each cell suspension was infiltrated into tobacco leaves. Leaf sector collapse was assayed at 24 hours. At the time of infiltration, portions of protease-treated cell mixtures were iysed, held'in boiling water for 10 min, centrifuged for 10 min at $1 2 . 0 0 0 g .$ and electrophoresed on a $10 \\%$ SDS-polyacrylamide gel to detect harpin. Electrophoresis was done for 2 hours at $1 5 m \\mathsf { A }$ followed by staining with Coomassie blue R-250. Cell-free supernatant, produced from the LB culture, was filter-sterilized and then concentrated with the Centriprep-10 (Amicon, Danvers, Massachusetts). " + }, + { + "type": "table", + "content": { + "img_path": "examples/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg", + "table_caption": [], + "table_footnote": [], + "table_body": "
<table><tr><td>Protease per milliter</td><td>Tissue collapse</td><td>Harpin detected</td></tr><tr><td>0</td><td>+</td><td>+</td></tr><tr><td>5μg</td><td>+</td><td>+</td></tr><tr><td>10μg</td><td>+</td><td>+</td></tr><tr><td>20 μg</td><td>Weak</td><td>+</td></tr><tr><td>40 μg</td><td>-</td><td></td></tr><tr><td>80μg</td><td></td><td></td></tr><tr><td>80μg + 0.5 mM PMSF</td><td>+</td><td>+</td></tr><tr><td>Cell-free supernatant</td><td></td><td></td></tr></table>
" + } + }, + { + "type": "text", + "content": "expressed fom pCPP1084 in the T7RNA (20). Insertions of Tn5tac1 in hrpN (21) (Fig. 1) abolished the ability of E. coli $\\mathsf { D H } 5 \\alpha ( \\mathsf { p C P P } 4 3 0 )$ to elicit HR on tobacco or produce harpin detectable on Western blots. Ea321T5, a derivative of E. amylo" + }, + { + "type": "image", + "content": { + "img_path": "examples/input_examples/images/4abc534d1dea2b706e44aaac26fe2ae309fee014082db00bc2d87187a6bb5dca.jpg", + "image_caption": [ + "Fig. 3. SDS-polyacrylamide gel electrophoresis of CFEPs and purified harpin. Lanes: 1, purified harpin $( 1 . 5 \\ \\mathsf { \\pmb { \\mu } } \\mathsf { \\pmb { \\mathsf { g } } } )$ from E. coli $\\mathsf { D M } 5 \\alpha ( \\mathsf { p C P } 4 3 0 )$ incubated with protease (9) for 1 hour; 2, purified harpin $( 1 . 5 \\mu \\mathfrak { g } )$ from E. amylovora Ea321 incubated with protease for 1 hour; 3, same as 1, but without treatment with protease; 4, same as 2, but without treatment with protease; 5, CFEP (5 ${ \\pmb { \\mu } } ( { \\pmb q } )$ from E. coli DH5α(pCPP9) treated at $1 0 0 ^ { \\circ } \\mathbb { C }$ for 10'min; 6, CFEP $( 5 \\ \\pmb { \\mu } \\pmb { \\mu } )$ from E. coli DH5a(pCPP430K49) treated at $\\pmb { 1 0 0 } \\pmb { \\circ } \\pmb { \\subset }$ for 10 min; 7, CFEP $( 5 ~ \\mu 9 )$ from E. amylovora Ea321 treated " + ] + } + }, + { + "type": "text", + "content": "at $\\pmb { 1 0 0 ^ { \\circ } } \\pmb { \\mathbb { C } }$ for 10 min; 8, CFEP $( 5 ~ \\mu 9 )$ from E. coli DH5a(pCPP430) treated at $1 0 0 ^ { \\circ } \\mathsf { C }$ for 10 min; 9, CFEP $( 5 ~ \\mu 9 )$ from E. amylovora Ea321K49 treated at $_ { 1 0 0 ^ { \\circ } \\mathbb { C } }$ for 10 min. Samples from the preparations in lanes 3, 4, 7, and 8 elicited HR in tobacco leaves. Samples were prepared as described (8) and brought to 125 mM tris-HCI $( \\mathsf { p H } 6 . 8 )$ $4 \\%$ SDS, $20 \\%$ glycerol, boiled for 3 min, then electrophoresed through a $10 \\%$ (w/v) polyacrylamide gel with $0 . 1 \\%$ SDS at $1 5 m A$ for 2 hours in a Mighty Small apparatus according to instructions (Hoefer Scientific Instruments, San Francisco, California). The gel was stained with $0 . 0 2 5 \\%$ Coomassie Blue R-250. Low-range prestained molecular weight standards (Bio-Rad 161-0305) were used and calibrated with an unstained protein marker (Bio-Rad 161-0304). Arrow indicates region corresponding to $4 4 \\ k \\mathsf$ " + }, + { + "type": "image", + "content": { + "img_path": "examples/input_examples/images/390516e39e77030092027ded523ee99e96ffa8b6df4476c9b12d7bb1dd20d635.jpg", + "image_caption": [ + "Fig. 4. Subcellular location of elicitor protein. Logphase cells $( 1 . 5 m )$ of strain Ea321(pCPP430) were fractionated (31). Proteins from each fraction were electrophoresed and transferred to Immobilon-P membrane (Millipore, Bedford, Massachusetts). The Amplified Alkaline Phosphatase Immuno-Blot Assay Kit (170-6412, Bio-Rad Richmond, California) was ", + "used in a Western blot to detect the elicitor protein with an antiserum raised in rabbit in response to harpin (15). (A) Fractions in lanes: 1, periplasm; 2, membrane; 3, whole cells; 4, supernatant; 5, cytoplasm. (B) Harpin purified by high-performance liquid chromatography (19) hybridized with antiserum. Arrows indicates $4 4 \\ k \\mathsf { D }$ based on the molecular weight markers used in Fig. 3. (C) Normal serum control. CFEP from E. coli DH5a(pCPP430) hybridized with pre-immune serum. 
" + ] + } + }, + { + "type": "text", + "content": "DNA sequence data from the $1 . 3 – \\mathbf { k } \\mathbf { b }$ Hind II fragment revealed that hrpN is 1155 base pairs long, and it encodes a 385–amino acid protein (Fig. 1). The 15 $\\mathrm { N H } _ { 2 }$ -terminal residues revealed by amino acid sequencing corresponded to those deduced from the DNA' sequence (Fig. 1). The deduced amino acid sequence of harpin (Fig. 1), which corresponded closely with the analyzed amino acid composition, reveals a glycine-rich protein with a high degree of hydrophilicity. It appears to have an open structure, which may explain its heat stability and sensitivity to proteases. A FASTA search (23) of GenBank for similar proteins revealed similarity only with other glycine-rich proteins, such as several plant cell wall proteins and keratins. " + }, + { + "type": "image", + "content": { + "img_path": "examples/input_examples/images/eda01885ec54011f15e7a4a56bea0129a0475b2ab5b920a4cff20a4fb623517d.jpg", + "image_caption": [ + "Fig. 5. Changes in pH of bathing solution of tobacco cell-suspension cultures (TCSC). Control values (no additive) were subtracted. Open squares, harpin (60 nM); open circles, cells of E. coli $\\mathsf { D H } 5 \\alpha ( \\mathsf { p C P P } 4 3 0 )$ $( 5 ~ \\times ~ 1 0 ^ { 7 }$ cells per milliliter); filled squares, cells of E. amylovora Ea321 $( 5 \\times 1 0 ^ { 7 }$ cells per milliiter); triangles, cells of E. coli DH5α(pCPP430K49) $( 5 \\times 1 0 ^ { 7 }$ cells per milliter); diamonds, cells of $\\boldsymbol { \\varepsilon }$ amylovora Ea321K49 $( 5 ~ \\times ~ 1 0 ^ { 7 }$ cells per milliter); filled circles, cells of $\\boldsymbol { E } .$ coli DH5α(pCPP9) $( 5 \\times$ $\\pmb { 1 0 ^ { 6 } }$ cells per mililiter). TCSCs were shaken at room temperature with the indicated preparations. The pH was measured at the intervals indicated. All preparations that elicited HR in tobacco leaves (Fig. 2) also caused a pH increase in the TCSC medium. " + ] + } + } +] \ No newline at end of file diff --git a/examples/output_examples/aggregated_chatml.json b/examples/output_examples/aggregated_chatml.json new file mode 100644 index 00000000..1561a6d0 --- /dev/null +++ b/examples/output_examples/aggregated_chatml.json @@ -0,0 +1,98 @@ +[ + { + "messages": [ + { + "role": "user", + "content": "What laboratory technique was used to study the localization of TAC4 in NB epidermal cells?" + }, + { + "role": "assistant", + "content": "The transient assay, a laboratory technique utilized to introduce genetic material into cells for a limited duration, was conducted in NB epidermal cells. This approach aimed to investigate the localization of TAC4 within these cells." + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "What was the focus of the study regarding TAC4-RFP fusion proteins, and how did researchers visualize their localization?" + }, + { + "role": "assistant", + "content": "The study focused on the localization of TAC4-RFP fusion proteins, which were primarily found in the cell wall, the rigid outer layer of plant cells that provides structural support and protection. To visualize the localization of these proteins, RFP signals, emitted by the Red Fluorescent Protein, were employed. This use of RFP signals allowed researchers to effectively track the presence of TAC4-RFP fusion proteins within the cell wall, highlighting their functional role in that specific area." + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "云粳26号是什么品种,它的育成背景和适应性如何?" 
+ }, + { + "role": "assistant", + "content": "云粳26号是一种由云南省农业科学院粮食作物研究所于2005年育成的早熟稻品种。该品种不仅具有良好的外观,还在食味品质上表现出色,因而其食用价值较高,吸引了众多消费者的关注。\n\n首先,云粳26号适宜在云南中海拔稻区种植,这表明它对该环境具有良好的适应性。稻区是指适合水稻种植的区域,而云粳26号的成功种植进一步体现了云南省农业科学院在水稻研究和品种选育方面的重要作用。因此,云粳26号的推广不仅有助于提升当地的农业生产水平,也为消费者提供了高品质的稻米选择。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "云粳26号在西南稻区的农业推广中扮演了什么样的角色?" + }, + { + "role": "assistant", + "content": "西南稻区是一个重要的农业区域,其中云粳26号被列为农业推广的主导品种。2012年,这一品种正式被农业部认定为西南稻区的主导品种,这一决定突显了云粳26号在该区域的重要性。主导品种是指在特定区域内被广泛推广和种植的农作物品种,因此,云粳26号作为西南稻区的主导品种,反映了其良好的适应性以及市场的需求。这一系列的推广和认定,表明了云粳26号在西南稻区农业发展中的关键角色。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "What is the role of the TAC4 gene in rice plant architecture and how does its function impact the tiller angle and overall growth?" + }, + { + "role": "assistant", + "content": "The loss of TAC4 function results in a notable increase in the tiller angle of rice plants, which significantly impacts their overall architecture. TAC4 is a plant-specific gene that plays a crucial role in regulating rice shoot gravitropism, a vital process for proper plant orientation and growth. Sequence analysis of TAC4 has revealed that it underwent a genetic bottleneck during the domestication of indica cultivars, leading to a reduction in its genetic diversity.\n\nFurthermore, TAC4 influences the content of indole acetic acid, a type of auxin that is essential for regulating various aspects of plant growth and development, including gravitropism. The function of TAC4 is critical for the regulation of tiller angle, as it encodes a nuclear protein that is integral to this process. By affecting the distribution of auxin, TAC4 contributes to the regulatory mechanisms that govern plant architecture.\n\nResearch on TAC4 not only enhances our understanding of these regulatory mechanisms but also provides insights that could facilitate the improvement of rice plant architecture. The highly conserved nature of the TAC4 gene underscores its importance across different plant species. Additionally, the BG1 gene, which is influenced by auxin, plays a role in gravitropism and subsequently affects the growth and orientation of rice grains. Ultimately, the tiller angle is a key factor that directly influences the grain yield of rice, highlighting the interconnectedness of these genetic and physiological processes." + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "云粳26号是什么品种,它的主要特点和适应性如何?" 
+ }, + { + "role": "assistant", + "content": "云粳26号是一种早熟稻品种,由云南省农业科学院粮食作物研究所于2005年育成。这一品种的研发标志着其在水稻研究领域的重要起始时间。云粳26号具有多项外观特点,其中包括颖尖无色、无芒和谷壳黄色,这些特征不仅体现了其独特性,还直接影响了其市场表现。\n\n此外,云粳26号的米粒较大,食味品质优良,并且具有香味,这些特性显著提升了其食用价值和市场吸引力。值得一提的是,云粳26号还具备高抗稻瘟病的特性,使其在稻区种植时能够有效减少病害损失,适合在云南中海拔1,500至1,800米的稻区种植。\n\n在2012年,云粳26号被农业部列为西南稻区农业推广的主导品种,这一认可进一步强调了其在农业政策中的重要性。云粳26号的落粒性适中,适应性强,能够在中海拔地区良好生长,反映了其对特定气候和土壤条件的适应能力。因此,云粳26号不仅是一个优质的早熟品种,也是云南省在水稻研究和推广方面的重要成果。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "隆两优1212水稻品种的引入和试种情况如何?请详细描述其生长特性、产量以及推广情况。" + }, + { + "role": "assistant", + "content": "隆两优1212是一种水稻品种,于2017年在福建省龙岩市长汀县引入进行试种。该品种在圣丰家庭农场的河田镇南塘村进行试种,表现出优良的生长特性和高产量。隆两优1212的成熟时间为10月21日,标志着可以收获。其产量达到8.78 t/hm^2,显示出其高产特性。\n\n隆两优1212具有多项优良特性,包括抗性好、后期转色好、分蘖力强、适应性好、生育期适中、抽穗整齐等。这些特性使其在种植过程中能够有效抵抗病虫害,减少损失,并在市场上更具吸引力。此外,隆两优1212的适应性好使其能够在不同的生长环境中良好生长,而生育期适中则使其适合当地的气候条件。\n\n在种植过程中,隆两优1212采用了机插和机播技术,这不仅提高了插秧的质量,也提升了种植效率。隆两优1212作为烟后稻进行种植,符合当地的种植习惯。2018年和2019年,隆两优1212的示范面积分别扩大至4.00 hm^2和13.50 hm^2,显示出其推广潜力。在2018年,隆两优1212在润丰优质稻专业合作社进行示范种植,而在2019年则在绿丰优质稻专业合作社进行示范种植,进一步推广该品种。\n\n总的来说,隆两优1212在长汀县的试种和示范种植中,凭借其优良的生长特性和高产量,展现了良好的市场前景。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "What is the role of the BIG GRAIN1 (BG1) gene in rice, and how does its manipulation affect grain development and yield?" + }, + { + "role": "assistant", + "content": "The gene known as BIG GRAIN1 (BG1) plays a crucial role in regulating auxin transport and response, acting as a positive regulator that enhances the plant's ability to respond to this important hormone. BG1 is preferentially expressed in vascular tissue, which is essential for nutrient transport and grain development. In rice (Oryza sativa L.), manipulation of the BG1 gene can lead to significant improvements in plant biomass, seed weight, and overall yield.\n\nSpecifically, the expression of BG1 is linked to increased seed weight, highlighting its importance in grain development. The dominant mutant known as BIG GRAIN1 (BG1-D) is associated with the BG1 gene and is characterized by an extra-large grain phenotype, which affects both grain size and gravitropism. Furthermore, the regulation of BG1 is critical for enhancing yield, making it a key target for genetic improvements aimed at boosting crop productivity.\n\nWhen BG1 expression is reduced through a genetic technique known as knockdown, the result is smaller grains and decreased sensitivities to auxin, underscoring BG1's vital role in regulating grain size. Additionally, the sensitivity of the BG1-D mutant to the auxin transport inhibitor N-1-naphthylphthalamic acid indicates a significant connection between BG1 and auxin transport mechanisms.\n\nOverexpression of BG1 has been shown to lead to larger grain sizes, further demonstrating its influence on grain development. The expression of BG1 in the culms, or stems, of the rice plant suggests its involvement in overall growth and structural integrity. Moreover, BG1 is integral to the regulation of auxin transport, which is essential for proper plant growth and the determination of grain size.\n\nThe BG1 gene also affects the observable characteristics of grain phenotype, particularly in terms of size and response to environmental signals. Its expression in young panicles, the flowering parts of the rice plant, indicates its critical role during the early stages of grain development. 
Additionally, BG1 influences the plant's sensitivities to auxin and auxin transport inhibitors.\n\nResearch on BG1 has been conducted in both rice and the model plant Arabidopsis, emphasizing its importance in plant development. By manipulating BG1, researchers can achieve increased plant biomass, further illustrating its significance in overall plant growth. The big grain1 (BG1-D) mutant, identified from a T-DNA insertion population, exemplifies the application of genetic tools in plant research and showcases the potential for altering grain size and gravitropism in rice." + } + ] + } +] \ No newline at end of file diff --git a/examples/output_examples/atomic_alpaca.json b/examples/output_examples/atomic_alpaca.json new file mode 100644 index 00000000..de4bca21 --- /dev/null +++ b/examples/output_examples/atomic_alpaca.json @@ -0,0 +1,52 @@ +[ + { + "instruction": "What do regulatory mechanisms control in plants?", + "input": "", + "output": "Regulatory mechanisms control the development and function of plant traits, such as tiller angle." + }, + { + "instruction": "What is the primary focus of genetic studies on rice?", + "input": "", + "output": "The primary focus of genetic studies on rice is to improve grain yield and size." + }, + { + "instruction": "What is the function of the cell wall in plant cells?", + "input": "", + "output": "The cell wall provides structural support and protection to plant cells." + }, + { + "instruction": "What is the result of overexpressing a gene like BG1?", + "input": "", + "output": "Overexpressing a gene like BG1 leads to enhanced traits such as larger grain size." + }, + { + "instruction": "云粳26号是由哪个机构育成的?", + "input": "", + "output": "云粳26号是由云南省农业科学院育成的。" + }, + { + "instruction": "农业推广的定义是什么?", + "input": "", + "output": "农业推广是指通过政策和技术手段,促进优质农作物的种植和应用。" + }, + { + "instruction": "What is the purpose of the Red Fluorescent Protein (RFP) in cellular studies?", + "input": "", + "output": "The Red Fluorescent Protein (RFP) is used to visualize protein localization in cells by emitting fluorescent signals." + }, + { + "instruction": "隆两优1212的产量是多少?", + "input": "", + "output": "隆两优1212的产量为8.78 t/hm^2。" + }, + { + "instruction": "云粳26号适宜种植的海拔范围是多少?", + "input": "", + "output": "云粳26号适宜种植的海拔范围是1 500∼1 800 m。" + }, + { + "instruction": "云粳26号的食味品质中有哪些特点?", + "input": "", + "output": "有香味是云粳26号的食味品质之一,增加了其市场吸引力。" + } +] \ No newline at end of file diff --git a/examples/output_examples/cot_sharegpt.json b/examples/output_examples/cot_sharegpt.json new file mode 100644 index 00000000..1ae8cd95 --- /dev/null +++ b/examples/output_examples/cot_sharegpt.json @@ -0,0 +1,122 @@ +[ + { + "conversations": [ + { + "from": "human", + "value": "What is the localization of TAC4-RFP fusion proteins in relation to the cell wall?" + }, + { + "from": "gpt", + "value": "The entity \"TAC4-RFP FUSION PROTEINS\" is identified as the subject of interest regarding localization. The relationship that indicates the localization of \"TAC4-RFP FUSION PROTEINS\" to \"CELL-WALL\" is established, showing that these proteins are primarily found in that area. This localization suggests a functional role for \"TAC4-RFP FUSION PROTEINS\" within the \"CELL-WALL\". Therefore, the findings indicate that \"TAC4-RFP FUSION PROTEINS\" are localized to the cell wall, highlighting their importance in that specific cellular context." + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "云粳26号在西南稻区被列为主导品种的年份是什么?" 
+ }, + { + "from": "gpt", + "value": "云粳26号是一个在西南稻区被广泛推广的农作物品种。为了找到云粳26号在西南稻区被列为主导品种的年份,首先需要确认云粳26号与西南稻区的关系。根据信息,云粳26号在西南稻区被列为农业推广主导品种。接下来,查找与西南稻区相关的年份信息。根据提供的数据,2012年是云粳26号被列为西南稻区农业推广主导品种的年份。最后,验证2012年与云粳26号的关系,确认2012年确实是云粳26号在西南稻区被列为主导品种的年份。因此,答案是2012年。" + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "What is the purpose of performing a transient assay in NB epidermal cells?" + }, + { + "from": "gpt", + "value": "The entity \"TRANSIENT ASSAY\" is identified, which is a laboratory technique used to introduce genetic material into cells for a short period to study gene expression and protein function. The entity \"NB EPIDERMAL CELLS\" is also identified, which indicates that the transient assay was performed in these cells to study the localization of TAC4. The relationship between \"TRANSIENT ASSAY\" and \"NB EPIDERMAL CELLS\" reveals that the purpose of the assay is to investigate the localization of TAC4 within the NB epidermal cells. The specific purpose extracted from this relationship is to study how TAC4 is localized in these cells. Therefore, the answer is that the purpose of performing a transient assay in NB epidermal cells is to study the localization of TAC4." + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "How does the expression of the BG1 gene influence the grain size and development in rice plants?" + }, + { + "from": "gpt", + "value": "The BG1 gene plays a crucial role in regulating grain size in rice plants. It is involved in the process of auxin transport, which is essential for proper plant growth and development. The expression of BG1 is linked to increased seed weight, indicating that it directly influences the mass of grains produced. Additionally, BG1 affects the sensitivities of the plant to auxin and auxin transport inhibitors, suggesting that its expression levels can alter how rice plants respond to hormonal signals that regulate growth.\n\nIn young panicles, BG1 is preferentially expressed, highlighting its critical role during the early stages of grain development. This expression is vital for ensuring that the grains develop properly and achieve optimal size. Furthermore, overexpression of BG1 has been shown to lead to increased grain size, demonstrating its significant impact on grain development.\n\nFinally, the BG1 gene influences the grain phenotype, particularly in terms of size and how the grains respond to environmental signals. This multifaceted role of BG1 underscores its importance in enhancing grain size and overall yield in rice plants." + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "How does the tiller angle influence the grain yield of rice?" + }, + { + "from": "gpt", + "value": "Recognize entities: \"GRAIN YIELD\" refers to the amount of rice produced per unit area, while \"TILLER ANGLE\" is an important aspect of plant architecture that significantly affects this yield. Identify relationships: The relationship indicates that the tiller angle is a critical factor that directly affects the grain yield of rice. Associate \"GRAIN YIELD\" with its description, noting that it is influenced by various factors, and \"TILLER ANGLE\" with its description, emphasizing its role in plant architecture. Establish the connection between \"TILLER ANGLE\" and \"GRAIN YIELD\" based on the identified relationship, which highlights the direct impact of the tiller angle on the yield. 
Formulate the question to explore how the tiller angle influences the grain yield of rice, leading to an understanding of the significance of the tiller angle in maximizing rice production." + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "What is the relationship between the big grain1 (Bg1-D) mutant and its identification from the T-DNA insertion population in rice?" + }, + { + "from": "gpt", + "value": "The entity \"BIG GRAIN1 (BG1-D)\" is a dominant mutant in rice that exhibits an extra-large grain phenotype. This mutant is significant in the study of rice genetics due to its impact on grain size and gravitropism. The entity \"T-DNA INSERTION POPULATION\" is a genetic tool used in plant research to create mutations, including the big grain1 (Bg1-D) mutant. The relationship between \"BIG GRAIN1 (BG1-D)\" and \"T-DNA INSERTION POPULATION\" indicates that the Bg1-D mutant was specifically identified from this population, showcasing the application of genetic tools in plant research. Additionally, the connection between \"BIG GRAIN1 (BG1-D)\" and \"RICE\" highlights that this mutant variant is part of the rice species, further contextualizing its relevance in genetic studies aimed at improving rice yield and size. The question regarding the relationship between the big grain1 (Bg1-D) mutant and its identification from the T-DNA insertion population can be answered by confirming that the Bg1-D mutant was indeed identified through the use of the T-DNA insertion population in rice genetics research." + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "What role does TAC4 play in the regulation of tiller angle and how does its function relate to the improvement of rice plant architecture?" + }, + { + "from": "gpt", + "value": "TAC4 is a gene that plays a crucial role in the regulation of tiller angle in rice plants. It encodes a nuclear protein essential for this regulation, indicating its importance in determining plant architecture. The relationships involving TAC4 reveal that it significantly influences tiller angle regulation, which is critical for optimizing plant structure and yield. \n\nResearch on TAC4 also highlights its potential in improving rice plant architecture. By understanding how TAC4 functions, insights can be gained that may lead to enhancements in the structural traits of rice plants, ultimately increasing their adaptability and yield. \n\nFurthermore, TAC4 affects the distribution of auxin, a plant hormone that regulates various aspects of growth and development, including gravitropism. Specifically, TAC4 regulates rice shoot gravitropism by increasing the content of indole acetic acid, a type of auxin. This connection underscores the role of TAC4 in facilitating proper plant orientation and growth.\n\nIn summary, TAC4 is integral to the regulation of tiller angle, which directly impacts rice plant architecture. Its function not only influences the plant's structural traits but also provides a pathway for potential improvements in rice cultivation practices." + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "哪些农作物适合在中海拔地区种植?" + }, + { + "from": "gpt", + "value": "中海拔地区的特征主要是气候和土壤条件适合特定农作物的生长。接下来,需要查找适宜种植区域与中海拔的关系,以确定哪些农作物能够在此生长。通过识别适合在中海拔种植的农作物品种,可以进一步了解这些作物的特性。分析这些农作物的外观特点,有助于了解它们在市场上的接受度。最后,确定早熟品种在中海拔地区的适应性,可以为农民提供更好的种植选择。" + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "隆两优1212在长汀县的试种表现出哪些优良特性?" 
+ }, + { + "from": "gpt", + "value": "隆两优1212在长汀县进行试种,表现出优良的生长特性。首先,隆两优1212的分蘖力强,使其能够产生较多的分蘖,适合在长汀县推广种植。其次,隆两优1212的抗性好,能够有效抵抗病虫害,减少种植过程中的损失。此外,隆两优1212的生育期适中,符合当地的气候条件,促进了其健康生长。最后,隆两优1212的抽穗整齐和后期转色好,进一步提高了其市场竞争力和吸引力。因此,隆两优1212在长汀县的试种表现出分蘖力强、抗性好、生育期适中、抽穗整齐和后期转色好的优良特性。" + } + ] + }, + { + "conversations": [ + { + "from": "human", + "value": "云粳26号的外观特点有哪些?" + }, + { + "from": "gpt", + "value": "云粳26号的外观特点有哪些? \n首先,我需要查询云粳26号的外观特点。根据信息,云粳26号的外观特点包括“颖尖无色、无芒”和“谷壳黄色”。接下来,我验证“颖尖无色、无芒”是否为云粳26号的外观特点。确实,颖尖无色、无芒是云粳26号的外观特点之一,体现了其独特性。然后,我再验证“谷壳黄色”是否为云粳26号的外观特点。谷壳黄色也是云粳26号的外观特征,影响其市场表现。因此,云粳26号的外观特点包括颖尖无色、无芒和谷壳黄色。" + } + ] + } +] \ No newline at end of file diff --git a/examples/output_examples/multi-hop_chatml.json b/examples/output_examples/multi-hop_chatml.json new file mode 100644 index 00000000..9e651ae3 --- /dev/null +++ b/examples/output_examples/multi-hop_chatml.json @@ -0,0 +1,122 @@ +[ + { + "messages": [ + { + "role": "user", + "content": "What architectural feature of rice plants is crucial for maximizing the amount of rice produced per unit area?" + }, + { + "role": "assistant", + "content": "Tiller angle" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "How does the manipulation of the BG1 gene in Arabidopsis affect the overall growth of the plant, particularly in terms of biomass?" + }, + { + "role": "assistant", + "content": "It leads to increased plant biomass, indicating its importance in overall plant growth." + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "隆两优1212在长汀县表现出哪些特性,使其适合在绿丰优质稻专业合作社进行示范种植?" + }, + { + "role": "assistant", + "content": "分蘖力强和优良的生长特性。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "In which part of the plant cells were TAC4-RFP fusion proteins primarily localized, and what method was used to visualize their location?" + }, + { + "role": "assistant", + "content": "The TAC4-RFP fusion proteins were primarily localized to the cell wall, and RFP signals were used to visualize their location." + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "云粳26号的外观特点如何影响其市场接受度和种植选择?" + }, + { + "role": "assistant", + "content": "云粳26号的外观特点包括颖尖无色、无芒,这些特征影响其市场接受度和种植选择。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "云粳26号在2012年被列为西南稻区的农业推广主导品种,这一决定反映了什么特征?" + }, + { + "role": "assistant", + "content": "云粳26号的适应性和市场需求。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "What genetic factor in rice is responsible for the extra-large grain phenotype observed in the big grain1 (Bg1-D) mutant, and how does it influence grain size?" + }, + { + "role": "assistant", + "content": "The BG1 gene is responsible for the extra-large grain phenotype in the big grain1 (Bg1-D) mutant, as it regulates grain size and is involved in auxin transport." + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "云粳26号的高食用价值是由哪个机构的研究成果所体现的?" + }, + { + "role": "assistant", + "content": "云南省农业科学院" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "隆两优1212在种植过程中采用了哪些技术,这些技术如何影响其市场竞争力?" + }, + { + "role": "assistant", + "content": "隆两优1212在种植过程中采用了机插和机播技术,这些技术确保了插秧的质量和提高了种植效率,从而提升了其抽穗整齐特性,进而提高了市场竞争力。" + } + ] + }, + { + "messages": [ + { + "role": "user", + "content": "How does the overexpression of the BG1 gene influence the growth of rice plants, particularly in relation to grain size and auxin transport?" 
+ }, + { + "role": "assistant", + "content": "The overexpression of the BG1 gene leads to increased grain size by enhancing auxin transport, which is crucial for regulating growth and development in rice plants. Additionally, BG1 expression in the culms indicates its role in the overall growth and structural integrity of the plant." + } + ] + } +] \ No newline at end of file diff --git a/examples/search/build_db/build_dna_blast_db.sh b/examples/search/build_db/build_dna_blast_db.sh new file mode 100755 index 00000000..f06bf11b --- /dev/null +++ b/examples/search/build_db/build_dna_blast_db.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +set -e + +# Downloads NCBI RefSeq nucleotide sequences and creates BLAST databases. +# +# RefSeq 目录结构说明(按生物分类组织): +# - vertebrate_mammalian (哺乳动物) +# - vertebrate_other (其他脊椎动物) +# - bacteria (细菌) +# - archaea (古菌) +# - fungi (真菌) +# - invertebrate (无脊椎动物) +# - plant (植物) +# - viral (病毒) +# - protozoa (原生动物) +# - mitochondrion (线粒体) +# - plastid (质体) +# - plasmid (质粒) +# - other (其他) +# - complete/ (完整基因组,包含所有分类) +# +# 每个分类目录下包含: +# - {category}.{number}.genomic.fna.gz (基因组序列) +# - {category}.{number}.rna.fna.gz (RNA序列) +# +# Usage: ./build_dna_blast_db.sh [human_mouse_drosophila_yeast|representative|complete|all] +# human_mouse_drosophila_yeast: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae sequences (minimal, smallest) +# representative: Download genomic sequences from major categories (recommended, smaller) +# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi +# complete: Download all complete genomic sequences from complete/ directory (very large) +# all: Download all genomic sequences from all categories (very large) +# +# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +DOWNLOAD_TYPE=${1:-human_mouse_drosophila_yeast} + +# Better to use a stable DOWNLOAD_TMP name to support resuming downloads +DOWNLOAD_TMP=_downloading_dna +mkdir -p ${DOWNLOAD_TMP} +cd ${DOWNLOAD_TMP} + +# Download RefSeq release information +echo "Downloading RefSeq release information..." 
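+# RELEASE_NUMBER is a small text file on the NCBI FTP site holding the current
+# RefSeq release number; it is only used to label the output directory and the
+# BLAST database title, so the date fallback below is safe if the download fails.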
+wget -c "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER" || { + echo "Warning: Could not download RELEASE_NUMBER, using current date as release identifier" + RELEASE=$(date +%Y%m%d) +} + +if [ -f "RELEASE_NUMBER" ]; then + RELEASE=$(cat RELEASE_NUMBER | tr -d '\n') + echo "RefSeq release: ${RELEASE}" +else + RELEASE=$(date +%Y%m%d) + echo "Using date as release identifier: ${RELEASE}" +fi + +# Function to check if a file is already downloaded and complete +check_file_downloaded() { + local filename=$1 + # Since we're already in ${DOWNLOAD_TMP} directory, use filename directly + local local_file="${filename}" + + # Check if compressed file (.fna.gz) exists + if [ -f "${local_file}" ] && [ -s "${local_file}" ]; then + # Try to verify it's a valid gzip file by attempting to decompress a small portion + if gunzip -t "${local_file}" 2>/dev/null; then + return 0 # Compressed file exists and is valid + else + # File exists but is corrupted, remove it + rm -f "${local_file}" + return 1 + fi + fi + + # Check if decompressed file (.fna) exists (may have been extracted previously) + local decompressed_file="${local_file%.gz}" + if [ -f "${decompressed_file}" ] && [ -s "${decompressed_file}" ]; then + # Check if it's a valid FASTA file (starts with >) + if head -1 "${decompressed_file}" 2>/dev/null | grep -q "^>"; then + return 0 # Decompressed file exists and appears valid + else + # File exists but doesn't look like a valid FASTA file + return 1 + fi + fi + + return 1 # Neither compressed nor decompressed file exists +} + +# Function to check if a file contains target species +check_file_for_species() { + local url=$1 + local filename=$2 + local temp_file="/tmp/check_${filename//\//_}" + + # First check if file is already downloaded locally + if check_file_downloaded "${filename}"; then + # File already exists, check if it contains target species + # Check both compressed and decompressed versions + local decompressed_file="${filename%.gz}" + if [ -f "${filename}" ]; then + # Compressed file exists + if gunzip -c "${filename}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then + return 0 # Contains target species + else + return 1 # Does not contain target species + fi + elif [ -f "${decompressed_file}" ]; then + # Decompressed file exists + if head -2000 "${decompressed_file}" 2>/dev/null | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then + return 0 # Contains target species + else + return 1 # Does not contain target species + fi + fi + fi + + # File not downloaded yet, download first 500KB to check + # Download first 500KB (enough to get many sequence headers) + # This should be sufficient to identify the species in most cases + if curl -s --max-time 30 --range 0-512000 "${url}" -o "${temp_file}" 2>/dev/null && [ -s "${temp_file}" ]; then + # Try to decompress and check for species names + # Check for: Homo sapiens (人), Mus musculus (小鼠), Drosophila melanogaster (果蝇), Saccharomyces cerevisiae (酵母) + if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then + rm -f "${temp_file}" + return 0 # Contains target species + else + rm -f "${temp_file}" + return 1 # Does not contain target species + fi + else + # If partial download fails, skip this file (don't download it) + rm -f "${temp_file}" + return 1 + fi +} + +# Download based on type +case ${DOWNLOAD_TYPE} in + 
human_mouse_drosophila_yeast) + echo "Downloading RefSeq sequences for Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal size)..." + echo "This will check each file to see if it contains target species sequences..." + + # Check multiple categories: vertebrate_mammalian (人、小鼠), invertebrate (果蝇), fungi (酵母) + categories="vertebrate_mammalian invertebrate fungi" + total_file_count=0 + total_download_count=0 + + for category in ${categories}; do + echo "Checking files in ${category} category..." + + # Get list of files and save to temp file to avoid subshell issues + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt + + file_count=0 + download_count=0 + + while read filename; do + file_count=$((file_count + 1)) + total_file_count=$((total_file_count + 1)) + url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" + echo -n "[${total_file_count}] Checking ${category}/${filename}... " + + if check_file_for_species "${url}" "${filename}"; then + # Check if file is already downloaded + if check_file_downloaded "${filename}"; then + echo "✓ already downloaded (contains target species)" + download_count=$((download_count + 1)) + total_download_count=$((total_download_count + 1)) + else + echo "✓ contains target species, downloading..." + download_count=$((download_count + 1)) + total_download_count=$((total_download_count + 1)) + wget -c -q --show-progress "${url}" || { + echo "Warning: Failed to download ${filename}" + } + fi + else + echo "✗ skipping (no target species data)" + fi + done < /tmp/refseq_files_${category}.txt + + rm -f /tmp/refseq_files_${category}.txt + echo " ${category}: Checked ${file_count} files, downloaded ${download_count} files." + done + + echo "" + echo "Summary: Checked ${total_file_count} files total, downloaded ${total_download_count} files containing target species (human, mouse, fruit fly, yeast)." + ;; + representative) + echo "Downloading RefSeq representative sequences (recommended, smaller size)..." + # Download major categories for representative coverage + # Note: You can modify this list based on your specific requirements + for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi; do + echo "Downloading ${category} sequences..." + # Get list of files and save to temp file to avoid subshell issues + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + fi + done < /tmp/refseq_files_${category}.txt + + rm -f /tmp/refseq_files_${category}.txt + done + ;; + complete) + echo "Downloading RefSeq complete genomic sequences (WARNING: very large, may take hours)..." 
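+        # The complete/ directory already aggregates genomic sequences from every
+        # taxonomic category, so a single file listing is enough here (no per-category loop).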
+ # Get list of files and save to temp file to avoid subshell issues + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_complete.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + fi + done < /tmp/refseq_files_complete.txt + + rm -f /tmp/refseq_files_complete.txt + ;; + all) + echo "Downloading all RefSeq genomic sequences from all categories (WARNING: extremely large, may take many hours)..." + # Download genomic sequences from all categories + for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do + echo "Downloading ${category} genomic sequences..." + # Get list of files and save to temp file to avoid subshell issues + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + fi + done < /tmp/refseq_files_${category}.txt + + rm -f /tmp/refseq_files_${category}.txt + done + ;; + *) + echo "Error: Unknown download type '${DOWNLOAD_TYPE}'" + echo "Usage: $0 [human_mouse_drosophila_yeast|representative|complete|all]" + echo " human_mouse_drosophila_yeast: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal)" + echo " representative: Download major categories (recommended)" + echo " complete: Download all complete genomic sequences (very large)" + echo " all: Download all genomic sequences (extremely large)" + echo "Note: For RNA sequences, use build_rna_blast_db.sh instead" + exit 1 + ;; +esac + +cd .. + +# Create release directory +mkdir -p refseq_${RELEASE} +mv ${DOWNLOAD_TMP}/* refseq_${RELEASE}/ 2>/dev/null || true +rmdir ${DOWNLOAD_TMP} 2>/dev/null || true + +cd refseq_${RELEASE} + +# Extract and combine sequences +echo "Extracting and combining sequences..." + +# Extract all downloaded genomic sequences +if [ $(find . -name "*.genomic.fna.gz" -type f | wc -l) -gt 0 ]; then + echo "Extracting genomic sequences..." + find . -name "*.genomic.fna.gz" -type f -exec gunzip {} \; +fi + +# Combine all FASTA files into one +echo "Combining all FASTA files..." +FASTA_FILES=$(find . -name "*.fna" -type f) +if [ -z "$FASTA_FILES" ]; then + FASTA_FILES=$(find . -name "*.fa" -type f) +fi + +if [ -z "$FASTA_FILES" ]; then + echo "Error: No FASTA files found to combine" + exit 1 +fi + +echo "$FASTA_FILES" | while read -r file; do + if [ -f "$file" ]; then + cat "$file" >> refseq_${RELEASE}.fasta + fi +done + +# Check if we have sequences +if [ ! -s "refseq_${RELEASE}.fasta" ]; then + echo "Error: Combined FASTA file is empty" + exit 1 +fi + +echo "Creating BLAST database..." 
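+# Once built, the database can be queried with blastn, for example:
+#   blastn -db refseq_${RELEASE} -query my_query.fna -outfmt 6 -max_target_seqs 5
+# (my_query.fna is a placeholder for your own nucleotide FASTA file.)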
+# Create BLAST database for DNA sequences (use -dbtype nucl for nucleotide) +makeblastdb -in refseq_${RELEASE}.fasta \ + -out refseq_${RELEASE} \ + -dbtype nucl \ + -parse_seqids \ + -title "RefSeq_${RELEASE}" + +echo "BLAST database created successfully!" +echo "Database location: $(pwd)/refseq_${RELEASE}" +echo "" +echo "To use this database, set in your config:" +echo " local_blast_db: $(pwd)/refseq_${RELEASE}" +echo "" +echo "Note: The database files are:" +ls -lh refseq_${RELEASE}.* + +cd .. + diff --git a/examples/search/build_db/build_protein_blast_db.sh b/examples/search/build_db/build_protein_blast_db.sh new file mode 100755 index 00000000..a9169959 --- /dev/null +++ b/examples/search/build_db/build_protein_blast_db.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +set -e + +# Downloads the latest release of UniProt, putting it in a release-specific directory. +# Creates associated BLAST databases. +# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +# NOTE: UniProt mirror +# Available mirrors: +# - UK/EBI: ftp://ftp.ebi.ac.uk/pub/databases/uniprot (current, recommended) +# - US: ftp://ftp.uniprot.org/pub/databases/uniprot +# - CH: ftp://ftp.expasy.org/databases/uniprot +UNIPROT_BASE="ftp://ftp.ebi.ac.uk/pub/databases/uniprot" + +# Parse command line arguments +DOWNLOAD_MODE="sprot" # sprot (Swiss-Prot) or full (sprot + trembl) + +usage() { + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " -s, --sprot-only Download only Swiss-Prot database (recommended, high quality)" + echo " -f, --full Download full release (Swiss-Prot + TrEMBL, merged as uniprot_\${RELEASE})" + echo " -h, --help Show this help message" + echo "" + echo "Examples:" + echo " $0 --sprot-only # Download only uniprot_sprot" + echo " $0 --full # Download uniprot_\${RELEASE} (Swiss-Prot + TrEMBL)" +} + +while [[ $# -gt 0 ]]; do + case $1 in + -s|--sprot-only) + DOWNLOAD_MODE="sprot" + shift + ;; + -f|--full) + DOWNLOAD_MODE="full" + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" + usage + exit 1 + ;; + esac +done + +echo "Download mode: ${DOWNLOAD_MODE}" +if [ "${DOWNLOAD_MODE}" = "sprot" ]; then + echo " - Will download: uniprot_sprot only" +else + echo " - Will download: uniprot_\${RELEASE} (Swiss-Prot + TrEMBL merged)" +fi +echo "Using mirror: ${UNIPROT_BASE} (EBI/UK - fast for Asia/Europe)" +echo "" + +# Better to use a stable DOWNLOAD_TMP name to support resuming downloads +DOWNLOAD_TMP=_downloading +mkdir -p ${DOWNLOAD_TMP} +cd ${DOWNLOAD_TMP} + +echo "Downloading RELEASE.metalink..." +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/RELEASE.metalink" + +# Extract the release name (like 2017_10 or 2017_1) +# Use sed for cross-platform compatibility (works on both macOS and Linux) +RELEASE=$(sed -n 's/.*\([0-9]\{4\}_[0-9]\{1,2\}\)<\/version>.*/\1/p' RELEASE.metalink | head -1) + +echo "UniProt release: ${RELEASE}" +echo "" + +# Download Swiss-Prot (always needed) +echo "Downloading uniprot_sprot.fasta.gz..." +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" + +# Download TrEMBL only if full mode +if [ "${DOWNLOAD_MODE}" = "full" ]; then + echo "Downloading uniprot_trembl.fasta.gz..." + wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" +fi + +# Download metadata files +echo "Downloading metadata files..." 
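+# reldate.txt records the release date; README and LICENSE describe the contents
+# and terms of use. They are kept alongside the FASTA files for provenance.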
+wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/reldate.txt" +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/README" +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/LICENSE" + +cd .. + +mkdir -p ${RELEASE} +mv ${DOWNLOAD_TMP}/* ${RELEASE} +rmdir ${DOWNLOAD_TMP} + +cd ${RELEASE} + +echo "" +echo "Extracting files..." +gunzip uniprot_sprot.fasta.gz + +if [ "${DOWNLOAD_MODE}" = "full" ]; then + gunzip uniprot_trembl.fasta.gz + echo "Merging Swiss-Prot and TrEMBL..." + cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta +fi + +echo "" +echo "Building BLAST databases..." + +# Always build Swiss-Prot database +makeblastdb -in uniprot_sprot.fasta -out uniprot_sprot -dbtype prot -parse_seqids -title uniprot_sprot + +# Build full release database only if in full mode +if [ "${DOWNLOAD_MODE}" = "full" ]; then + makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE} + makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl +fi + +cd .. + +echo "" +echo "BLAST databases created successfully!" +echo "Database locations:" +if [ "${DOWNLOAD_MODE}" = "sprot" ]; then + echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" + echo "" + echo "To use this database, set in your config:" + echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot" +else + echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}" + echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" + echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl" + echo "" + echo "To use these databases, set in your config:" + echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl" +fi + diff --git a/examples/search/build_db/build_rna_blast_db.sh b/examples/search/build_db/build_rna_blast_db.sh new file mode 100755 index 00000000..af688ac1 --- /dev/null +++ b/examples/search/build_db/build_rna_blast_db.sh @@ -0,0 +1,433 @@ +#!/bin/bash + +set -e + +# Downloads RNAcentral sequences and creates BLAST databases. +# This script downloads the RNAcentral active database, which is the same +# data source used for online RNAcentral searches, ensuring consistency +# between local and online search results. +# +# RNAcentral is a comprehensive database of non-coding RNA sequences that +# integrates data from multiple expert databases including RefSeq, Rfam, etc. +# +# Usage: ./build_rna_blast_db.sh [all|list|selected|database_name...] +# all (default): Download complete active database (~8.4G compressed) +# list: List all available database subsets +# selected: Download predefined database subsets (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase) +# database_name: Download specific database subset (e.g., refseq, rfam, mirbase) +# database_name1 database_name2 ...: Download multiple database subsets +# +# Available database subsets (examples): +# - refseq.fasta (~98M): RefSeq RNA sequences +# - rfam.fasta (~1.5G): Rfam RNA families +# - mirbase.fasta (~10M): microRNA sequences +# - ensembl_gencode.fasta (~337M): Ensembl/GENCODE annotations (human) +# - gtrnadb.fasta (~38M): tRNA sequences +# - lncbase.fasta (~106K): Human lncRNA database +# - See "list" option for complete list +# +# The complete "active" database contains all sequences from all expert databases. +# Using a specific database subset provides a smaller, focused database. 
+# +# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +# RNAcentral base URL (using EBI HTTPS) +# NOTE: RNAcentral only has one official mirror at EBI +RNACENTRAL_BASE="https://ftp.ebi.ac.uk/pub/databases/RNAcentral" +RNACENTRAL_RELEASE_URL="${RNACENTRAL_BASE}/current_release" +RNACENTRAL_SEQUENCES_URL="${RNACENTRAL_RELEASE_URL}/sequences" +RNACENTRAL_BY_DB_URL="${RNACENTRAL_SEQUENCES_URL}/by-database" + +# Parse command line arguments +DB_SELECTION=${1:-selected} + +# Predefined database list for "selected" option +SELECTED_DATABASES=("ensembl_gencode" "mirbase" "gtrnadb" "refseq" "lncbase" "rfam") + +# List available databases if requested +if [ "${DB_SELECTION}" = "list" ]; then + echo "Available RNAcentral database subsets:" + echo "" + echo "Fetching list from RNAcentral..." + listing=$(curl -s "${RNACENTRAL_BY_DB_URL}/") + echo "${listing}" | \ + grep -oE '' | \ + sed 's///' | \ + sort | \ + while read db; do + size=$(echo "${listing}" | grep -A 1 "${db}" | grep -oE '[0-9.]+[GMK]' | head -1 || echo "unknown") + echo " - ${db%.fasta}: ${size}" + done + echo "" + echo "Usage: $0 [all|list|selected|database_name...]" + echo " Example: $0 refseq # Download only RefSeq sequences (~98M)" + echo " Example: $0 rfam # Download only Rfam sequences (~1.5G)" + echo " Example: $0 selected # Download predefined databases (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase, rfam)" + echo " Example: $0 refseq mirbase # Download multiple databases" + echo " Example: $0 all # Download complete active database (~8.4G)" + exit 0 +fi + +# Determine which databases to download +if [ "${DB_SELECTION}" = "selected" ]; then + # Use predefined database list + DATABASES=("${SELECTED_DATABASES[@]}") + echo "Downloading selected databases: ${DATABASES[*]}" +elif [ "${DB_SELECTION}" = "all" ]; then + # Single database mode (all) + DATABASES=("all") +else + # Multiple databases provided as arguments + DATABASES=("$@") +fi + +# Get RNAcentral release version from release notes (once for all databases) +echo "Getting RNAcentral release information..." 
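+# The release identifier parsed below is only used to name output directories and
+# database titles; if release_notes.txt cannot be fetched or parsed, the current
+# date is used instead.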
+RELEASE_NOTES_URL="${RNACENTRAL_RELEASE_URL}/release_notes.txt" +RELEASE_NOTES_TMP=$(mktemp) +wget -q "${RELEASE_NOTES_URL}" -O "${RELEASE_NOTES_TMP}" 2>/dev/null || { + echo "Warning: Could not download release notes, using current date as release identifier" + RELEASE=$(date +%Y%m%d) +} + +if [ -f "${RELEASE_NOTES_TMP}" ] && [ -s "${RELEASE_NOTES_TMP}" ]; then + # Try to extract version from release notes (first line usually contains version info) + RELEASE=$(head -1 "${RELEASE_NOTES_TMP}" | grep -oE '[0-9]+\.[0-9]+' | head -1 | tr -d '.') + rm -f "${RELEASE_NOTES_TMP}" +fi + +if [ -z "${RELEASE}" ]; then + RELEASE=$(date +%Y%m%d) + echo "Using date as release identifier: ${RELEASE}" +else + echo "RNAcentral release: ${RELEASE}" +fi + +# Process each database +DB_COUNT=${#DATABASES[@]} +DB_INDEX=0 + +for DB_SELECTION in "${DATABASES[@]}"; do + DB_INDEX=$((DB_INDEX + 1)) + echo "" + echo "==========================================" + echo "Processing database ${DB_INDEX}/${DB_COUNT}: ${DB_SELECTION}" + echo "==========================================" + echo "" + + # Check if database already exists and is complete + # First check with current release version + if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" + DB_NAME="rnacentral" + DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" + else + OUTPUT_DIR="rnacentral_${DB_SELECTION}_${RELEASE}" + DB_NAME="${DB_SELECTION}" + DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" + fi + + # Check if BLAST database already exists with current release + if [ -d "${OUTPUT_DIR}" ] && [ -f "${OUTPUT_DIR}/${DB_OUTPUT_NAME}.nhr" ] && [ -f "${OUTPUT_DIR}/${DB_OUTPUT_NAME}.nin" ]; then + echo "✓ Database ${DB_SELECTION} already exists and appears complete: ${OUTPUT_DIR}/" + echo " BLAST database: ${OUTPUT_DIR}/${DB_OUTPUT_NAME}" + echo " Skipping download and database creation..." + continue + fi + + # Also check for any existing version of this database (e.g., different release dates) + EXISTING_DIR=$(ls -d rnacentral_${DB_SELECTION}_* 2>/dev/null | head -1) + if [ -n "${EXISTING_DIR}" ] && [ "${DB_SELECTION}" != "all" ]; then + EXISTING_DB_NAME=$(basename "${EXISTING_DIR}" | sed "s/rnacentral_${DB_SELECTION}_//") + if [ -f "${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}.nhr" ] && [ -f "${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}.nin" ]; then + echo "✓ Database ${DB_SELECTION} already exists (version ${EXISTING_DB_NAME}): ${EXISTING_DIR}/" + echo " BLAST database: ${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}" + echo " Skipping download and database creation..." + echo " Note: Using existing version ${EXISTING_DB_NAME} instead of ${RELEASE}" + continue + fi + fi + + # Better to use a stable DOWNLOAD_TMP name to support resuming downloads + DOWNLOAD_TMP="_downloading_rnacentral_${DB_SELECTION}" + mkdir -p ${DOWNLOAD_TMP} + cd ${DOWNLOAD_TMP} + + # Download RNAcentral FASTA file + if [ "${DB_SELECTION}" = "all" ]; then + # Download complete active database + FASTA_FILE="rnacentral_active.fasta.gz" + DB_NAME="rnacentral" + echo "Downloading RNAcentral active sequences (~8.4G)..." 
+ echo " Contains sequences currently present in at least one expert database" + echo " Uses standard URS IDs (e.g., URS000149A9AF)" + echo " ⭐ MATCHES the online RNAcentral API database - ensures consistency" + FASTA_URL="${RNACENTRAL_SEQUENCES_URL}/${FASTA_FILE}" + IS_COMPRESSED=true + else + # Download specific database subset + DB_NAME="${DB_SELECTION}" + FASTA_FILE="${DB_SELECTION}.fasta" + echo "Downloading RNAcentral database subset: ${DB_SELECTION}" + echo " This is a subset of the active database from a specific expert database" + echo " File: ${FASTA_FILE}" + FASTA_URL="${RNACENTRAL_BY_DB_URL}/${FASTA_FILE}" + IS_COMPRESSED=false + + # Check if database exists (use HTTP status code check for HTTPS) + HTTP_CODE=$(curl -s --max-time 10 -o /dev/null -w "%{http_code}" "${FASTA_URL}" 2>/dev/null | tail -1 || echo "000") + if ! echo "${HTTP_CODE}" | grep -q "^200$"; then + echo "Error: Database '${DB_SELECTION}' not found (HTTP code: ${HTTP_CODE})" + echo "Run '$0 list' to see available databases" + cd .. + rm -rf ${DOWNLOAD_TMP} + exit 1 + fi + fi + + echo "Downloading from: ${FASTA_URL}" + echo "This may take a while depending on your internet connection..." + if [ "${DB_SELECTION}" = "all" ]; then + echo "File size is approximately 8-9GB, please be patient..." + else + echo "Downloading database subset..." + fi + + wget -c "${FASTA_URL}" || { + echo "Error: Failed to download RNAcentral FASTA file" + echo "Please check your internet connection and try again" + echo "URL: ${FASTA_URL}" + cd .. + rm -rf ${DOWNLOAD_TMP} + exit 1 + } + + if [ ! -f "${FASTA_FILE}" ]; then + echo "Error: Downloaded file not found" + cd .. + rm -rf ${DOWNLOAD_TMP} + exit 1 + fi + + cd .. + + # Create release directory + if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" + else + OUTPUT_DIR="rnacentral_${DB_NAME}_${RELEASE}" + fi + mkdir -p ${OUTPUT_DIR} + mv ${DOWNLOAD_TMP}/* ${OUTPUT_DIR}/ 2>/dev/null || true + rmdir ${DOWNLOAD_TMP} 2>/dev/null || true + + cd ${OUTPUT_DIR} + + # Extract FASTA file if compressed + echo "Preparing RNAcentral sequences..." + if [ -f "${FASTA_FILE}" ]; then + if [ "${IS_COMPRESSED}" = "true" ]; then + echo "Decompressing ${FASTA_FILE}..." + OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" + gunzip -c "${FASTA_FILE}" > "${OUTPUT_FASTA}" || { + echo "Error: Failed to decompress FASTA file" + cd .. + exit 1 + } + # Optionally remove the compressed file to save space + # rm "${FASTA_FILE}" + else + # File is not compressed, just copy/rename + OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" + cp "${FASTA_FILE}" "${OUTPUT_FASTA}" || { + echo "Error: Failed to copy FASTA file" + cd .. + exit 1 + } + fi + else + echo "Error: FASTA file not found" + cd .. + exit 1 + fi + + # Check if we have sequences + if [ ! -s "${OUTPUT_FASTA}" ]; then + echo "Error: FASTA file is empty" + cd .. + exit 1 + fi + + # Get file size for user information + FILE_SIZE=$(du -h "${OUTPUT_FASTA}" | cut -f1) + echo "FASTA file size: ${FILE_SIZE}" + + echo "Creating BLAST database..." + # Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide) + # Note: RNAcentral uses RNAcentral IDs (URS...) as sequence identifiers, + # which matches the format expected by the RNACentralSearch class + DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" + makeblastdb -in "${OUTPUT_FASTA}" \ + -out "${DB_OUTPUT_NAME}" \ + -dbtype nucl \ + -parse_seqids \ + -title "RNAcentral_${DB_NAME}_${RELEASE}" + + echo "" + echo "BLAST database created successfully!" 
+ echo "Database location: $(pwd)/${DB_OUTPUT_NAME}" + echo "" + echo "To use this database, set in your config (search_rna_config.yaml):" + echo " rnacentral_params:" + echo " use_local_blast: true" + echo " local_blast_db: $(pwd)/${DB_OUTPUT_NAME}" + echo "" + echo "Note: The database files are:" + ls -lh ${DB_OUTPUT_NAME}.* | head -5 + echo "" + if [ "${DB_SELECTION}" = "all" ]; then + echo "This database uses RNAcentral IDs (URS...), which matches the online" + echo "RNAcentral search API, ensuring consistent results between local and online searches." + else + echo "This is a subset database from ${DB_SELECTION} expert database." + echo "For full coverage matching online API, use 'all' option." + fi + + cd .. +done + +echo "" +echo "==========================================" +echo "All databases processed successfully!" +echo "==========================================" +echo "" + +# If multiple databases were downloaded, offer to merge them +if [ ${#DATABASES[@]} -gt 1 ] && [ "${DATABASES[0]}" != "all" ]; then + echo "Multiple databases downloaded. Creating merged database for unified search..." + MERGED_DIR="rnacentral_merged_${RELEASE}" + mkdir -p ${MERGED_DIR} + cd ${MERGED_DIR} + + MERGED_FASTA="rnacentral_merged_${RELEASE}.fasta" + MERGED_FASTA_TMP="${MERGED_FASTA}.tmp" + echo "Combining FASTA files from all databases..." + echo " Note: Duplicate sequence IDs will be removed (keeping first occurrence)..." + + # Combine all FASTA files into a temporary file + # Find actual database directories (may have different release versions) + FOUND_ANY=false + for DB_SELECTION in "${DATABASES[@]}"; do + [ "${DB_SELECTION}" = "all" ] && continue + + # Try current release version first, then search for any existing version + OUTPUT_FASTA="../rnacentral_${DB_SELECTION}_${RELEASE}/${DB_SELECTION}_${RELEASE}.fasta" + [ ! -f "${OUTPUT_FASTA}" ] && { + EXISTING_DIR=$(ls -d ../rnacentral_${DB_SELECTION}_* 2>/dev/null | head -1) + [ -n "${EXISTING_DIR}" ] && { + EXISTING_VERSION=$(basename "${EXISTING_DIR}" | sed "s/rnacentral_${DB_SELECTION}_//") + OUTPUT_FASTA="${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_VERSION}.fasta" + } + } + + if [ -f "${OUTPUT_FASTA}" ]; then + echo " Adding ${DB_SELECTION} sequences..." + cat "${OUTPUT_FASTA}" >> "${MERGED_FASTA_TMP}" + FOUND_ANY=true + else + echo " Warning: Could not find FASTA file for ${DB_SELECTION}" + fi + done + + # Validate that we have files to merge + if [ "${FOUND_ANY}" = "false" ] || [ ! -s "${MERGED_FASTA_TMP}" ]; then + echo "Error: No FASTA files found to merge" + cd .. + rm -rf ${MERGED_DIR} + exit 1 + fi + + # Remove duplicates based on sequence ID (keeping first occurrence) + echo " Removing duplicate sequence IDs..." + awk ' + /^>/ { + # Process previous sequence if we have one + if (current_id != "" && !seen[current_id]) { + print current_header ORS current_seq + seen[current_id] = 1 + } + # Start new sequence + current_header = $0 + current_id = substr($0, 2) + sub(/[ \t].*/, "", current_id) # Extract ID up to first space/tab + current_seq = "" + next + } + { + # Accumulate sequence data by concatenating lines + current_seq = current_seq $0 + } + END { + # Process last sequence + if (current_id != "" && !seen[current_id]) { + print current_header ORS current_seq + } + } + ' "${MERGED_FASTA_TMP}" > "${MERGED_FASTA}" + rm -f "${MERGED_FASTA_TMP}" + + # Check if merged file was created and has content + if [ ! -s "${MERGED_FASTA}" ]; then + echo "Warning: Merged FASTA file is empty or not created" + cd .. 
+ rm -rf ${MERGED_DIR} + else + FILE_SIZE=$(du -h "${MERGED_FASTA}" | cut -f1) + echo "Merged FASTA file size: ${FILE_SIZE}" + + echo "Creating merged BLAST database..." + MERGED_DB_NAME="rnacentral_merged_${RELEASE}" + makeblastdb -in "${MERGED_FASTA}" \ + -out "${MERGED_DB_NAME}" \ + -dbtype nucl \ + -parse_seqids \ + -title "RNAcentral_Merged_${RELEASE}" + + echo "" + echo "✓ Merged BLAST database created successfully!" + echo "Database location: $(pwd)/${MERGED_DB_NAME}" + echo "" + echo "To use the merged database, set in your config (search_rna_config.yaml):" + echo " rnacentral_params:" + echo " use_local_blast: true" + echo " local_blast_db: $(pwd)/${MERGED_DB_NAME}" + echo "" + echo "Note: The merged database includes: ${DATABASES[*]}" + cd .. + fi +fi + +echo "" +echo "Summary of downloaded databases:" +for DB_SELECTION in "${DATABASES[@]}"; do + if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" + DB_NAME="rnacentral" + else + OUTPUT_DIR="rnacentral_${DB_SELECTION}_${RELEASE}" + DB_NAME="${DB_SELECTION}" + fi + if [ -d "${OUTPUT_DIR}" ]; then + echo " - ${DB_NAME}: ${OUTPUT_DIR}/" + fi +done + +if [ -d "rnacentral_merged_${RELEASE}" ]; then + echo " - merged (all databases): rnacentral_merged_${RELEASE}/" + echo "" + echo "💡 Recommendation: Use the merged database for searching across all databases." +fi + diff --git a/examples/search/search_dna.sh b/examples/search/search_dna.sh new file mode 100644 index 00000000..e05ab751 --- /dev/null +++ b/examples/search/search_dna.sh @@ -0,0 +1,4 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_dna_config.yaml \ +--output_dir cache/ + diff --git a/examples/search/search_dna_config.yaml b/examples/search/search_dna_config.yaml new file mode 100644 index 00000000..9a469aff --- /dev/null +++ b/examples/search/search_dna_config.yaml @@ -0,0 +1,30 @@ +global_params: + working_dir: cache + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_dna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + params: + data_sources: [ncbi] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral + ncbi_params: + email: test@example.com # NCBI requires an email address + tool: GraphGen # tool name for NCBI API + use_local_blast: true # whether to use local blast for DNA search + local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension) + blast_num_threads: 2 # number of threads for BLAST search (reduce to save memory) + max_concurrent: 5 # maximum number of concurrent search tasks (reduce to prevent OOM, default: unlimited) + diff --git a/examples/search/search_protein_config.yaml b/examples/search/search_protein_config.yaml new file mode 100644 index 00000000..c33e5096 --- /dev/null +++ b/examples/search/search_protein_config.yaml @@ -0,0 +1,28 @@ +global_params: + working_dir: cache + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_protein_demo.jsonl # input file path, support json, jsonl, txt, pdf. 
See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + params: + data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot + uniprot_params: + use_local_blast: true # whether to use local blast for uniprot search + local_blast_db: /your_path/2024_01/uniprot_sprot # format: /path/to/${RELEASE}/uniprot_sprot + # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database) + blast_num_threads: 2 # number of threads for BLAST search (reduce to save memory) + max_concurrent: 5 # maximum number of concurrent search tasks (reduce to prevent OOM, default: unlimited) diff --git a/examples/search/search_rna.sh b/examples/search/search_rna.sh new file mode 100644 index 00000000..2bf5a406 --- /dev/null +++ b/examples/search/search_rna.sh @@ -0,0 +1,4 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_rna_config.yaml \ +--output_dir cache/ + diff --git a/examples/search/search_rna_config.yaml b/examples/search/search_rna_config.yaml new file mode 100644 index 00000000..ba6ba3f3 --- /dev/null +++ b/examples/search/search_rna_config.yaml @@ -0,0 +1,27 @@ +global_params: + working_dir: cache + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_rna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + params: + data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral + rnacentral_params: + use_local_blast: true # whether to use local blast for RNA search + local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension) + blast_num_threads: 2 # number of threads for BLAST search (reduce to save memory) + max_concurrent: 5 # maximum number of concurrent search tasks (reduce to prevent OOM, default: unlimited) diff --git a/examples/search/search_uniprot.sh b/examples/search/search_uniprot.sh new file mode 100644 index 00000000..fbb33d70 --- /dev/null +++ b/examples/search/search_uniprot.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_protein_config.yaml \ +--output_dir cache/ diff --git a/graphgen/version.py b/graphgen/_version.py similarity index 72% rename from graphgen/version.py rename to graphgen/_version.py index 73315e64..a7ae27a2 100644 --- a/graphgen/version.py +++ b/graphgen/_version.py @@ -1,7 +1,6 @@ - from typing import Tuple -__version__ = '20250416' +__version__ = "0.1.0.post20250930" short_version = __version__ @@ -15,13 +14,13 @@ def parse_version_info(version_str: str) -> Tuple: tuple: A sequence of integer and string represents version. 
""" _version_info = [] - for x in version_str.split('.'): + for x in version_str.split("."): if x.isdigit(): _version_info.append(int(x)) - elif x.find('rc') != -1: - patch_version = x.split('rc') + elif x.find("rc") != -1: + patch_version = x.split("rc") _version_info.append(int(patch_version[0])) - _version_info.append(f'rc{patch_version[1]}') + _version_info.append(f"rc{patch_version[1]}") return tuple(_version_info) diff --git a/graphgen/bases/__init__.py b/graphgen/bases/__init__.py new file mode 100644 index 00000000..41136974 --- /dev/null +++ b/graphgen/bases/__init__.py @@ -0,0 +1,12 @@ +from .base_extractor import BaseExtractor +from .base_generator import BaseGenerator +from .base_kg_builder import BaseKGBuilder +from .base_llm_wrapper import BaseLLMWrapper +from .base_operator import BaseOperator +from .base_partitioner import BasePartitioner +from .base_reader import BaseReader +from .base_searcher import BaseSearcher +from .base_splitter import BaseSplitter +from .base_storage import BaseGraphStorage, BaseKVStorage, StorageNameSpace +from .base_tokenizer import BaseTokenizer +from .datatypes import Chunk, Config, Node, QAPair, Token diff --git a/graphgen/bases/base_extractor.py b/graphgen/bases/base_extractor.py new file mode 100644 index 00000000..054e6e3a --- /dev/null +++ b/graphgen/bases/base_extractor.py @@ -0,0 +1,22 @@ +from abc import ABC, abstractmethod +from typing import Any + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper + + +class BaseExtractor(ABC): + """ + Extract information from given text. + + """ + + def __init__(self, llm_client: BaseLLMWrapper): + self.llm_client = llm_client + + @abstractmethod + async def extract(self, chunk: dict) -> Any: + """Extract information from the given text""" + + @abstractmethod + def build_prompt(self, text: str) -> str: + """Build prompt for LLM based on the given text""" diff --git a/graphgen/bases/base_generator.py b/graphgen/bases/base_generator.py new file mode 100644 index 00000000..85de5877 --- /dev/null +++ b/graphgen/bases/base_generator.py @@ -0,0 +1,83 @@ +from abc import ABC, abstractmethod +from typing import Any + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper + + +class BaseGenerator(ABC): + """ + Generate QAs based on given prompts. + """ + + def __init__(self, llm_client: BaseLLMWrapper): + self.llm_client = llm_client + + @staticmethod + @abstractmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + """Build prompt for LLM based on the given batch""" + + @staticmethod + @abstractmethod + def parse_response(response: str) -> Any: + """Parse the LLM response and return the generated QAs""" + + async def generate( + self, + batch: tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ], + ) -> dict[str, Any]: + """ + Generate QAs based on a given batch. 
+ :param batch + :return: QA pairs + """ + result = {} + prompt = self.build_prompt(batch) + response = await self.llm_client.generate_answer(prompt) + qa_pairs = self.parse_response(response) # generate one or more QA pairs + result.update(qa_pairs) + return result + + @staticmethod + def format_generation_results( + results: list[dict], output_data_format: str + ) -> list[dict[str, Any]]: + if output_data_format == "Alpaca": + results = [ + { + "instruction": v["question"], + "input": "", + "output": v["answer"], + } + for item in results + for k, v in item.items() + ] + elif output_data_format == "Sharegpt": + results = [ + { + "conversations": [ + {"from": "human", "value": v["question"]}, + {"from": "gpt", "value": v["answer"]}, + ] + } + for item in results + for k, v in item.items() + ] + elif output_data_format == "ChatML": + results = [ + { + "messages": [ + {"role": "user", "content": v["question"]}, + {"role": "assistant", "content": v["answer"]}, + ] + } + for item in results + for k, v in item.items() + ] + else: + raise ValueError(f"Unknown output data format: {output_data_format}") + return results diff --git a/graphgen/bases/base_kg_builder.py b/graphgen/bases/base_kg_builder.py new file mode 100644 index 00000000..d8a5d66a --- /dev/null +++ b/graphgen/bases/base_kg_builder.py @@ -0,0 +1,39 @@ +from abc import ABC, abstractmethod +from collections import defaultdict +from typing import Dict, List, Tuple + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Chunk + + +class BaseKGBuilder(ABC): + def __init__(self, llm_client: BaseLLMWrapper): + self.llm_client = llm_client + self._nodes: Dict[str, List[dict]] = defaultdict(list) + self._edges: Dict[Tuple[str, str], List[dict]] = defaultdict(list) + + @abstractmethod + async def extract( + self, chunk: Chunk + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """Extract nodes and edges from a single chunk.""" + raise NotImplementedError + + @abstractmethod + async def merge_nodes( + self, + node_data: tuple[str, List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + """Merge extracted nodes into the knowledge graph.""" + raise NotImplementedError + + @abstractmethod + async def merge_edges( + self, + edges_data: tuple[Tuple[str, str], List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + """Merge extracted edges into the knowledge graph.""" + raise NotImplementedError diff --git a/graphgen/bases/base_llm_wrapper.py b/graphgen/bases/base_llm_wrapper.py new file mode 100644 index 00000000..8b6dbec7 --- /dev/null +++ b/graphgen/bases/base_llm_wrapper.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +import abc +import re +from typing import Any, List, Optional + +from graphgen.bases.base_tokenizer import BaseTokenizer +from graphgen.bases.datatypes import Token + + +class BaseLLMWrapper(abc.ABC): + """ + LLM client base class, agnostic to specific backends (OpenAI / Ollama / ...). 
+    """
+
+    def __init__(
+        self,
+        *,
+        system_prompt: str = "",
+        temperature: float = 0.0,
+        max_tokens: int = 4096,
+        repetition_penalty: float = 1.05,
+        top_p: float = 0.95,
+        top_k: int = 50,
+        tokenizer: Optional[BaseTokenizer] = None,
+        **kwargs: Any,
+    ):
+        self.system_prompt = system_prompt
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.repetition_penalty = repetition_penalty
+        self.top_p = top_p
+        self.top_k = top_k
+        self.tokenizer = tokenizer
+
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+
+    @abc.abstractmethod
+    async def generate_answer(
+        self, text: str, history: Optional[List[str]] = None, **extra: Any
+    ) -> str:
+        """Generate answer from the model."""
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    async def generate_topk_per_token(
+        self, text: str, history: Optional[List[str]] = None, **extra: Any
+    ) -> List[Token]:
+        """Generate top-k tokens for the next token prediction."""
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    async def generate_inputs_prob(
+        self, text: str, history: Optional[List[str]] = None, **extra: Any
+    ) -> List[Token]:
+        """Generate probabilities for each token in the input."""
+        raise NotImplementedError
+
+    @staticmethod
+    def filter_think_tags(text: str, think_tag: str = "think") -> str:
+        """
+        Remove <think> tags from the text.
+        - If the text contains <think> and </think>, it removes everything between them and the tags themselves.
+        - If the text contains only </think>, it removes content before the tag.
+        """
+        paired_pattern = re.compile(rf"<{think_tag}>.*?</{think_tag}>", re.DOTALL)
+        filtered = paired_pattern.sub("", text)
+
+        orphan_pattern = re.compile(rf"^.*?</{think_tag}>", re.DOTALL)
+        filtered = orphan_pattern.sub("", filtered)
+
+        filtered = filtered.strip()
+        return filtered if filtered else text.strip()
diff --git a/graphgen/bases/base_operator.py b/graphgen/bases/base_operator.py
new file mode 100644
index 00000000..8d0cddb5
--- /dev/null
+++ b/graphgen/bases/base_operator.py
@@ -0,0 +1,59 @@
+import inspect
+import logging
+import os
+from abc import ABC, abstractmethod
+from typing import Iterable, Union
+
+import pandas as pd
+import ray
+
+from graphgen.utils.log import CURRENT_LOGGER_VAR, set_logger
+
+
+class BaseOperator(ABC):
+    def __init__(self, working_dir: str = "cache", op_name: str = None):
+        log_dir = os.path.join(working_dir, "logs")
+        self.op_name = op_name or self.__class__.__name__
+
+        try:
+            ctx = ray.get_runtime_context()
+            worker_id = ctx.get_actor_id() or ctx.get_worker_id()
+            worker_id_short = worker_id[-6:] if worker_id else "driver"
+        except Exception as e:
+            print(
+                "Warning: Could not get Ray worker ID, defaulting to 'local'. Exception:",
+                e,
+            )
+            worker_id_short = "local"
+
+        # e.g. 
cache/logs/ChunkService_a1b2c3.log + log_file = os.path.join(log_dir, f"{self.op_name}_{worker_id_short}.log") + + self.logger = set_logger( + log_file=log_file, name=f"{self.op_name}.{worker_id_short}", + console_level=logging.ERROR, force=True + ) + + self.logger.info( + "[%s] Operator initialized on Worker %s", self.op_name, worker_id_short + ) + + def __call__( + self, batch: pd.DataFrame + ) -> Union[pd.DataFrame, Iterable[pd.DataFrame]]: + logger_token = CURRENT_LOGGER_VAR.set(self.logger) + try: + result = self.process(batch) + if inspect.isgenerator(result): + yield from result + else: + yield result + finally: + CURRENT_LOGGER_VAR.reset(logger_token) + + @abstractmethod + def process(self, batch): + raise NotImplementedError("Subclasses must implement the process method.") + + def get_logger(self): + return self.logger diff --git a/graphgen/bases/base_partitioner.py b/graphgen/bases/base_partitioner.py new file mode 100644 index 00000000..d948e3a7 --- /dev/null +++ b/graphgen/bases/base_partitioner.py @@ -0,0 +1,69 @@ +from abc import ABC, abstractmethod +from typing import Any, List + +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Community + + +class BasePartitioner(ABC): + @abstractmethod + def partition( + self, + g: BaseGraphStorage, + **kwargs: Any, + ) -> List[Community]: + """ + Graph -> Communities + :param g: Graph storage instance + :param kwargs: Additional parameters for partitioning + :return: List of communities + """ + + @staticmethod + def community2batch( + comm: Community, g: BaseGraphStorage + ) -> tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ]: + """ + Convert communities to batches of nodes and edges. + :param comm: Community + :param g: Graph storage instance + :return: List of batches, each batch is a tuple of (nodes, edges) + """ + nodes = comm.nodes + edges = comm.edges + nodes_data = [] + for node in nodes: + node_data = g.get_node(node) + if node_data: + nodes_data.append((node, node_data)) + edges_data = [] + for u, v in edges: + edge_data = g.get_edge(u, v) + if edge_data: + edges_data.append((u, v, edge_data)) + else: + edge_data = g.get_edge(v, u) + if edge_data: + edges_data.append((v, u, edge_data)) + return nodes_data, edges_data + + @staticmethod + def _build_adjacency_list( + nodes: List[tuple[str, dict]], edges: List[tuple[str, str, dict]] + ) -> tuple[dict[str, List[str]], set[tuple[str, str]]]: + """ + Build adjacency list and edge set from nodes and edges. + :param nodes + :param edges + :return: adjacency list, edge set + """ + adj: dict[str, List[str]] = {n[0]: [] for n in nodes} + edge_set: set[tuple[str, str]] = set() + for e in edges: + adj[e[0]].append(e[1]) + adj[e[1]].append(e[0]) + edge_set.add((e[0], e[1])) + edge_set.add((e[1], e[0])) + return adj, edge_set diff --git a/graphgen/bases/base_reader.py b/graphgen/bases/base_reader.py new file mode 100644 index 00000000..ba72f410 --- /dev/null +++ b/graphgen/bases/base_reader.py @@ -0,0 +1,83 @@ +import os +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Union + +import pandas as pd +import requests +from ray.data import Dataset + + +class BaseReader(ABC): + """ + Abstract base class for reading and processing data. 
+ """ + + def __init__(self, text_column: str = "content", modalities: list = None): + self.text_column = text_column + self.modalities = modalities if modalities is not None else ["text"] + + @abstractmethod + def read(self, input_path: Union[str, List[str]]) -> Dataset: + """ + Read data from the specified file path. + + :param input_path: Path to the input file or list of file paths. + :return: Ray Dataset containing the read data. + """ + + def _should_keep_item(self, item: Dict[str, Any]) -> bool: + """ + Determine whether to keep the given item based on the text column. + + :param item: Dictionary representing a data entry. + :return: True if the item should be kept, False otherwise. + """ + item_type = item.get("type") + assert item_type in [ + "text", + "image", + "table", + "equation", + "protein", + "dna", + "rna", + ], f"Unsupported item type: {item_type}" + if item_type == "text": + content = item.get(self.text_column, "").strip() + return bool(content) + return True + + def _validate_batch(self, batch: pd.DataFrame) -> pd.DataFrame: + """ + Validate data format. + """ + if "type" not in batch.columns: + raise ValueError(f"Missing 'type' column. Found: {list(batch.columns)}") + + if "text" in batch["type"].values: + if self.text_column not in batch.columns: + raise ValueError( + f"Missing '{self.text_column}' column for text documents" + ) + + return batch + + @staticmethod + def _image_exists(path_or_url: str, timeout: int = 3) -> bool: + """ + Check if an image exists at the given local path or URL. + :param path_or_url: Local file path or remote URL of the image. + :param timeout: Timeout for remote URL requests in seconds. + :return: True if the image exists, False otherwise. + """ + if not path_or_url: + return False + if not path_or_url.startswith(("http://", "https://", "ftp://")): + path = path_or_url.replace("file://", "", 1) + path = os.path.abspath(path) + return os.path.isfile(path) + try: + resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout) + return resp.status_code == 200 + except requests.RequestException: + return False diff --git a/graphgen/bases/base_searcher.py b/graphgen/bases/base_searcher.py new file mode 100644 index 00000000..cba0315e --- /dev/null +++ b/graphgen/bases/base_searcher.py @@ -0,0 +1,47 @@ +import logging +import os +from abc import ABC, abstractmethod +from typing import Any, Dict, List + +from graphgen.utils.log import set_logger + + +class BaseSearcher(ABC): + """ + Abstract base class for searching and retrieving data. + """ + + def __init__(self, working_dir: str = "cache"): + """ + Initialize the base searcher with a logger. + + :param working_dir: Working directory for log files. + """ + log_dir = os.path.join(working_dir, "logs") + searcher_name = self.__class__.__name__ + + # e.g. cache/logs/NCBISearch.log + log_file = os.path.join(log_dir, f"{searcher_name}.log") + + self.logger = set_logger( + log_file=log_file, name=searcher_name, + console_level=logging.ERROR, force=True + ) + + self.logger.info( + "[%s] Searcher initialized", searcher_name + ) + + @abstractmethod + async def search(self, query: str, **kwargs) -> List[Dict[str, Any]]: + """ + Search for data based on the given query. + + :param query: The searcher query. + :param kwargs: Additional keyword arguments for the searcher. + :return: List of dictionaries containing the searcher results. 
+ """ + + def get_logger(self): + """Get the logger instance.""" + return self.logger diff --git a/graphgen/bases/base_splitter.py b/graphgen/bases/base_splitter.py new file mode 100644 index 00000000..f77be6e4 --- /dev/null +++ b/graphgen/bases/base_splitter.py @@ -0,0 +1,142 @@ +import copy +import re +from abc import ABC, abstractmethod +from typing import Callable, Iterable, List, Literal, Optional, Union + +from graphgen.bases.datatypes import Chunk +from graphgen.utils.log import logger + + +class BaseSplitter(ABC): + """ + Abstract base class for splitting text into smaller chunks. + """ + + def __init__( + self, + chunk_size: int = 1024, + chunk_overlap: int = 100, + length_function: Callable[[str], int] = len, + keep_separator: bool = False, + add_start_index: bool = False, + strip_whitespace: bool = True, + ): + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + self.length_function = length_function + self.keep_separator = keep_separator + self.add_start_index = add_start_index + self.strip_whitespace = strip_whitespace + + @abstractmethod + def split_text(self, text: str) -> List[str]: + """ + Split the input text into smaller chunks. + + :param text: The input text to be chunk. + :return: A list of text chunks. + """ + + def create_chunks( + self, texts: List[str], metadatas: Optional[List[dict]] = None + ) -> List[Chunk]: + """Create chunks from a list of texts.""" + _metadatas = metadatas or [{}] * len(texts) + chunks = [] + for i, text in enumerate(texts): + index = 0 + previous_chunk_len = 0 + for chunk in self.split_text(text): + metadata = copy.deepcopy(_metadatas[i]) + if self.add_start_index: + offset = index + previous_chunk_len - self.chunk_overlap + index = text.find(chunk, max(0, offset)) + metadata["start_index"] = index + previous_chunk_len = len(chunk) + new_chunk = Chunk(content=chunk, metadata=metadata) + chunks.append(new_chunk) + return chunks + + def _join_chunks(self, chunks: List[str], separator: str) -> Optional[str]: + text = separator.join(chunks) + if self.strip_whitespace: + text = text.strip() + if text == "": + return None + return text + + def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]: + # We now want to combine these smaller pieces into medium size chunks to send to the LLM. 
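+        # Illustrative example (assuming the default length_function=len):
+        # with chunk_size=10, chunk_overlap=5 and separator=" ", the splits
+        # ["aaaa", "bbbb", "cccc"] are merged into ["aaaa bbbb", "bbbb cccc"].
+        # Older splits are dropped from the front of the window until the
+        # retained length no longer exceeds chunk_overlap and the next split
+        # fits, so "bbbb" carries over as the overlap into the next chunk.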
+ separator_len = self.length_function(separator) + + chunks = [] + current_chunk: List[str] = [] + total = 0 + for d in splits: + _len = self.length_function(d) + if ( + total + _len + (separator_len if len(current_chunk) > 0 else 0) + > self.chunk_size + ): + if total > self.chunk_size: + logger.warning( + "Created a chunk of size %s, which is longer than the specified %s", + total, + self.chunk_size, + ) + if len(current_chunk) > 0: + chunk = self._join_chunks(current_chunk, separator) + if chunk is not None: + chunks.append(chunk) + # Keep on popping if: + # - we have a larger chunk than in the chunk overlap + # - or if we still have any chunks and the length is long + while total > self.chunk_overlap or ( + total + _len + (separator_len if len(current_chunk) > 0 else 0) + > self.chunk_size + and total > 0 + ): + total -= self.length_function(current_chunk[0]) + ( + separator_len if len(current_chunk) > 1 else 0 + ) + current_chunk = current_chunk[1:] + current_chunk.append(d) + total += _len + (separator_len if len(current_chunk) > 1 else 0) + chunk = self._join_chunks(current_chunk, separator) + if chunk is not None: + chunks.append(chunk) + return chunks + + @staticmethod + def _split_text_with_regex( + text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]] + ) -> List[str]: + # Now that we have the separator, chunk the text + if separator: + if keep_separator: + # The parentheses in the pattern keep the delimiters in the result. + _splits = re.split(f"({separator})", text) + splits = ( + ( + [ + _splits[i] + _splits[i + 1] + for i in range(0, len(_splits) - 1, 2) + ] + ) + if keep_separator == "end" + else ( + [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)] + ) + ) + if len(_splits) % 2 == 0: + splits += _splits[-1:] + splits = ( + (splits + [_splits[-1]]) + if keep_separator == "end" + else ([_splits[0]] + splits) + ) + else: + splits = re.split(separator, text) + else: + splits = list(text) + return [s for s in splits if s != ""] diff --git a/graphgen/bases/base_storage.py b/graphgen/bases/base_storage.py new file mode 100644 index 00000000..ff7d2d1a --- /dev/null +++ b/graphgen/bases/base_storage.py @@ -0,0 +1,96 @@ +from dataclasses import dataclass +from typing import Generic, TypeVar, Union + +T = TypeVar("T") + + +@dataclass +class StorageNameSpace: + working_dir: str = None + namespace: str = None + + def index_done_callback(self): + """commit the storage operations after indexing""" + + def query_done_callback(self): + """commit the storage operations after querying""" + + +class BaseKVStorage(Generic[T], StorageNameSpace): + def all_keys(self) -> list[str]: + raise NotImplementedError + + def get_by_id(self, id: str) -> Union[T, None]: + raise NotImplementedError + + def get_by_ids( + self, ids: list[str], fields: Union[set[str], None] = None + ) -> list[Union[T, None]]: + raise NotImplementedError + + def get_all(self) -> dict[str, T]: + raise NotImplementedError + + def filter_keys(self, data: list[str]) -> set[str]: + """return un-exist keys""" + raise NotImplementedError + + def upsert(self, data: dict[str, T]): + raise NotImplementedError + + def drop(self): + raise NotImplementedError + + def reload(self): + raise NotImplementedError + + +class BaseGraphStorage(StorageNameSpace): + def has_node(self, node_id: str) -> bool: + raise NotImplementedError + + def has_edge(self, source_node_id: str, target_node_id: str) -> bool: + raise NotImplementedError + + def node_degree(self, node_id: str) -> int: + raise NotImplementedError 
+ + def edge_degree(self, src_id: str, tgt_id: str) -> int: + raise NotImplementedError + + def get_node(self, node_id: str) -> Union[dict, None]: + raise NotImplementedError + + def update_node(self, node_id: str, node_data: dict[str, str]): + raise NotImplementedError + + def get_all_nodes(self) -> Union[list[tuple[str, dict]], None]: + raise NotImplementedError + + def get_edge(self, source_node_id: str, target_node_id: str) -> Union[dict, None]: + raise NotImplementedError + + def update_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + raise NotImplementedError + + def get_all_edges(self) -> Union[list[tuple[str, str, dict]], None]: + raise NotImplementedError + + def get_node_edges(self, source_node_id: str) -> Union[list[tuple[str, str]], None]: + raise NotImplementedError + + def upsert_node(self, node_id: str, node_data: dict[str, str]): + raise NotImplementedError + + def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + raise NotImplementedError + + def delete_node(self, node_id: str): + raise NotImplementedError + + def reload(self): + raise NotImplementedError diff --git a/graphgen/bases/base_tokenizer.py b/graphgen/bases/base_tokenizer.py new file mode 100644 index 00000000..346d500b --- /dev/null +++ b/graphgen/bases/base_tokenizer.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import List + + +class BaseTokenizer(ABC): + def __init__(self, model_name: str = "cl100k_base"): + self.model_name = model_name + + @abstractmethod + def encode(self, text: str) -> List[int]: + """Encode text -> token ids.""" + raise NotImplementedError + + @abstractmethod + def decode(self, token_ids: List[int]) -> str: + """Decode token ids -> text.""" + raise NotImplementedError + + def count_tokens(self, text: str) -> int: + return len(self.encode(text)) + + def chunk_by_token_size( + self, + content: str, + *, + overlap_token_size: int = 128, + max_token_size: int = 1024, + ) -> List[dict]: + tokens = self.encode(content) + results = [] + step = max_token_size - overlap_token_size + for index, start in enumerate(range(0, len(tokens), step)): + chunk_ids = tokens[start : start + max_token_size] + results.append( + { + "tokens": len(chunk_ids), + "content": self.decode(chunk_ids).strip(), + "chunk_order_index": index, + } + ) + return results diff --git a/graphgen/bases/datatypes.py b/graphgen/bases/datatypes.py new file mode 100644 index 00000000..df719fdf --- /dev/null +++ b/graphgen/bases/datatypes.py @@ -0,0 +1,94 @@ +import math +from dataclasses import dataclass, field +from typing import List, Union + +from pydantic import BaseModel, Field, field_validator + + +@dataclass +class Chunk: + id: str + content: str + type: str + metadata: dict = field(default_factory=dict) + + @staticmethod + def from_dict(key: str, data: dict) -> "Chunk": + return Chunk( + id=key, + content=data.get("content", ""), + type=data.get("type", "text"), + metadata={k: v for k, v in data.items() if k != "content"}, + ) + + +@dataclass +class QAPair: + """ + A pair of question and answer. 
+ """ + + question: str + answer: str + + +@dataclass +class Token: + text: str + prob: float + top_candidates: List = field(default_factory=list) + ppl: Union[float, None] = field(default=None) + + @property + def logprob(self) -> float: + return math.log(self.prob) + + +@dataclass +class Community: + id: Union[int, str] + nodes: List[str] = field(default_factory=list) + edges: List[tuple] = field(default_factory=list) + metadata: dict = field(default_factory=dict) + + +class Node(BaseModel): + id: str = Field(..., description="unique node id") + op_name: str = Field(..., description="operator name") + type: str = Field( + ..., description="task type, e.g., map, filter, flatmap, aggregate, map_batch" + ) + params: dict = Field(default_factory=dict, description="operator parameters") + dependencies: List[str] = Field( + default_factory=list, description="list of dependent node ids" + ) + execution_params: dict = Field( + default_factory=dict, description="execution parameters like replicas, batch_size" + ) + + @classmethod + @field_validator("type") + def validate_type(cls, v: str) -> str: + valid_types = {"map", "filter", "flatmap", "aggregate", "map_batch"} + if v not in valid_types: + raise ValueError(f"Invalid node type: {v}. Must be one of {valid_types}.") + return v + + +class Config(BaseModel): + global_params: dict = Field( + default_factory=dict, description="global context for the computation graph" + ) + + nodes: List[Node] = Field( + ..., min_length=1, description="list of nodes in the computation graph" + ) + + @classmethod + @field_validator("nodes") + def validate_unique_ids(cls, v: List[Node]) -> List[Node]: + ids = [node.id for node in v] + if len(ids) != len(set(ids)): + duplicates = {id_ for id_ in ids if ids.count(id_) > 1} + raise ValueError(f"Duplicate node ids found: {duplicates}") + return v diff --git a/graphgen/common/__init__.py b/graphgen/common/__init__.py new file mode 100644 index 00000000..deb99459 --- /dev/null +++ b/graphgen/common/__init__.py @@ -0,0 +1,2 @@ +from .init_llm import init_llm +from .init_storage import init_storage diff --git a/graphgen/common/init_llm.py b/graphgen/common/init_llm.py new file mode 100644 index 00000000..af53709a --- /dev/null +++ b/graphgen/common/init_llm.py @@ -0,0 +1,177 @@ +import os +from typing import Any, Dict, Optional + +import ray + +from graphgen.bases import BaseLLMWrapper +from graphgen.common.init_storage import get_actor_handle +from graphgen.models import Tokenizer + + +class LLMServiceActor: + """ + A Ray actor class to wrap LLM wrapper instances for distributed usage. 
+ """ + + def __init__(self, backend: str, config: Dict[str, Any]): + self.backend = backend + tokenizer_model = os.environ.get("TOKENIZER_MODEL", "cl100k_base") + tokenizer = Tokenizer(model_name=tokenizer_model) + config["tokenizer"] = tokenizer + + if backend == "http_api": + from graphgen.models.llm.api.http_client import HTTPClient + + self.llm_instance = HTTPClient(**config) + elif backend in ("openai_api", "azure_openai_api"): + from graphgen.models.llm.api.openai_client import OpenAIClient + + # pass in concrete backend to the OpenAIClient so that internally we can distinguish + # between OpenAI and Azure OpenAI + self.llm_instance = OpenAIClient(**config, backend=backend) + elif backend == "ollama_api": + from graphgen.models.llm.api.ollama_client import OllamaClient + + self.llm_instance = OllamaClient(**config) + elif backend == "huggingface": + from graphgen.models.llm.local.hf_wrapper import HuggingFaceWrapper + + self.llm_instance = HuggingFaceWrapper(**config) + elif backend == "sglang": + from graphgen.models.llm.local.sglang_wrapper import SGLangWrapper + + self.llm_instance = SGLangWrapper(**config) + + elif backend == "vllm": + from graphgen.models.llm.local.vllm_wrapper import VLLMWrapper + + self.llm_instance = VLLMWrapper(**config) + else: + raise NotImplementedError(f"Backend {backend} is not implemented yet.") + + async def generate_answer( + self, text: str, history: Optional[list[str]] = None, **extra: Any + ) -> str: + return await self.llm_instance.generate_answer(text, history, **extra) + + async def generate_topk_per_token( + self, text: str, history: Optional[list[str]] = None, **extra: Any + ) -> list: + return await self.llm_instance.generate_topk_per_token(text, history, **extra) + + async def generate_inputs_prob( + self, text: str, history: Optional[list[str]] = None, **extra: Any + ) -> list: + return await self.llm_instance.generate_inputs_prob(text, history, **extra) + + def ready(self) -> bool: + """A simple method to check if the actor is ready.""" + return True + + +class LLMServiceProxy(BaseLLMWrapper): + """ + A proxy class to interact with the LLMServiceActor for distributed LLM operations. + """ + + def __init__(self, actor_name: str): + super().__init__() + self.actor_handle = get_actor_handle(actor_name) + self._create_local_tokenizer() + + async def generate_answer( + self, text: str, history: Optional[list[str]] = None, **extra: Any + ) -> str: + object_ref = self.actor_handle.generate_answer.remote(text, history, **extra) + return await object_ref + + async def generate_topk_per_token( + self, text: str, history: Optional[list[str]] = None, **extra: Any + ) -> list: + object_ref = self.actor_handle.generate_topk_per_token.remote( + text, history, **extra + ) + return await object_ref + + async def generate_inputs_prob( + self, text: str, history: Optional[list[str]] = None, **extra: Any + ) -> list: + object_ref = self.actor_handle.generate_inputs_prob.remote( + text, history, **extra + ) + return await object_ref + + def _create_local_tokenizer(self): + tokenizer_model = os.environ.get("TOKENIZER_MODEL", "cl100k_base") + self.tokenizer = Tokenizer(model_name=tokenizer_model) + + +class LLMFactory: + """ + A factory class to create LLM wrapper instances based on the specified backend. 
+ Supported backends include: + - http_api: HTTPClient + - openai_api: OpenAIClient + - ollama_api: OllamaClient + - huggingface: HuggingFaceWrapper + - sglang: SGLangWrapper + """ + + @staticmethod + def create_llm( + model_type: str, backend: str, config: Dict[str, Any] + ) -> BaseLLMWrapper: + if not config: + raise ValueError( + f"No configuration provided for LLM {model_type} with backend {backend}." + ) + + actor_name = f"Actor_LLM_{model_type}" + try: + ray.get_actor(actor_name) + except ValueError: + print(f"Creating Ray actor for LLM {model_type} with backend {backend}.") + num_gpus = float(config.pop("num_gpus", 0)) + actor = ( + ray.remote(LLMServiceActor) + .options( + name=actor_name, + num_gpus=num_gpus, + lifetime="detached", + get_if_exists=True, + ) + .remote(backend, config) + ) + + # wait for actor to be ready + ray.get(actor.ready.remote()) + + return LLMServiceProxy(actor_name) + + +def _load_env_group(prefix: str) -> Dict[str, Any]: + """ + Collect environment variables with the given prefix into a dictionary, + stripping the prefix from the keys. + """ + return { + k[len(prefix) :].lower(): v + for k, v in os.environ.items() + if k.startswith(prefix) + } + + +def init_llm(model_type: str) -> Optional[BaseLLMWrapper]: + if model_type == "synthesizer": + prefix = "SYNTHESIZER_" + elif model_type == "trainee": + prefix = "TRAINEE_" + else: + raise NotImplementedError(f"Model type {model_type} is not implemented yet.") + config = _load_env_group(prefix) + # if config is empty, return None + if not config: + return None + backend = config.pop("backend") + llm_wrapper = LLMFactory.create_llm(model_type, backend, config) + return llm_wrapper diff --git a/graphgen/common/init_storage.py b/graphgen/common/init_storage.py new file mode 100644 index 00000000..b9358485 --- /dev/null +++ b/graphgen/common/init_storage.py @@ -0,0 +1,262 @@ +from typing import Any, Dict, Union + +import ray + +from graphgen.bases.base_storage import BaseGraphStorage, BaseKVStorage + + +class KVStorageActor: + def __init__(self, backend: str, working_dir: str, namespace: str): + if backend == "json_kv": + from graphgen.models import JsonKVStorage + + self.kv = JsonKVStorage(working_dir, namespace) + elif backend == "rocksdb": + from graphgen.models import RocksDBKVStorage + + self.kv = RocksDBKVStorage(working_dir, namespace) + else: + raise ValueError(f"Unknown KV backend: {backend}") + + def data(self) -> Dict[str, Dict]: + return self.kv.data + + def all_keys(self) -> list[str]: + return self.kv.all_keys() + + def index_done_callback(self): + return self.kv.index_done_callback() + + def get_by_id(self, id: str) -> Dict: + return self.kv.get_by_id(id) + + def get_by_ids(self, ids: list[str], fields=None) -> list: + return self.kv.get_by_ids(ids, fields) + + def get_all(self) -> Dict[str, Dict]: + return self.kv.get_all() + + def filter_keys(self, data: list[str]) -> set[str]: + return self.kv.filter_keys(data) + + def upsert(self, data: dict) -> dict: + return self.kv.upsert(data) + + def drop(self): + return self.kv.drop() + + def reload(self): + return self.kv.reload() + + +class GraphStorageActor: + def __init__(self, backend: str, working_dir: str, namespace: str): + if backend == "networkx": + from graphgen.models import NetworkXStorage + + self.graph = NetworkXStorage(working_dir, namespace) + elif backend == "kuzu": + from graphgen.models import KuzuStorage + + self.graph = KuzuStorage(working_dir, namespace) + else: + raise ValueError(f"Unknown Graph backend: {backend}") + + def 
index_done_callback(self): + return self.graph.index_done_callback() + + def has_node(self, node_id: str) -> bool: + return self.graph.has_node(node_id) + + def has_edge(self, source_node_id: str, target_node_id: str): + return self.graph.has_edge(source_node_id, target_node_id) + + def node_degree(self, node_id: str) -> int: + return self.graph.node_degree(node_id) + + def edge_degree(self, src_id: str, tgt_id: str) -> int: + return self.graph.edge_degree(src_id, tgt_id) + + def get_node(self, node_id: str) -> Any: + return self.graph.get_node(node_id) + + def update_node(self, node_id: str, node_data: dict[str, str]): + return self.graph.update_node(node_id, node_data) + + def get_all_nodes(self) -> Any: + return self.graph.get_all_nodes() + + def get_edge(self, source_node_id: str, target_node_id: str): + return self.graph.get_edge(source_node_id, target_node_id) + + def update_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + return self.graph.update_edge(source_node_id, target_node_id, edge_data) + + def get_all_edges(self) -> Any: + return self.graph.get_all_edges() + + def get_node_edges(self, source_node_id: str) -> Any: + return self.graph.get_node_edges(source_node_id) + + def upsert_node(self, node_id: str, node_data: dict[str, str]): + return self.graph.upsert_node(node_id, node_data) + + def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + return self.graph.upsert_edge(source_node_id, target_node_id, edge_data) + + def delete_node(self, node_id: str): + return self.graph.delete_node(node_id) + + def reload(self): + return self.graph.reload() + + +def get_actor_handle(name: str): + try: + return ray.get_actor(name) + except ValueError as exc: + raise RuntimeError( + f"Actor {name} not found. Make sure it is created before accessing." 
+ ) from exc + + +class RemoteKVStorageProxy(BaseKVStorage): + def __init__(self, namespace: str): + super().__init__() + self.namespace = namespace + self.actor_name = f"Actor_KV_{namespace}" + self.actor = get_actor_handle(self.actor_name) + + def data(self) -> Dict[str, Any]: + return ray.get(self.actor.data.remote()) + + def all_keys(self) -> list[str]: + return ray.get(self.actor.all_keys.remote()) + + def index_done_callback(self): + return ray.get(self.actor.index_done_callback.remote()) + + def get_by_id(self, id: str) -> Union[Any, None]: + return ray.get(self.actor.get_by_id.remote(id)) + + def get_by_ids(self, ids: list[str], fields=None) -> list[Any]: + return ray.get(self.actor.get_by_ids.remote(ids, fields)) + + def get_all(self) -> Dict[str, Any]: + return ray.get(self.actor.get_all.remote()) + + def filter_keys(self, data: list[str]) -> set[str]: + return ray.get(self.actor.filter_keys.remote(data)) + + def upsert(self, data: Dict[str, Any]): + return ray.get(self.actor.upsert.remote(data)) + + def drop(self): + return ray.get(self.actor.drop.remote()) + + def reload(self): + return ray.get(self.actor.reload.remote()) + + +class RemoteGraphStorageProxy(BaseGraphStorage): + def __init__(self, namespace: str): + super().__init__() + self.namespace = namespace + self.actor_name = f"Actor_Graph_{namespace}" + self.actor = get_actor_handle(self.actor_name) + + def index_done_callback(self): + return ray.get(self.actor.index_done_callback.remote()) + + def has_node(self, node_id: str) -> bool: + return ray.get(self.actor.has_node.remote(node_id)) + + def has_edge(self, source_node_id: str, target_node_id: str): + return ray.get(self.actor.has_edge.remote(source_node_id, target_node_id)) + + def node_degree(self, node_id: str) -> int: + return ray.get(self.actor.node_degree.remote(node_id)) + + def edge_degree(self, src_id: str, tgt_id: str) -> int: + return ray.get(self.actor.edge_degree.remote(src_id, tgt_id)) + + def get_node(self, node_id: str) -> Any: + return ray.get(self.actor.get_node.remote(node_id)) + + def update_node(self, node_id: str, node_data: dict[str, str]): + return ray.get(self.actor.update_node.remote(node_id, node_data)) + + def get_all_nodes(self) -> Any: + return ray.get(self.actor.get_all_nodes.remote()) + + def get_edge(self, source_node_id: str, target_node_id: str): + return ray.get(self.actor.get_edge.remote(source_node_id, target_node_id)) + + def update_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + return ray.get( + self.actor.update_edge.remote(source_node_id, target_node_id, edge_data) + ) + + def get_all_edges(self) -> Any: + return ray.get(self.actor.get_all_edges.remote()) + + def get_node_edges(self, source_node_id: str) -> Any: + return ray.get(self.actor.get_node_edges.remote(source_node_id)) + + def upsert_node(self, node_id: str, node_data: dict[str, str]): + return ray.get(self.actor.upsert_node.remote(node_id, node_data)) + + def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + return ray.get( + self.actor.upsert_edge.remote(source_node_id, target_node_id, edge_data) + ) + + def delete_node(self, node_id: str): + return ray.get(self.actor.delete_node.remote(node_id)) + + def reload(self): + return ray.get(self.actor.reload.remote()) + + +class StorageFactory: + """ + Factory class to create storage instances based on backend. 
+ """ + + @staticmethod + def create_storage(backend: str, working_dir: str, namespace: str): + if backend in ["json_kv", "rocksdb"]: + actor_name = f"Actor_KV_{namespace}" + try: + ray.get_actor(actor_name) + except ValueError: + ray.remote(KVStorageActor).options( + name=actor_name, + lifetime="detached", + get_if_exists=True, + ).remote(backend, working_dir, namespace) + return RemoteKVStorageProxy(namespace) + if backend in ["networkx", "kuzu"]: + actor_name = f"Actor_Graph_{namespace}" + try: + ray.get_actor(actor_name) + except ValueError: + ray.remote(GraphStorageActor).options( + name=actor_name, + lifetime="detached", + get_if_exists=True, + ).remote(backend, working_dir, namespace) + return RemoteGraphStorageProxy(namespace) + raise ValueError(f"Unknown storage backend: {backend}") + + +def init_storage(backend: str, working_dir: str, namespace: str): + return StorageFactory.create_storage(backend, working_dir, namespace) diff --git a/graphgen/configs/config.yaml.example b/graphgen/configs/config.yaml.example deleted file mode 100644 index eeb804af..00000000 --- a/graphgen/configs/config.yaml.example +++ /dev/null @@ -1,16 +0,0 @@ -data_type: raw -input_file: resources/examples/raw_demo.jsonl -tokenizer: cl100k_base -quiz_samples: 2 -traverse_strategy: - qa_form: atomic - bidirectional: true - edge_sampling: max_loss - expand_method: max_tokens - isolated_node_strategy: add - max_depth: 2 - max_extra_edges: 5 - max_tokens: 256 - loss_strategy: only_edge -web_search: false -re_judge: false diff --git a/graphgen/configs/graphgen_config.yaml b/graphgen/configs/graphgen_config.yaml deleted file mode 100644 index 4ddb66c7..00000000 --- a/graphgen/configs/graphgen_config.yaml +++ /dev/null @@ -1,16 +0,0 @@ -data_type: raw -input_file: resources/examples/raw_demo.jsonl -tokenizer: cl100k_base -quiz_samples: 2 -traverse_strategy: - qa_form: aggregated - bidirectional: true - edge_sampling: max_loss - expand_method: max_width - isolated_node_strategy: ignore - max_depth: 1 - max_extra_edges: 2 - max_tokens: 256 - loss_strategy: only_edge -web_search: false -re_judge: false diff --git a/graphgen/engine.py b/graphgen/engine.py new file mode 100644 index 00000000..63f35689 --- /dev/null +++ b/graphgen/engine.py @@ -0,0 +1,225 @@ +import inspect +import logging +import os +from collections import defaultdict, deque +from functools import wraps +from typing import Any, Callable, Dict, List, Set + +import ray +import ray.data + +from graphgen.bases import Config, Node +from graphgen.utils import logger + + +class Engine: + def __init__( + self, config: Dict[str, Any], functions: Dict[str, Callable], **ray_init_kwargs + ): + self.config = Config(**config) + self.global_params = self.config.global_params + self.functions = functions + self.datasets: Dict[str, ray.data.Dataset] = {} + + # Disable Ray Data progress bars and verbose output + os.environ.setdefault("RAY_DATA_DISABLE_PROGRESS_BARS", "1") + # Disable metrics exporter to avoid RpcError + os.environ.setdefault("RAY_DISABLE_IMPORTANT_WARNING", "1") + try: + from ray.data import DataContext + ctx = DataContext.get_current() + ctx.enable_rich_progress_bars = False + ctx.use_ray_tqdm = False + except Exception: + pass # Ray Data context might not be available + + if not ray.is_initialized(): + # Disable metrics exporter to avoid RpcError + ray_init_kwargs.setdefault("_metrics_export_port", 0) + context = ray.init( + ignore_reinit_error=True, + logging_level=logging.ERROR, + log_to_driver=False, # Disable Ray logs to driver + 
**ray_init_kwargs, + ) + logger.info("Ray Dashboard URL: %s", context.dashboard_url) + + @staticmethod + def _topo_sort(nodes: List[Node]) -> List[Node]: + id_to_node: Dict[str, Node] = {} + for n in nodes: + id_to_node[n.id] = n + + indeg: Dict[str, int] = {nid: 0 for nid in id_to_node} + adj: Dict[str, List[str]] = defaultdict(list) + + for n in nodes: + nid = n.id + deps: List[str] = n.dependencies + uniq_deps: Set[str] = set(deps) + for d in uniq_deps: + if d not in id_to_node: + raise ValueError( + f"The dependency node id {d} of node {nid} is not defined in the configuration." + ) + indeg[nid] += 1 + adj[d].append(nid) + + zero_deg: deque = deque( + [id_to_node[nid] for nid, deg in indeg.items() if deg == 0] + ) + sorted_nodes: List[Node] = [] + + while zero_deg: + cur = zero_deg.popleft() + sorted_nodes.append(cur) + cur_id = cur.id + for nb_id in adj.get(cur_id, []): + indeg[nb_id] -= 1 + if indeg[nb_id] == 0: + zero_deg.append(id_to_node[nb_id]) + + if len(sorted_nodes) != len(nodes): + remaining = [nid for nid, deg in indeg.items() if deg > 0] + raise ValueError( + f"The configuration contains cycles, unable to execute. Remaining nodes with indegree > 0: {remaining}" + ) + + return sorted_nodes + + def _get_input_dataset( + self, node: Node, initial_ds: ray.data.Dataset + ) -> ray.data.Dataset: + deps = node.dependencies + + if not deps: + return initial_ds + + if len(deps) == 1: + return self.datasets[deps[0]] + + main_ds = self.datasets[deps[0]] + other_dss = [self.datasets[d] for d in deps[1:]] + return main_ds.union(*other_dss) + + def _execute_node(self, node: Node, initial_ds: ray.data.Dataset): + def _filter_kwargs( + func_or_class: Callable, + global_params: Dict[str, Any], + func_params: Dict[str, Any], + ) -> Dict[str, Any]: + """ + 1. global_params: only when specified in function signature, will be passed + 2. 
func_params: pass specified params first, then **kwargs if exists + """ + try: + sig = inspect.signature(func_or_class) + except ValueError: + return {} + + params = sig.parameters + final_kwargs = {} + + has_var_keywords = any( + p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values() + ) + valid_keys = set(params.keys()) + for k, v in global_params.items(): + if k in valid_keys: + final_kwargs[k] = v + + for k, v in func_params.items(): + if k in valid_keys or has_var_keywords: + final_kwargs[k] = v + return final_kwargs + + if node.op_name not in self.functions: + raise ValueError(f"Operator {node.op_name} not found for node {node.id}") + + op_handler = self.functions[node.op_name] + node_params = _filter_kwargs(op_handler, self.global_params, node.params or {}) + + if node.type == "source": + self.datasets[node.id] = op_handler(**node_params) + return + + input_ds = self._get_input_dataset(node, initial_ds) + + if inspect.isclass(op_handler): + execution_params = node.execution_params or {} + replicas = execution_params.get("replicas", 1) + batch_size = ( + int(execution_params.get("batch_size")) + if "batch_size" in execution_params + else "default" + ) + compute_resources = execution_params.get("compute_resources", {}) + + if node.type == "aggregate": + self.datasets[node.id] = input_ds.repartition(1).map_batches( + op_handler, + compute=ray.data.ActorPoolStrategy(min_size=1, max_size=1), + batch_size=None, # aggregate processes the whole dataset at once + num_gpus=compute_resources.get("num_gpus", 0) + if compute_resources + else 0, + fn_constructor_kwargs=node_params, + batch_format="pandas", + ) + else: + # others like map, filter, flatmap, map_batch let actors process data inside batches + self.datasets[node.id] = input_ds.map_batches( + op_handler, + compute=ray.data.ActorPoolStrategy(min_size=1, max_size=replicas), + batch_size=batch_size, + num_gpus=compute_resources.get("num_gpus", 0) + if compute_resources + else 0, + fn_constructor_kwargs=node_params, + batch_format="pandas", + ) + + else: + + @wraps(op_handler) + def func_wrapper(row_or_batch: Dict[str, Any]) -> Dict[str, Any]: + return op_handler(row_or_batch, **node_params) + + if node.type == "map": + self.datasets[node.id] = input_ds.map(func_wrapper) + elif node.type == "filter": + self.datasets[node.id] = input_ds.filter(func_wrapper) + elif node.type == "flatmap": + self.datasets[node.id] = input_ds.flat_map(func_wrapper) + elif node.type == "aggregate": + self.datasets[node.id] = input_ds.repartition(1).map_batches( + func_wrapper, batch_format="default" + ) + elif node.type == "map_batch": + self.datasets[node.id] = input_ds.map_batches(func_wrapper) + else: + raise ValueError( + f"Unsupported node type {node.type} for node {node.id}" + ) + + @staticmethod + def _find_leaf_nodes(nodes: List[Node]) -> Set[str]: + all_ids = {n.id for n in nodes} + deps_set = set() + for n in nodes: + deps_set.update(n.dependencies) + return all_ids - deps_set + + def execute(self, initial_ds: ray.data.Dataset) -> Dict[str, ray.data.Dataset]: + sorted_nodes = self._topo_sort(self.config.nodes) + + for node in sorted_nodes: + self._execute_node(node, initial_ds) + + leaf_nodes = self._find_leaf_nodes(sorted_nodes) + + @ray.remote + def _fetch_result(ds: ray.data.Dataset) -> List[Any]: + return ds.take_all() + + return {node_id: self.datasets[node_id] for node_id in leaf_nodes} diff --git a/graphgen/evaluate.py b/graphgen/evaluate.py deleted file mode 100644 index da74a308..00000000 --- a/graphgen/evaluate.py +++ /dev/null @@ 
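The new `graphgen/engine.py` above is easiest to follow with a small end-to-end sketch. The following is a minimal, illustrative example and is not part of this diff; it assumes `Config` and `Node` from `graphgen.bases` accept the fields the engine reads (`global_params`, `nodes`, and per-node `id`, `op_name`, `type`, `dependencies`, `params`), and the operator name `double_value` is made up for illustration.

```python
import ray
import ray.data

from graphgen.engine import Engine


# Illustrative operator for a "map" node: plain functions receive one row plus
# any node params that match their signature (see _filter_kwargs above).
def double_value(row, factor=2):
    row["value"] = row["value"] * factor
    return row


config = {
    "global_params": {},  # forwarded to operators that declare matching parameters
    "nodes": [
        {
            "id": "double",
            "op_name": "double_value",
            "type": "map",       # map / filter / flatmap / aggregate / map_batch
            "dependencies": [],  # no dependencies -> receives the initial dataset
            "params": {"factor": 3},
        }
    ],
}

engine = Engine(config, functions={"double_value": double_value})
initial_ds = ray.data.from_items([{"value": i} for i in range(5)])

# execute() returns only the leaf nodes of the DAG, as a dict of node id -> Ray Dataset.
outputs = engine.execute(initial_ds)
print(outputs["double"].take_all())
```

Class-based operators take a different path: they are run through `map_batches` with an `ActorPoolStrategy`, sized by each node's `execution_params` (`replicas`, `batch_size`, `compute_resources`).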
-1,142 +0,0 @@ -"""Evaluate the quality of the generated text using various metrics""" - -import os -import json -import argparse -import pandas as pd -from dotenv import load_dotenv -from .models import LengthEvaluator, MTLDEvaluator, RewardEvaluator, TextPair, UniEvaluator -from .utils import logger, set_logger - -sys_path = os.path.abspath(os.path.dirname(__file__)) -set_logger(os.path.join(sys_path, "cache", "logs", "evaluate.log")) - -load_dotenv() - -def evaluate_length(corpus, tokenizer_name): - length_evaluator = LengthEvaluator( - tokenizer_name=tokenizer_name - ) - logger.info("Length evaluator loaded") - scores = length_evaluator.get_average_score(corpus) - logger.info("Length scores: %s", scores) - return scores - -def evaluate_mtld(corpus): - mtld_evaluator = MTLDEvaluator() - logger.info("MTLD evaluator loaded") - scores = mtld_evaluator.get_average_score(corpus) - logger.info("MTLD scores: %s", scores) - min_max_scores = mtld_evaluator.get_min_max_score(corpus) - logger.info("MTLD min max scores: %s", min_max_scores) - return scores, min_max_scores - -def evaluate_reward(corpus, reward_model_names): - scores = [] - for reward_name in reward_model_names: - reward_evaluator = RewardEvaluator( - reward_name=reward_name - ) - logger.info("Loaded reward model: %s", reward_name) - average_score = reward_evaluator.get_average_score(corpus) - logger.info("%s scores: %s", reward_name, average_score) - min_max_scores = reward_evaluator.get_min_max_score(corpus) - logger.info("%s min max scores: %s", reward_name, min_max_scores) - scores.append({ - 'reward_name': reward_name.split('/')[-1], - 'score': average_score, - 'min_max_scores': min_max_scores - }) - del reward_evaluator - clean_gpu_cache() - return scores - -def evaluate_uni(corpus, uni_model_name): - uni_evaluator = UniEvaluator( - model_name=uni_model_name - ) - logger.info("Uni evaluator loaded with model %s", uni_model_name) - uni_scores = uni_evaluator.get_average_score(corpus) - for key, value in uni_scores.items(): - logger.info("Uni %s scores: %s", key, value) - min_max_scores = uni_evaluator.get_min_max_score(corpus) - for key, value in min_max_scores.items(): - logger.info("Uni %s min max scores: %s", key, value) - del uni_evaluator - clean_gpu_cache() - return (uni_scores['naturalness'], uni_scores['coherence'], uni_scores['understandability'], - min_max_scores['naturalness'], min_max_scores['coherence'], min_max_scores['understandability']) - - -def clean_gpu_cache(): - import torch - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - -if __name__ == '__main__': - import torch.multiprocessing as mp - parser = argparse.ArgumentParser() - - parser.add_argument('--folder', type=str, default='cache/data', help='folder to load data') - parser.add_argument('--output', type=str, default='cache/output', help='path to save output') - - parser.add_argument('--tokenizer', type=str, default='cl100k_base', help='tokenizer name') - parser.add_argument('--reward', type=str, default='OpenAssistant/reward-model-deberta-v3-large-v2', - help='Comma-separated list of reward models') - parser.add_argument('--uni', type=str, default='MingZhong/unieval-sum', help='uni model name') - - args = parser.parse_args() - - if not os.path.exists(args.folder): - raise ValueError(f"Folder {args.folder} does not exist") - - if not os.path.exists(args.output): - os.makedirs(args.output) - - reward_models = args.reward.split(',') - - - results = [] - - logger.info("Data loaded from %s", args.folder) - mp.set_start_method('spawn') - - for file 
in os.listdir(args.folder): - if file.endswith('.json'): - logger.info("Processing %s", file) - with open(os.path.join(args.folder, file), 'r', encoding='utf-8') as f: - data = json.load(f) - data = [TextPair( - question=data[key]['question'], - answer=data[key]['answer'] - ) for key in data] - - length_scores = evaluate_length(data, args.tokenizer) - mtld_scores, min_max_mtld_scores = evaluate_mtld(data) - reward_scores = evaluate_reward(data, reward_models) - uni_naturalness_scores, uni_coherence_scores, uni_understandability_scores, \ - min_max_uni_naturalness_scores, min_max_uni_coherence_scores, min_max_uni_understandability_scores \ - = evaluate_uni(data, args.uni) - - result = { - 'file': file, - 'number': len(data), - 'length': length_scores, - 'mtld': mtld_scores, - 'mtld_min_max': min_max_mtld_scores, - 'uni_naturalness': uni_naturalness_scores, - 'uni_coherence': uni_coherence_scores, - 'uni_understandability': uni_understandability_scores, - 'uni_naturalness_min_max': min_max_uni_naturalness_scores, - 'uni_coherence_min_max': min_max_uni_coherence_scores, - 'uni_understandability_min_max': min_max_uni_understandability_scores - } - for reward_score in reward_scores: - result[reward_score['reward_name']] = reward_score['score'] - result[f"{reward_score['reward_name']}_min_max"] = reward_score['min_max_scores'] - - results.append(result) - - results = pd.DataFrame(results) - results.to_csv(os.path.join(args.output, 'evaluation.csv'), index=False) diff --git a/graphgen/generate.py b/graphgen/generate.py deleted file mode 100644 index 14693471..00000000 --- a/graphgen/generate.py +++ /dev/null @@ -1,101 +0,0 @@ -import os -import json -import time -import argparse -from importlib.resources import files -import yaml -from dotenv import load_dotenv - -from .graphgen import GraphGen -from .models import OpenAIModel, Tokenizer, TraverseStrategy -from .utils import set_logger - -sys_path = os.path.abspath(os.path.dirname(__file__)) - -load_dotenv() - -def set_working_dir(folder): - os.makedirs(folder, exist_ok=True) - os.makedirs(os.path.join(folder, "data", "graphgen"), exist_ok=True) - os.makedirs(os.path.join(folder, "logs"), exist_ok=True) - -def save_config(config_path, global_config): - if not os.path.exists(os.path.dirname(config_path)): - os.makedirs(os.path.dirname(config_path)) - with open(config_path, "w", encoding='utf-8') as config_file: - yaml.dump(global_config, config_file, default_flow_style=False, allow_unicode=True) - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--config_file', - help='Config parameters for GraphGen.', - # default=os.path.join(sys_path, "configs", "graphgen_config.yaml"), - default=files('graphgen').joinpath("configs", "graphgen_config.yaml"), - type=str) - parser.add_argument('--output_dir', - help='Output directory for GraphGen.', - default=sys_path, - required=True, - type=str) - - args = parser.parse_args() - - working_dir = args.output_dir - set_working_dir(working_dir) - unique_id = int(time.time()) - set_logger(os.path.join(working_dir, "logs", f"graphgen_{unique_id}.log"), if_stream=False) - - with open(args.config_file, "r", encoding='utf-8') as f: - config = yaml.load(f, Loader=yaml.FullLoader) - - input_file = config['input_file'] - - if config['data_type'] == 'raw': - with open(input_file, "r", encoding='utf-8') as f: - data = [json.loads(line) for line in f] - elif config['data_type'] == 'chunked': - with open(input_file, "r", encoding='utf-8') as f: - data = json.load(f) - else: - raise ValueError(f"Invalid 
data type: {config['data_type']}") - - synthesizer_llm_client = OpenAIModel( - model_name=os.getenv("SYNTHESIZER_MODEL"), - api_key=os.getenv("SYNTHESIZER_API_KEY"), - base_url=os.getenv("SYNTHESIZER_BASE_URL") - ) - trainee_llm_client = OpenAIModel( - model_name=os.getenv("TRAINEE_MODEL"), - api_key=os.getenv("TRAINEE_API_KEY"), - base_url=os.getenv("TRAINEE_BASE_URL") - ) - - traverse_strategy = TraverseStrategy( - **config['traverse_strategy'] - ) - - graph_gen = GraphGen( - working_dir=working_dir, - unique_id=unique_id, - synthesizer_llm_client=synthesizer_llm_client, - trainee_llm_client=trainee_llm_client, - if_web_search=config['web_search'], - tokenizer_instance=Tokenizer( - model_name=config['tokenizer'] - ), - traverse_strategy=traverse_strategy - ) - - graph_gen.insert(data, config['data_type']) - - graph_gen.quiz(max_samples=config['quiz_samples']) - - graph_gen.judge(re_judge=config["re_judge"]) - - graph_gen.traverse() - - path = os.path.join(working_dir, "data", "graphgen", str(unique_id), f"config-{unique_id}.yaml") - save_config(path, config) - -if __name__ == '__main__': - main() diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py deleted file mode 100644 index 265d32a9..00000000 --- a/graphgen/graphgen.py +++ /dev/null @@ -1,260 +0,0 @@ -# Adapt from https://github.com/HKUDS/LightRAG - -import asyncio -import os -import time -from dataclasses import dataclass, field -from typing import List, Union, cast - -import gradio as gr -from tqdm.asyncio import tqdm as tqdm_async - -from .models import ( - Chunk, - JsonKVStorage, - NetworkXStorage, - OpenAIModel, - Tokenizer, - TraverseStrategy, - WikiSearch, -) -from .models.storage.base_storage import StorageNameSpace -from .operators import ( - extract_kg, - judge_statement, - quiz, - search_wikipedia, - skip_judge_statement, - traverse_graph_atomically, - traverse_graph_by_edge, - traverse_graph_for_multi_hop, -) -from .utils import compute_content_hash, create_event_loop, logger - -sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) - -@dataclass -class GraphGen: - unique_id: int = int(time.time()) - working_dir: str = os.path.join(sys_path, "cache") - - # text chunking - chunk_size: int = 1024 - chunk_overlap_size: int = 100 - - # llm - synthesizer_llm_client: OpenAIModel = None - trainee_llm_client: OpenAIModel = None - tokenizer_instance: Tokenizer = None - - # web search - if_web_search: bool = False - wiki_client: WikiSearch = field(default_factory=WikiSearch) - - # traverse strategy - traverse_strategy: TraverseStrategy = field(default_factory=TraverseStrategy) - - # webui - progress_bar: gr.Progress = None - - def __post_init__(self): - self.full_docs_storage: JsonKVStorage = JsonKVStorage( - self.working_dir, namespace="full_docs" - ) - self.text_chunks_storage: JsonKVStorage = JsonKVStorage( - self.working_dir, namespace="text_chunks" - ) - self.wiki_storage: JsonKVStorage = JsonKVStorage( - self.working_dir, namespace="wiki" - ) - self.graph_storage: NetworkXStorage = NetworkXStorage( - self.working_dir, namespace="graph" - ) - self.rephrase_storage: JsonKVStorage = JsonKVStorage( - self.working_dir, namespace="rephrase" - ) - self.qa_storage: JsonKVStorage = JsonKVStorage( - os.path.join(self.working_dir, "data", "graphgen", str(self.unique_id)), namespace=f"qa-{self.unique_id}" - ) - - async def async_split_chunks(self, data: Union[List[list], List[dict]], data_type: str) -> dict: - # TODO: 是否进行指代消解 - if len(data) == 0: - return {} - - new_docs = {} - inserting_chunks = {} - if 
data_type == "raw": - assert isinstance(data, list) and isinstance(data[0], dict) - # compute hash for each document - new_docs = { - compute_content_hash(doc['content'], prefix="doc-"): {'content': doc['content']} for doc in data - } - _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys())) - new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} - if len(new_docs) == 0: - logger.warning("All docs are already in the storage") - return {} - logger.info("[New Docs] inserting %d docs", len(new_docs)) - - cur_index = 1 - doc_number = len(new_docs) - async for doc_key, doc in tqdm_async( - new_docs.items(), desc="[1/4]Chunking documents", unit="doc" - ): - chunks = { - compute_content_hash(dp["content"], prefix="chunk-"): { - **dp, - 'full_doc_id': doc_key - } for dp in self.tokenizer_instance.chunk_by_token_size(doc["content"], - self.chunk_overlap_size, self.chunk_size) - } - inserting_chunks.update(chunks) - - if self.progress_bar is not None: - self.progress_bar( - cur_index / doc_number, f"Chunking {doc_key}" - ) - cur_index += 1 - - _add_chunk_keys = await self.text_chunks_storage.filter_keys(list(inserting_chunks.keys())) - inserting_chunks = {k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys} - elif data_type == "chunked": - assert isinstance(data, list) and isinstance(data[0], list) - new_docs = { - compute_content_hash("".join(chunk['content']), prefix="doc-"): {'content': "".join(chunk['content'])} - for doc in data for chunk in doc - } - _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys())) - new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} - if len(new_docs) == 0: - logger.warning("All docs are already in the storage") - return {} - logger.info("[New Docs] inserting %d docs", len(new_docs)) - async for doc in tqdm_async(data, desc="[1/4]Chunking documents", unit="doc"): - doc_str = "".join([chunk['content'] for chunk in doc]) - for chunk in doc: - chunk_key = compute_content_hash(chunk['content'], prefix="chunk-") - inserting_chunks[chunk_key] = { - **chunk, - 'full_doc_id': compute_content_hash(doc_str, prefix="doc-") - } - _add_chunk_keys = await self.text_chunks_storage.filter_keys(list(inserting_chunks.keys())) - inserting_chunks = {k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys} - - await self.full_docs_storage.upsert(new_docs) - await self.text_chunks_storage.upsert(inserting_chunks) - - return inserting_chunks - - def insert(self, data: Union[List[list], List[dict]], data_type: str): - loop = create_event_loop() - loop.run_until_complete(self.async_insert(data, data_type)) - - async def async_insert(self, data: Union[List[list], List[dict]], data_type: str): - """ - - insert chunks into the graph - """ - - inserting_chunks = await self.async_split_chunks(data, data_type) - - if len(inserting_chunks) == 0: - logger.warning("All chunks are already in the storage") - return - logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks)) - - logger.info("[Entity and Relation Extraction]...") - _add_entities_and_relations = await extract_kg( - llm_client=self.synthesizer_llm_client, - kg_instance=self.graph_storage, - tokenizer_instance=self.tokenizer_instance, - chunks=[Chunk(id=k, content=v['content']) for k, v in inserting_chunks.items()], - progress_bar = self.progress_bar, - ) - if not _add_entities_and_relations: - logger.warning("No entities or relations extracted") - return - - logger.info("[Wiki Search] is %s", 'enabled' if self.if_web_search 
else 'disabled') - if self.if_web_search: - logger.info("[Wiki Search]...") - _add_wiki_data = await search_wikipedia( - llm_client= self.synthesizer_llm_client, - wiki_search_client=self.wiki_client, - knowledge_graph_instance=_add_entities_and_relations - ) - await self.wiki_storage.upsert(_add_wiki_data) - - await self._insert_done() - - async def _insert_done(self): - tasks = [] - for storage_instance in [self.full_docs_storage, self.text_chunks_storage, - self.graph_storage, self.wiki_storage]: - if storage_instance is None: - continue - tasks.append(cast(StorageNameSpace, storage_instance).index_done_callback()) - await asyncio.gather(*tasks) - - def quiz(self, max_samples=1): - loop = create_event_loop() - loop.run_until_complete(self.async_quiz(max_samples)) - - async def async_quiz(self, max_samples=1): - await quiz(self.synthesizer_llm_client, self.graph_storage, self.rephrase_storage, max_samples) - await self.rephrase_storage.index_done_callback() - - def judge(self, re_judge=False, skip=False): - loop = create_event_loop() - loop.run_until_complete(self.async_judge(re_judge, skip)) - - async def async_judge(self, re_judge=False, skip=False): - if skip: - _update_relations = await skip_judge_statement(self.graph_storage) - else: - _update_relations = await judge_statement(self.trainee_llm_client, self.graph_storage, - self.rephrase_storage, re_judge) - await _update_relations.index_done_callback() - - def traverse(self): - loop = create_event_loop() - loop.run_until_complete(self.async_traverse()) - - async def async_traverse(self): - if self.traverse_strategy.qa_form == "atomic": - results = await traverse_graph_atomically(self.synthesizer_llm_client, - self.tokenizer_instance, - self.graph_storage, - self.traverse_strategy, - self.text_chunks_storage, - self.progress_bar) - elif self.traverse_strategy.qa_form == "multi_hop": - results = await traverse_graph_for_multi_hop(self.synthesizer_llm_client, - self.tokenizer_instance, - self.graph_storage, - self.traverse_strategy, - self.text_chunks_storage, - self.progress_bar) - elif self.traverse_strategy.qa_form == "aggregated": - results = await traverse_graph_by_edge(self.synthesizer_llm_client, self.tokenizer_instance, - self.graph_storage, self.traverse_strategy, self.text_chunks_storage, - self.progress_bar) - else: - raise ValueError(f"Unknown qa_form: {self.traverse_strategy.qa_form}") - await self.qa_storage.upsert(results) - await self.qa_storage.index_done_callback() - - def clear(self): - loop = create_event_loop() - loop.run_until_complete(self.async_clear()) - - async def async_clear(self): - await self.full_docs_storage.drop() - await self.text_chunks_storage.drop() - await self.wiki_storage.drop() - await self.graph_storage.clear() - await self.rephrase_storage.drop() - await self.qa_storage.drop() - - logger.info("All caches are cleared") diff --git a/graphgen/judge.py b/graphgen/judge.py deleted file mode 100644 index f05bdf1d..00000000 --- a/graphgen/judge.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import argparse -import asyncio -from dotenv import load_dotenv - -from .models import NetworkXStorage, JsonKVStorage, OpenAIModel -from .operators import judge_statement - -sys_path = os.path.abspath(os.path.dirname(__file__)) - -load_dotenv() - -def calculate_average_loss(graph: NetworkXStorage): - """ - Calculate the average loss of the graph. 
- - :param graph: NetworkXStorage - :return: float - """ - edges = asyncio.run(graph.get_all_edges()) - total_loss = 0 - for edge in edges: - total_loss += edge[2]['loss'] - return total_loss / len(edges) - - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--input', type=str, default=os.path.join(sys_path, "cache"), help='path to load input graph') - parser.add_argument('--output', type=str, default='cache/output/new_graph.graphml', help='path to save output') - - args = parser.parse_args() - - llm_client = OpenAIModel( - model_name=os.getenv("TRAINEE_MODEL"), - api_key=os.getenv("TRAINEE_API_KEY"), - base_url=os.getenv("TRAINEE_BASE_URL") - ) - - graph_storage = NetworkXStorage( - args.input, - namespace="graph" - ) - average_loss = calculate_average_loss(graph_storage) - print(f"Average loss of the graph: {average_loss}") - - rephrase_storage = JsonKVStorage( - os.path.join(sys_path, "cache"), - namespace="rephrase" - ) - - new_graph = asyncio.run(judge_statement(llm_client, graph_storage, rephrase_storage, re_judge=True)) - - graph_file = asyncio.run(graph_storage.get_graph()) - - new_graph.write_nx_graph(graph_file, args.output) - - average_loss = calculate_average_loss(new_graph) - print(f"Average loss of the graph: {average_loss}") diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index c2f9e714..4606715b 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -1,41 +1,43 @@ -from .text.chunk import Chunk -from .text.text_pair import TextPair - -from .llm.topk_token_model import Token, TopkTokenModel -from .llm.openai_model import OpenAIModel -from .llm.tokenizer import Tokenizer - -from .storage.networkx_storage import NetworkXStorage -from .storage.json_storage import JsonKVStorage - -from .search.wiki_search import WikiSearch - -from .evaluate.length_evaluator import LengthEvaluator -from .evaluate.mtld_evaluator import MTLDEvaluator -from .evaluate.reward_evaluator import RewardEvaluator -from .evaluate.uni_evaluator import UniEvaluator - -from .strategy.travserse_strategy import TraverseStrategy - - -__all__ = [ - # llm models - "OpenAIModel", - "TopkTokenModel", - "Token", - "Tokenizer", - # storage models - "Chunk", - "NetworkXStorage", - "JsonKVStorage", - # search models - "WikiSearch", - # evaluate models - "TextPair", - "LengthEvaluator", - "MTLDEvaluator", - "RewardEvaluator", - "UniEvaluator", - # strategy models - "TraverseStrategy", -] +from .evaluator import LengthEvaluator, MTLDEvaluator, RewardEvaluator, UniEvaluator +from .generator import ( + AggregatedGenerator, + AtomicGenerator, + CoTGenerator, + MultiHopGenerator, + OmicsQAGenerator, + QuizGenerator, + VQAGenerator, +) +from .kg_builder import LightRAGKGBuilder, MMKGBuilder, OmicsKGBuilder +from .llm import HTTPClient, OllamaClient, OpenAIClient +from .partitioner import ( + AnchorBFSPartitioner, + BFSPartitioner, + DFSPartitioner, + ECEPartitioner, + LeidenPartitioner, +) +from .reader import ( + CSVReader, + JSONReader, + ParquetReader, + PDFReader, + PickleReader, + RDFReader, + TXTReader, +) +from .searcher.db.ncbi_searcher import NCBISearch +from .searcher.db.rnacentral_searcher import RNACentralSearch +from .searcher.db.uniprot_searcher import UniProtSearch +from .searcher.kg.wiki_search import WikiSearch +from .searcher.web.bing_search import BingSearch +from .searcher.web.google_search import GoogleSearch +from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter +from .storage import ( + 
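+    # KV and graph storage backends (JSON, RocksDB, NetworkX, Kuzu)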
JsonKVStorage, + KuzuStorage, + NetworkXStorage, + RocksDBCache, + RocksDBKVStorage, +) +from .tokenizer import Tokenizer diff --git a/graphgen/models/embed/embedding.py b/graphgen/models/embed/embedding.py deleted file mode 100644 index 8213b90f..00000000 --- a/graphgen/models/embed/embedding.py +++ /dev/null @@ -1,29 +0,0 @@ -from dataclasses import dataclass -import asyncio -import numpy as np - -class UnlimitedSemaphore: - """A context manager that allows unlimited access.""" - - async def __aenter__(self): - pass - - async def __aexit__(self, exc_type, exc, tb): - pass - -@dataclass -class EmbeddingFunc: - embedding_dim: int - max_token_size: int - func: callable - concurrent_limit: int = 16 - - def __post_init__(self): - if self.concurrent_limit != 0: - self._semaphore = asyncio.Semaphore(self.concurrent_limit) - else: - self._semaphore = UnlimitedSemaphore() - - async def __call__(self, *args, **kwargs) -> np.ndarray: - async with self._semaphore: - return await self.func(*args, **kwargs) diff --git a/graphgen/models/evaluate/length_evaluator.py b/graphgen/models/evaluate/length_evaluator.py deleted file mode 100644 index ba53ff6b..00000000 --- a/graphgen/models/evaluate/length_evaluator.py +++ /dev/null @@ -1,22 +0,0 @@ -from dataclasses import dataclass -from graphgen.models.evaluate.base_evaluator import BaseEvaluator -from graphgen.models.llm.tokenizer import Tokenizer -from graphgen.models.text.text_pair import TextPair -from graphgen.utils import create_event_loop - - -@dataclass -class LengthEvaluator(BaseEvaluator): - tokenizer_name: str = "cl100k_base" - def __post_init__(self): - self.tokenizer = Tokenizer( - model_name=self.tokenizer_name - ) - - async def evaluate_single(self, pair: TextPair) -> float: - loop = create_event_loop() - return await loop.run_in_executor(None, self._calculate_length, pair.answer) - - def _calculate_length(self, text: str) -> float: - tokens = self.tokenizer.encode_string(text) - return len(tokens) diff --git a/graphgen/models/evaluator/__init__.py b/graphgen/models/evaluator/__init__.py new file mode 100644 index 00000000..a9b445b4 --- /dev/null +++ b/graphgen/models/evaluator/__init__.py @@ -0,0 +1,4 @@ +from .length_evaluator import LengthEvaluator +from .mtld_evaluator import MTLDEvaluator +from .reward_evaluator import RewardEvaluator +from .uni_evaluator import UniEvaluator diff --git a/graphgen/models/evaluate/base_evaluator.py b/graphgen/models/evaluator/base_evaluator.py similarity index 55% rename from graphgen/models/evaluate/base_evaluator.py rename to graphgen/models/evaluator/base_evaluator.py index 6c5ae2d5..e93ea262 100644 --- a/graphgen/models/evaluate/base_evaluator.py +++ b/graphgen/models/evaluator/base_evaluator.py @@ -1,22 +1,32 @@ import asyncio -from dataclasses import dataclass from tqdm.asyncio import tqdm as tqdm_async + +from graphgen.bases.datatypes import QAPair from graphgen.utils import create_event_loop -from graphgen.models.text.text_pair import TextPair -@dataclass + class BaseEvaluator: - max_concurrent: int = 100 - results: list[float] = None + def __init__(self, max_concurrent: int = 100): + self.max_concurrent = max_concurrent + self.results: list[float] = None - def evaluate(self, pairs: list[TextPair]) -> list[float]: + def evaluate(self, pairs: list[QAPair]) -> list[float]: """ Evaluate the text and return a score. 
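+        Runs async_evaluate on a newly created event loop when needed; calling it from a
+        running loop raises RuntimeError, so use async_evaluate directly in async code.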
""" - return create_event_loop().run_until_complete(self.async_evaluate(pairs)) - - async def async_evaluate(self, pairs: list[TextPair]) -> list[float]: + loop, created = create_event_loop() + try: + if loop.is_running(): + raise RuntimeError( + "Cannot use evaluate when event loop is already running." + ) + return loop.run_until_complete(self.async_evaluate(pairs)) + finally: + if created: + loop.close() + + async def async_evaluate(self, pairs: list[QAPair]) -> list[float]: semaphore = asyncio.Semaphore(self.max_concurrent) async def evaluate_with_semaphore(pair): @@ -31,10 +41,10 @@ async def evaluate_with_semaphore(pair): results.append(await result) return results - async def evaluate_single(self, pair: TextPair) -> float: + async def evaluate_single(self, pair: QAPair) -> float: raise NotImplementedError() - def get_average_score(self, pairs: list[TextPair]) -> float: + def get_average_score(self, pairs: list[QAPair]) -> float: """ Get the average score of a batch of texts. """ @@ -42,7 +52,7 @@ def get_average_score(self, pairs: list[TextPair]) -> float: self.results = results return sum(self.results) / len(pairs) - def get_min_max_score(self, pairs: list[TextPair]) -> tuple[float, float]: + def get_min_max_score(self, pairs: list[QAPair]) -> tuple[float, float]: """ Get the min and max score of a batch of texts. """ diff --git a/graphgen/models/evaluator/length_evaluator.py b/graphgen/models/evaluator/length_evaluator.py new file mode 100644 index 00000000..9fbc6f3c --- /dev/null +++ b/graphgen/models/evaluator/length_evaluator.py @@ -0,0 +1,21 @@ +import asyncio + +from graphgen.bases.datatypes import QAPair +from graphgen.models.evaluator.base_evaluator import BaseEvaluator +from graphgen.models.tokenizer import Tokenizer + + +class LengthEvaluator(BaseEvaluator): + def __init__(self, tokenizer_name: str = "cl100k_base", max_concurrent: int = 100): + super().__init__(max_concurrent) + self.tokenizer_name = tokenizer_name + self.tokenizer = Tokenizer(model_name=self.tokenizer_name) + + async def evaluate_single(self, pair: QAPair) -> float: + # In async context, we should use the running loop + loop = asyncio.get_running_loop() + return await loop.run_in_executor(None, self._calculate_length, pair.answer) + + def _calculate_length(self, text: str) -> float: + tokens = self.tokenizer.encode(text) + return len(tokens) diff --git a/graphgen/models/evaluate/mtld_evaluator.py b/graphgen/models/evaluator/mtld_evaluator.py similarity index 74% rename from graphgen/models/evaluate/mtld_evaluator.py rename to graphgen/models/evaluator/mtld_evaluator.py index 4ea68875..3423425b 100644 --- a/graphgen/models/evaluate/mtld_evaluator.py +++ b/graphgen/models/evaluator/mtld_evaluator.py @@ -1,23 +1,26 @@ -from dataclasses import dataclass, field from typing import Set -from graphgen.models.evaluate.base_evaluator import BaseEvaluator -from graphgen.models.text.text_pair import TextPair -from graphgen.utils import detect_main_language, NLTKHelper, create_event_loop - +from graphgen.bases.datatypes import QAPair +from graphgen.models.evaluator.base_evaluator import BaseEvaluator +from graphgen.utils import NLTKHelper, detect_main_language nltk_helper = NLTKHelper() -@dataclass + class MTLDEvaluator(BaseEvaluator): """ 衡量文本词汇多样性的指标 """ - stopwords_en: Set[str] = field(default_factory=lambda: set(nltk_helper.get_stopwords("english"))) - stopwords_zh: Set[str] = field(default_factory=lambda: set(nltk_helper.get_stopwords("chinese"))) - async def evaluate_single(self, pair: TextPair) -> float: - loop 
= create_event_loop() + def __init__(self, max_concurrent: int = 100): + super().__init__(max_concurrent) + self.stopwords_en: Set[str] = set(nltk_helper.get_stopwords("english")) + self.stopwords_zh: Set[str] = set(nltk_helper.get_stopwords("chinese")) + + async def evaluate_single(self, pair: QAPair) -> float: + # In async context, we should use the running loop + import asyncio + loop = asyncio.get_running_loop() return await loop.run_in_executor(None, self._calculate_mtld_score, pair.answer) def _calculate_mtld_score(self, text: str, threshold=0.72) -> float: @@ -71,6 +74,6 @@ def _compute_factors(tokens: list, threshold: float) -> float: if ttr <= threshold: factors += 1 else: - factors += (1 - (ttr - threshold) / (1 - threshold)) + factors += 1 - (ttr - threshold) / (1 - threshold) return len(tokens) / factors if factors > 0 else len(tokens) diff --git a/graphgen/models/evaluate/reward_evaluator.py b/graphgen/models/evaluator/reward_evaluator.py similarity index 89% rename from graphgen/models/evaluate/reward_evaluator.py rename to graphgen/models/evaluator/reward_evaluator.py index 2e4c021c..4d2c2fb9 100644 --- a/graphgen/models/evaluate/reward_evaluator.py +++ b/graphgen/models/evaluator/reward_evaluator.py @@ -1,6 +1,8 @@ from dataclasses import dataclass + from tqdm import tqdm -from graphgen.models.text.text_pair import TextPair + +from graphgen.bases.datatypes import QAPair @dataclass @@ -9,19 +11,22 @@ class RewardEvaluator: Reward Model Evaluator. OpenAssistant/reward-model-deberta-v3-large-v2: 分数范围为[-inf, inf],越高越好 """ + reward_name: str = "OpenAssistant/reward-model-deberta-v3-large-v2" max_length: int = 2560 results: list[float] = None def __post_init__(self): import torch + self.num_gpus = torch.cuda.device_count() @staticmethod def process_chunk(rank, pairs, reward_name, max_length, return_dict): import torch from transformers import AutoModelForSequenceClassification, AutoTokenizer - device = f'cuda:{rank}' + + device = f"cuda:{rank}" torch.cuda.set_device(rank) rank_model = AutoModelForSequenceClassification.from_pretrained(reward_name) @@ -37,7 +42,7 @@ def process_chunk(rank, pairs, reward_name, max_length, return_dict): pair.answer, return_tensors="pt", max_length=max_length, - truncation=True + truncation=True, ) inputs = {k: v.to(device) for k, v in inputs.items()} score = rank_model(**inputs).logits[0].item() @@ -45,8 +50,9 @@ def process_chunk(rank, pairs, reward_name, max_length, return_dict): return_dict[rank] = results - def evaluate(self, pairs: list[TextPair]) -> list[float]: + def evaluate(self, pairs: list[QAPair]) -> list[float]: import torch.multiprocessing as mp + chunk_size = len(pairs) // self.num_gpus chunks = [] for i in range(self.num_gpus): @@ -64,7 +70,7 @@ def evaluate(self, pairs: list[TextPair]) -> list[float]: for rank, chunk in enumerate(chunks): p = mp.Process( target=self.process_chunk, - args=(rank, chunk, self.reward_name, self.max_length, return_dict) + args=(rank, chunk, self.reward_name, self.max_length, return_dict), ) p.start() processes.append(p) @@ -84,7 +90,7 @@ def evaluate(self, pairs: list[TextPair]) -> list[float]: return results - def get_average_score(self, pairs: list[TextPair]) -> float: + def get_average_score(self, pairs: list[QAPair]) -> float: """ Get the average score of a batch of texts. 
""" @@ -92,7 +98,7 @@ def get_average_score(self, pairs: list[TextPair]) -> float: self.results = results return sum(self.results) / len(pairs) - def get_min_max_score(self, pairs: list[TextPair]) -> tuple[float, float]: + def get_min_max_score(self, pairs: list[QAPair]) -> tuple[float, float]: """ Get the min and max score of a batch of texts. """ diff --git a/graphgen/models/evaluate/uni_evaluator.py b/graphgen/models/evaluator/uni_evaluator.py similarity index 70% rename from graphgen/models/evaluate/uni_evaluator.py rename to graphgen/models/evaluator/uni_evaluator.py index a334f0a9..20fa3517 100644 --- a/graphgen/models/evaluate/uni_evaluator.py +++ b/graphgen/models/evaluator/uni_evaluator.py @@ -1,40 +1,58 @@ # https://github.com/maszhongming/UniEval/tree/main from dataclasses import dataclass, field + from tqdm import tqdm -from graphgen.models.text.text_pair import TextPair + +from graphgen.bases.datatypes import QAPair def _add_questions(dimension: str, question: str, answer: str): if dimension == "naturalness": - cur_input = 'question: Is this a natural response in the dialogue? response: ' + answer + cur_input = ( + "question: Is this a natural response in the dialogue? response: " + + answer + ) elif dimension == "coherence": - cur_input = 'question: Is this a coherent response given the dialogue history? response: ' \ - + answer + ' dialogue history: ' + question + cur_input = ( + "question: Is this a coherent response given the dialogue history? response: " + + answer + + " dialogue history: " + + question + ) elif dimension == "understandability": - cur_input = 'question: Is this an understandable response in the dialogue? response: ' + answer + cur_input = ( + "question: Is this an understandable response in the dialogue? response: " + + answer + ) else: raise NotImplementedError( - 'The input format for this dimension is still undefined. Please customize it first.') + "The input format for this dimension is still undefined. Please customize it first." 
+ ) return cur_input + @dataclass class UniEvaluator: model_name: str = "MingZhong/unieval-sum" - dimensions: list = field(default_factory=lambda: ['naturalness', 'coherence', 'understandability']) + dimensions: list = field( + default_factory=lambda: ["naturalness", "coherence", "understandability"] + ) max_length: int = 2560 results: dict = None def __post_init__(self): import torch + self.num_gpus = torch.cuda.device_count() self.results = {} @staticmethod def process_chunk(rank, pairs, model_name, max_length, dimension, return_dict): import torch - from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - device = f'cuda:{rank}' + from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + + device = f"cuda:{rank}" torch.cuda.set_device(rank) rank_model = AutoModelForSeq2SeqLM.from_pretrained(model_name) @@ -59,26 +77,26 @@ def process_chunk(rank, pairs, model_name, max_length, dimension, return_dict): max_length=max_length, truncation=True, padding=True, - return_tensors='pt' + return_tensors="pt", ) encoded_tgt = tokenizer( tgt, max_length=max_length, truncation=True, padding=True, - return_tensors='pt' + return_tensors="pt", ) - src_tokens = encoded_src['input_ids'].to(device) - src_mask = encoded_src['attention_mask'].to(device) + src_tokens = encoded_src["input_ids"].to(device) + src_mask = encoded_src["attention_mask"].to(device) - tgt_tokens = encoded_tgt['input_ids'].to(device)[:, 0].unsqueeze(-1) + tgt_tokens = encoded_tgt["input_ids"].to(device)[:, 0].unsqueeze(-1) output = rank_model( input_ids=src_tokens, attention_mask=src_mask, labels=tgt_tokens, - use_cache = False + use_cache=False, ) logits = output.logits.view(-1, rank_model.config.vocab_size) @@ -91,8 +109,9 @@ def process_chunk(rank, pairs, model_name, max_length, dimension, return_dict): return_dict[rank] = results - def evaluate(self, pairs: list[TextPair]) -> list[dict]: + def evaluate(self, pairs: list[QAPair]) -> list[dict]: import torch.multiprocessing as mp + final_results = [] for dimension in self.dimensions: chunk_size = len(pairs) // self.num_gpus @@ -112,7 +131,14 @@ def evaluate(self, pairs: list[TextPair]) -> list[dict]: for rank, chunk in enumerate(chunks): p = mp.Process( target=self.process_chunk, - args=(rank, chunk, self.model_name, self.max_length, dimension, return_dict) + args=( + rank, + chunk, + self.model_name, + self.max_length, + dimension, + return_dict, + ), ) p.start() processes.append(p) @@ -130,12 +156,10 @@ def evaluate(self, pairs: list[TextPair]) -> list[dict]: p.terminate() p.join() - final_results.append({ - dimension: results - }) + final_results.append({dimension: results}) return final_results - def get_average_score(self, pairs: list[TextPair]) -> dict: + def get_average_score(self, pairs: list[QAPair]) -> dict: """ Get the average score of a batch of texts. """ @@ -147,7 +171,7 @@ def get_average_score(self, pairs: list[TextPair]) -> dict: self.results[key] = value return final_results - def get_min_max_score(self, pairs: list[TextPair]) -> dict: + def get_min_max_score(self, pairs: list[QAPair]) -> dict: """ Get the min and max score of a batch of texts. 
""" diff --git a/graphgen/models/extractor/__init__.py b/graphgen/models/extractor/__init__.py new file mode 100644 index 00000000..49db9fc2 --- /dev/null +++ b/graphgen/models/extractor/__init__.py @@ -0,0 +1 @@ +from .schema_guided_extractor import SchemaGuidedExtractor diff --git a/graphgen/models/extractor/key_information_extractor.py b/graphgen/models/extractor/key_information_extractor.py new file mode 100644 index 00000000..1a894fee --- /dev/null +++ b/graphgen/models/extractor/key_information_extractor.py @@ -0,0 +1 @@ +# TODO: text2json diff --git a/graphgen/models/extractor/schema_guided_extractor.py b/graphgen/models/extractor/schema_guided_extractor.py new file mode 100644 index 00000000..74801946 --- /dev/null +++ b/graphgen/models/extractor/schema_guided_extractor.py @@ -0,0 +1,108 @@ +import json +from typing import Dict, List + +from graphgen.bases import BaseExtractor, BaseLLMWrapper +from graphgen.templates import SCHEMA_GUIDED_EXTRACTION_PROMPT +from graphgen.utils import compute_dict_hash, detect_main_language, logger + + +class SchemaGuidedExtractor(BaseExtractor): + """ + Use JSON/YAML Schema or Pydantic Model to guide the LLM to extract structured information from text. + + Usage example: + schema = { + "type": "legal contract", + "description": "A legal contract for leasing property.", + "properties": { + "end_date": {"type": "string", "description": "The end date of the lease."}, + "leased_space": {"type": "string", "description": "Description of the space that is being leased."}, + "lessee": {"type": "string", "description": "The lessee's name (and possibly address)."}, + "lessor": {"type": "string", "description": "The lessor's name (and possibly address)."}, + "signing_date": {"type": "string", "description": "The date the contract was signed."}, + "start_date": {"type": "string", "description": "The start date of the lease."}, + "term_of_payment": {"type": "string", "description": "Description of the payment terms."}, + "designated_use": {"type": "string", + "description": "Description of the designated use of the property being leased."}, + "extension_period": {"type": "string", + "description": "Description of the extension options for the lease."}, + "expiration_date_of_lease": {"type": "string", "description": "The expiration data of the lease."} + }, + "required": ["lessee", "lessor", "start_date", "end_date"] + } + extractor = SchemaGuidedExtractor(llm_client, schema) + result = extractor.extract(text) + + """ + + def __init__(self, llm_client: BaseLLMWrapper, schema: dict): + super().__init__(llm_client) + self.schema = schema + self.required_keys = self.schema.get("required") + if not self.required_keys: + # If no required keys are specified, use all keys from the schema as default + self.required_keys = list(self.schema.get("properties", {}).keys()) + + def build_prompt(self, text: str) -> str: + schema_explanation = "" + for field, details in self.schema.get("properties", {}).items(): + description = details.get("description", "No description provided.") + schema_explanation += f'- "{field}": {description}\n' + + lang = detect_main_language(text) + + prompt = SCHEMA_GUIDED_EXTRACTION_PROMPT[lang].format( + field=self.schema.get("name", "the document"), + schema_explanation=schema_explanation, + examples="", + text=text, + ) + return prompt + + async def extract(self, chunk: dict) -> dict: + _chunk_id = chunk.get("_chunk_id", "") + text = chunk.get("content", "") + + prompt = self.build_prompt(text) + response = await 
self.llm_client.generate_answer(prompt) + try: + extracted_info = json.loads(response) + # Ensure all required keys are present + for key in self.required_keys: + if key not in extracted_info: + extracted_info[key] = "" + if any(extracted_info[key] == "" for key in self.required_keys): + logger.debug("Missing required keys in extraction: %s", extracted_info) + return {} + main_keys_info = {key: extracted_info[key] for key in self.required_keys} + logger.debug("Extracted info: %s", extracted_info) + + # add chunk metadata + extracted_info["_chunk_id"] = _chunk_id + + return { + compute_dict_hash(main_keys_info, prefix="extract-"): extracted_info + } + except json.JSONDecodeError: + logger.error("Failed to parse extraction response: %s", response) + return {} + + @staticmethod + def merge_extractions(extraction_list: List[Dict[str, dict]]) -> Dict[str, dict]: + """ + Merge multiple extraction results based on their hashes. + :param extraction_list: List of extraction results, each is a dict with hash as key and record as value. + :return: Merged extraction results. + """ + merged: Dict[str, dict] = {} + for ext in extraction_list: + for h, rec in ext.items(): + if h not in merged: + merged[h] = rec.copy() + else: + for k, v in rec.items(): + if k not in merged[h] or merged[h][k] == v: + merged[h][k] = v + else: + merged[h][k] = f"{merged[h][k]}{v}" + return merged diff --git a/graphgen/models/generator/__init__.py b/graphgen/models/generator/__init__.py new file mode 100644 index 00000000..ec41f5dc --- /dev/null +++ b/graphgen/models/generator/__init__.py @@ -0,0 +1,7 @@ +from .aggregated_generator import AggregatedGenerator +from .atomic_generator import AtomicGenerator +from .cot_generator import CoTGenerator +from .multi_hop_generator import MultiHopGenerator +from .omics_qa_generator import OmicsQAGenerator +from .quiz_generator import QuizGenerator +from .vqa_generator import VQAGenerator diff --git a/graphgen/models/generator/aggregated_generator.py b/graphgen/models/generator/aggregated_generator.py new file mode 100644 index 00000000..4bad8e99 --- /dev/null +++ b/graphgen/models/generator/aggregated_generator.py @@ -0,0 +1,125 @@ +from typing import Any + +from graphgen.bases import BaseGenerator +from graphgen.templates import AGGREGATED_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +class AggregatedGenerator(BaseGenerator): + """ + Aggregated Generator follows a TWO-STEP process: + 1. rephrase: Rephrase the input nodes and edges into a coherent text that maintains the original meaning. + The rephrased text is considered as answer to be used in the next step. + 2. question generation: Generate relevant questions based on the rephrased text. + """ + + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + """ + Build prompts for REPHRASE. + :param batch + :return: + """ + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. {node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] + ) + relations_str = "\n".join( + [ + f"{index + 1}. 
{edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relations_str) + + # TODO: configure add_context + # if add_context: + # original_ids = [ + # node["source_id"].split("")[0] for node in _process_nodes + # ] + [edge[2]["source_id"].split("")[0] for edge in _process_edges] + # original_ids = list(set(original_ids)) + # original_text = await text_chunks_storage.get_by_ids(original_ids) + # original_text = "\n".join( + # [ + # f"{index + 1}. {text['content']}" + # for index, text in enumerate(original_text) + # ] + # ) + prompt = AGGREGATED_GENERATION_PROMPT[language]["ANSWER_REPHRASING"].format( + entities=entities_str, relationships=relations_str + ) + return prompt + + @staticmethod + def parse_rephrased_text(response: str) -> str: + """ + Parse the rephrased text from the response. + :param response: + :return: rephrased text + """ + if "Rephrased Text:" in response: + rephrased_text = response.split("Rephrased Text:")[1].strip() + elif "重述文本:" in response: + rephrased_text = response.split("重述文本:")[1].strip() + else: + rephrased_text = response.strip() + return rephrased_text.strip('"') + + @staticmethod + def _build_prompt_for_question_generation(answer: str) -> str: + """ + Build prompts for QUESTION GENERATION. + :param answer: + :return: + """ + language = detect_main_language(answer) + prompt = AGGREGATED_GENERATION_PROMPT[language]["QUESTION_GENERATION"].format( + answer=answer + ) + return prompt + + @staticmethod + def parse_response(response: str) -> dict: + if response.startswith("Question:"): + question = response[len("Question:") :].strip() + elif response.startswith("问题:"): + question = response[len("问题:") :].strip() + else: + question = response.strip() + return { + "question": question, + } + + async def generate( + self, + batch: tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ], + ) -> dict[str, Any]: + """ + Generate QAs based on a given batch. 
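+        Makes two LLM calls per batch: one to rephrase the subgraph into an answer,
+        then one to generate a matching question for that answer.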
+ :param batch + :return: QA pairs + """ + result = {} + rephrasing_prompt = self.build_prompt(batch) + response = await self.llm_client.generate_answer(rephrasing_prompt) + context = self.parse_rephrased_text(response) + question_generation_prompt = self._build_prompt_for_question_generation(context) + response = await self.llm_client.generate_answer(question_generation_prompt) + question = self.parse_response(response)["question"] + logger.debug("Question: %s", question) + logger.debug("Answer: %s", context) + qa_pairs = { + compute_content_hash(question): { + "question": question, + "answer": context, + } + } + result.update(qa_pairs) + return result diff --git a/graphgen/models/generator/atomic_generator.py b/graphgen/models/generator/atomic_generator.py new file mode 100644 index 00000000..713140d2 --- /dev/null +++ b/graphgen/models/generator/atomic_generator.py @@ -0,0 +1,50 @@ +from typing import Any + +from graphgen.bases import BaseGenerator +from graphgen.templates import ATOMIC_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +class AtomicGenerator(BaseGenerator): + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + nodes, edges = batch + context = "" + for node in nodes: + context += f"- {node[0]}: {node[1]['description']}\n" + for edge in edges: + context += f"- {edge[0]} - {edge[1]}: {edge[2]['description']}\n" + language = detect_main_language(context) + + prompt = ATOMIC_GENERATION_PROMPT[language].format(context=context) + return prompt + + @staticmethod + def parse_response(response: str) -> dict: + """ + AtomicGenerator normally generates one QA pair per response. + So we just need to parse one QA pair from the response. + :param response: + :return: + """ + if "Question:" in response and "Answer:" in response: + question = response.split("Question:")[1].split("Answer:")[0].strip() + answer = response.split("Answer:")[1].strip() + elif "问题:" in response and "答案:" in response: + question = response.split("问题:")[1].split("答案:")[0].strip() + answer = response.split("答案:")[1].strip() + else: + logger.warning("Failed to parse response: %s", response) + return {} + question = question.strip('"') + answer = answer.strip('"') + logger.debug("Question: %s", question) + logger.debug("Answer: %s", answer) + return { + compute_content_hash(question): { + "question": question, + "answer": answer, + } + } diff --git a/graphgen/models/generator/cot_generator.py b/graphgen/models/generator/cot_generator.py new file mode 100644 index 00000000..a111a6f6 --- /dev/null +++ b/graphgen/models/generator/cot_generator.py @@ -0,0 +1,120 @@ +from typing import Any + +from graphgen.bases import BaseGenerator +from graphgen.templates import COT_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +class CoTGenerator(BaseGenerator): + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + """ + Build prompts for COT Template Design. + :param batch: + :return: + """ + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. {node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] + ) + relationships_str = "\n".join( + [ + f"{index + 1}. 
{edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relationships_str) + prompt = COT_GENERATION_PROMPT[language]["COT_TEMPLATE_DESIGN"].format( + entities=entities_str, relationships=relationships_str + ) + return prompt + + @staticmethod + def build_prompt_for_cot_generation( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]], + question: str, + reasoning_path: str, + ) -> str: + """ + Build prompts for COT Generation. + """ + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. {node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] + ) + relationships_str = "\n".join( + [ + f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relationships_str) + prompt = COT_GENERATION_PROMPT[language]["COT_GENERATION"].format( + entities=entities_str, + relationships=relationships_str, + question=question, + reasoning_template=reasoning_path, + ) + return prompt + + @staticmethod + def parse_response(response: str) -> dict: + if "Question:" in response and "Reasoning-Path Design:" in response: + question = ( + response.split("Question:")[1] + .split("Reasoning-Path Design:")[0] + .strip() + ) + reasoning_path = response.split("Reasoning-Path Design:")[1].strip() + elif "问题:" in response and "推理路径设计:" in response: + question = response.split("问题:")[1].split("推理路径设计:")[0].strip() + reasoning_path = response.split("推理路径设计:")[1].strip() + else: + logger.warning("Failed to parse CoT template: %s", response) + return {} + + question = question.strip('"') + reasoning_path = reasoning_path.strip('"') + logger.debug("CoT Question: %s", question) + logger.debug("CoT Reasoning Path: %s", reasoning_path) + return { + "question": question, + "reasoning_path": reasoning_path, + } + + async def generate( + self, + batch: tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ], + ) -> dict[str, Any]: + """ + Generate QAs based on a given batch. + :param batch + :return: QA pairs + """ + result = {} + prompt = self.build_prompt(batch) + response = await self.llm_client.generate_answer(prompt) + response = self.parse_response(response) + question, reasoning_path = response["question"], response["reasoning_path"] + prompt = self.build_prompt_for_cot_generation(batch, question, reasoning_path) + cot_answer = await self.llm_client.generate_answer(prompt) + logger.debug("CoT Answer: %s", cot_answer) + qa_pairs = { + compute_content_hash(question): { + "question": question, + "answer": cot_answer, + "reasoning_path": reasoning_path, + } + } + result.update(qa_pairs) + return result diff --git a/graphgen/models/generator/multi_hop_generator.py b/graphgen/models/generator/multi_hop_generator.py new file mode 100644 index 00000000..9098b102 --- /dev/null +++ b/graphgen/models/generator/multi_hop_generator.py @@ -0,0 +1,53 @@ +from typing import Any + +from graphgen.bases import BaseGenerator +from graphgen.templates import MULTI_HOP_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +class MultiHopGenerator(BaseGenerator): + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. 
{node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] + ) + + relationships_str = "\n".join( + [ + f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relationships_str) + prompt = MULTI_HOP_GENERATION_PROMPT[language].format( + entities=entities_str, relationships=relationships_str + ) + return prompt + + @staticmethod + def parse_response(response: str) -> dict: + if "Question:" in response and "Answer:" in response: + question = response.split("Question:")[1].split("Answer:")[0].strip() + answer = response.split("Answer:")[1].strip() + elif "问题:" in response and "答案:" in response: + question = response.split("问题:")[1].split("答案:")[0].strip() + answer = response.split("答案:")[1].strip() + else: + logger.warning("Failed to parse response: %s", response) + return {} + question = question.strip('"') + answer = answer.strip('"') + logger.debug("Question: %s", question) + logger.debug("Answer: %s", answer) + return { + compute_content_hash(question): { + "question": question, + "answer": answer, + } + } diff --git a/graphgen/models/generator/omics_qa_generator.py b/graphgen/models/generator/omics_qa_generator.py new file mode 100644 index 00000000..d5ac7ddb --- /dev/null +++ b/graphgen/models/generator/omics_qa_generator.py @@ -0,0 +1,365 @@ +import re +from typing import Any, Optional + +from graphgen.bases import BaseGenerator +from graphgen.templates import OMICS_QA_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +class OmicsQAGenerator(BaseGenerator): + """ + Unified QA generator for multi-omics data (DNA, RNA, Protein). + Automatically extracts and attaches molecule-specific caption information to QA pairs. + """ + + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. {node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] + ) + + relationships_str = "\n".join( + [ + f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relationships_str) + prompt = OMICS_QA_GENERATION_PROMPT[language].format( + entities=entities_str, relationships=relationships_str + ) + return prompt + + @staticmethod + def parse_response(response: str) -> Any: + """ + Parse the LLM response and return the generated QAs + :param response + :return: QA pairs + """ + qa_pairs = {} + qa_list = response.strip().split("\n\n") + for qa in qa_list: + match = re.search( + r"Question:\s*(.*?)\s*Answer:\s*(.*)", qa, re.DOTALL + ) or re.search(r"问题:\s*(.*?)\s*答案:\s*(.*)", qa, re.DOTALL) + + if match: + question = match.group(1).strip() + answer = match.group(2).strip() + else: + if qa: + logger.error("Failed to parse QA pair: %s", qa) + continue + question = question.strip('"') + answer = answer.strip('"') + logger.debug("Question: %s", question) + logger.debug("Answer: %s", answer) + qa_pairs[compute_content_hash(question)] = { + "question": question, + "answer": answer, + } + return qa_pairs + + @staticmethod + def _extract_caption(node_data: dict, molecule_type: str) -> Optional[dict]: + """ + Extract molecule-specific caption information from node data. 
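+        Prefers an existing "<type>_caption" field; otherwise builds a caption from
+        type-specific fields (for proteins, UniProt search results and the "protein"
+        field are merged in), falling back to the node description.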
+ + :param node_data: Node data dictionary + :param molecule_type: Type of molecule ("dna", "rna", or "protein") + :return: Caption dictionary or None + """ + molecule_type_lower = molecule_type.lower() + + # Check if there's already a caption field (e.g., protein_caption, dna_caption, rna_caption) + caption_key = f"{molecule_type_lower}_caption" + if caption_key in node_data and node_data[caption_key]: + if isinstance(node_data[caption_key], list) and len(node_data[caption_key]) > 0: + # Always return the first element if it's a dict, otherwise return None for consistency + caption_val = node_data[caption_key][0] + return caption_val if isinstance(caption_val, dict) else None + if isinstance(node_data[caption_key], dict): + return node_data[caption_key] + + # Field mappings for each molecule type + field_mapping = { + "protein": [ + "protein_name", "gene_names", "organism", "function", + "sequence", "id", "database", "entry_name", "uniprot_id" + ], + "dna": [ + "gene_name", "gene_description", "organism", "chromosome", + "genomic_location", "function", "gene_type", "id", + "database", "sequence" + ], + "rna": [ + "rna_type", "description", "organism", "related_genes", + "gene_name", "so_term", "id", "database", + "rnacentral_id", "sequence" + ], + } + + # Extract fields based on molecule type + caption = {} + caption_fields = field_mapping.get(molecule_type_lower, []) + for field in caption_fields: + if field in node_data and node_data[field]: + caption[field] = node_data[field] + + # Special handling for protein: check search results and existing protein field + if molecule_type_lower == "protein": + # Check for search result data (from UniProt search) + if "_search_results" in node_data: + search_results = node_data["_search_results"] + if isinstance(search_results, list) and len(search_results) > 0: + first_result = search_results[0] + if isinstance(first_result, dict): + search_caption = { + "id": first_result.get("id", ""), + "protein_name": first_result.get("protein_name", ""), + "gene_names": first_result.get("gene_names", []), + "organism": first_result.get("organism", ""), + "function": first_result.get("function", []), + "sequence": node_data.get("sequence") or first_result.get("sequence", ""), + "database": "UniProt" + } + # Remove empty fields and return if any data exists + search_caption = {k: v for k, v in search_caption.items() if v} + if search_caption: + return search_caption + + # Merge with existing protein field if present + if "protein" in node_data and node_data["protein"]: + existing_protein = node_data["protein"] + if isinstance(existing_protein, list) and len(existing_protein) > 0: + existing_protein = ( + existing_protein[0] + if isinstance(existing_protein[0], dict) + else existing_protein + ) + if isinstance(existing_protein, dict): + for key, value in existing_protein.items(): + if key not in caption and value: + caption[key] = value + # Ensure sequence from node_data takes precedence + if "sequence" in node_data and node_data["sequence"]: + caption["sequence"] = node_data["sequence"] + + # Fallback to description if no caption found + if not caption and "description" in node_data: + description = node_data["description"] + if isinstance(description, str) and len(description) > 10: + caption["description"] = description + + return caption if caption else None + + @staticmethod + def _detect_molecule_type(nodes: list[tuple[str, dict]]) -> str: + """ + Detect molecule type from nodes. 
+ Priority: Check node type, then check metadata, then check node data fields. + + :param nodes: List of (node_id, node_data) tuples + :return: Detected molecule type ("dna", "rna", "protein", or "unknown") + """ + if not nodes: + return "unknown" + + # Type indicators for each molecule type + type_indicators = { + "protein": { + "fields": ["protein_name", "uniprot_id", "entry_name", "protein_caption"], + "source_prefix": "protein-", + "description_keywords": ["protein"], + }, + "dna": { + "fields": ["gene_name", "chromosome", "genomic_location"], + "source_prefix": "dna-", + "description_keywords": ["gene", "dna", "chromosome"], + }, + "rna": { + "fields": ["rna_type", "rnacentral_id"], + "source_prefix": "rna-", + "description_keywords": ["rna", "transcript"], + }, + } + + for _, node_data in nodes: + # Priority 1: Check explicit type fields (most reliable) + for field in ["type", "molecule_type"]: + value = node_data.get(field, "").lower() + if value in ("dna", "rna", "protein"): + return value + + # Priority 2: Check source_id prefix + source_id = node_data.get("source_id", "").lower() + for mol_type, indicators in type_indicators.items(): + if source_id.startswith(indicators["source_prefix"]): + return mol_type + + # Priority 3: Check type-specific fields + for mol_type, indicators in type_indicators.items(): + if any(key in node_data for key in indicators["fields"]): + # Special check for DNA: need chromosome or genomic_location + if mol_type == "dna" and not any(key in node_data for key in ["chromosome", "genomic_location"]): + continue + return mol_type + + # Priority 4: Check description keywords + description = node_data.get("description", "").lower() + for mol_type, indicators in type_indicators.items(): + keywords = indicators["description_keywords"] + if any(kw in description for kw in keywords): + # Special check: "protein" in description but not "gene" + if mol_type == "protein" and "gene" in description: + continue + return mol_type + + return "unknown" + + async def generate( + self, + batch: tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ], + ) -> dict[str, Any]: + """ + Generate QAs based on a given batch. + Automatically extracts and attaches molecule-specific caption information. 
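        Example (illustrative; node data values are invented):
            nodes = [
                ("TP53", {"type": "protein",
                          "description": "Tumour suppressor protein",
                          "protein_name": "Cellular tumor antigen p53"}),
            ]
            # _detect_molecule_type(nodes) returns "protein" because the explicit
            # "type" field is checked before source_id prefixes or field names;
            # _extract_caption() then builds {"protein_name": "Cellular tumor antigen p53"},
            # which is attached to every generated QA pair under the "protein" key.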
+ + :param batch + :return: QA pairs with attached molecule captions + """ + result = {} + prompt = self.build_prompt(batch) + response = await self.llm_client.generate_answer(prompt) + qa_pairs = self.parse_response(response) # generate one or more QA pairs + + nodes, _ = batch + + # Detect molecule type from nodes + molecule_type = self._detect_molecule_type(nodes) + + # Extract captions for all molecule types from nodes + captions = {"dna": None, "rna": None, "protein": None} + caption_attached = False + + for node in nodes: + _, node_data = node + + # Check for pre-extracted captions (from partition_service) + for mol_type in ["dna", "rna", "protein"]: + caption_key = f"{mol_type}_caption" + if caption_key in node_data and node_data[caption_key]: + captions[mol_type] = node_data[caption_key] + caption_attached = True + + # If no pre-extracted captions, extract from node_data using the detected molecule_type + if not caption_attached: + caption = self._extract_caption(node_data, molecule_type) + if caption: + captions[molecule_type] = caption + caption_attached = True + break # Only need to extract once per batch + + # Attach all captions to QA pairs + for qa in qa_pairs.values(): + qa["dna"] = captions["dna"] if captions["dna"] else "" + qa["rna"] = captions["rna"] if captions["rna"] else "" + qa["protein"] = captions["protein"] if captions["protein"] else "" + + if not caption_attached: + node_sample = ( + dict(list(nodes[0][1].items())[:5]) if nodes else 'No nodes' + ) + logger.warning( + "No caption extracted for molecule_type=%s. Node data sample: %s", + molecule_type, node_sample + ) + + result.update(qa_pairs) + return result + + @staticmethod + def format_generation_results( + results: list[dict], output_data_format: str + ) -> list[dict[str, Any]]: + """ + Format generation results with molecule-specific caption fields. + Supports dna, rna, and protein fields in output. 
+ """ + # Extract QA pairs and molecule captions + qa_items = [ + { + "question": v["question"], + "answer": v["answer"], + "dna": v.get("dna", ""), + "rna": v.get("rna", ""), + "protein": v.get("protein", ""), + } + for item in results + for k, v in item.items() + ] + + # Format based on output format + if output_data_format == "Alpaca": + return [ + { + "instruction": qa["question"], + "input": "", + "output": qa["answer"], + "dna": qa["dna"], + "rna": qa["rna"], + "protein": qa["protein"], + } + for qa in qa_items + ] + if output_data_format == "Sharegpt": + return [ + { + "conversations": [ + { + "from": "human", + "value": [ + { + "text": qa["question"], + "dna": qa["dna"], + "rna": qa["rna"], + "protein": qa["protein"], + } + ], + }, + {"from": "gpt", "value": qa["answer"]}, + ] + } + for qa in qa_items + ] + if output_data_format == "ChatML": + return [ + { + "messages": [ + { + "role": "user", + "content": [ + { + "text": qa["question"], + "dna": qa["dna"], + "rna": qa["rna"], + "protein": qa["protein"], + } + ], + }, + {"role": "assistant", "content": qa["answer"]}, + ] + } + for qa in qa_items + ] + else: + raise ValueError(f"Unknown output data format: {output_data_format}") diff --git a/graphgen/models/generator/quiz_generator.py b/graphgen/models/generator/quiz_generator.py new file mode 100644 index 00000000..d117092d --- /dev/null +++ b/graphgen/models/generator/quiz_generator.py @@ -0,0 +1,70 @@ +from typing import Any + +from graphgen.bases import BaseGenerator +from graphgen.templates import DESCRIPTION_REPHRASING_PROMPT +from graphgen.utils import detect_main_language, logger + + +class QuizGenerator(BaseGenerator): + """ + Quiz Generator rephrases given descriptions to create quiz questions. + """ + + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + """ + Build prompt for rephrasing the description. + :param batch: A tuple containing (nodes, edges) where nodes/edges + contain description information + :return: Prompt string + """ + # Extract description from batch + # For quiz generator, we expect a special format where + # the description is passed as the first node's description + nodes, edges = batch + if nodes: + description = nodes[0][1].get("description", "") + template_type = nodes[0][1].get("template_type", "TEMPLATE") + elif edges: + description = edges[0][2].get("description", "") + template_type = edges[0][2].get("template_type", "TEMPLATE") + else: + raise ValueError("Batch must contain at least one node or edge with description") + + return QuizGenerator.build_prompt_for_description(description, template_type) + + @staticmethod + def build_prompt_for_description(description: str, template_type: str = "TEMPLATE") -> str: + """ + Build prompt for rephrasing a single description. + :param description: The description to rephrase + :param template_type: Either "TEMPLATE" (same meaning) or "ANTI_TEMPLATE" (opposite meaning) + :return: Prompt string + """ + language = detect_main_language(description) + prompt = DESCRIPTION_REPHRASING_PROMPT[language][template_type].format( + input_sentence=description + ) + return prompt + + @staticmethod + def parse_rephrased_text(response: str) -> str: + """ + Parse the rephrased text from the response. + :param response: + :return: + """ + rephrased_text = response.strip().strip('"') + logger.debug("Rephrased Text: %s", rephrased_text) + return rephrased_text + + @staticmethod + def parse_response(response: str) -> Any: + """ + Parse the LLM response. 
For quiz generator, this returns the rephrased text. + :param response: LLM response + :return: Rephrased text + """ + return QuizGenerator.parse_rephrased_text(response) diff --git a/graphgen/models/generator/vqa_generator.py b/graphgen/models/generator/vqa_generator.py new file mode 100644 index 00000000..91b44862 --- /dev/null +++ b/graphgen/models/generator/vqa_generator.py @@ -0,0 +1,136 @@ +from typing import Any + +from graphgen.bases import BaseGenerator +from graphgen.templates import VQA_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +class VQAGenerator(BaseGenerator): + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. {node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] + ) + + relationships_str = "\n".join( + [ + f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relationships_str) + prompt = VQA_GENERATION_PROMPT[language].format( + entities=entities_str, relationships=relationships_str + ) + return prompt + + @staticmethod + def parse_response(response: str) -> Any: + """ + Parse the LLM response and return the generated QAs + :param response + :return: QA pairs + """ + qa_pairs = {} + qa_list = response.strip().split("\n\n") + for qa in qa_list: + if "Question:" in qa and "Answer:" in qa: + question = qa.split("Question:")[1].split("Answer:")[0].strip() + answer = qa.split("Answer:")[1].strip() + elif "问题:" in qa and "答案:" in qa: + question = qa.split("问题:")[1].split("答案:")[0].strip() + answer = qa.split("答案:")[1].strip() + else: + logger.error("Failed to parse QA pair: %s", qa) + continue + question = question.strip('"') + answer = answer.strip('"') + logger.debug("Question: %s", question) + logger.debug("Answer: %s", answer) + qa_pairs[compute_content_hash(question)] = { + "question": question, + "answer": answer, + } + return qa_pairs + + async def generate( + self, + batch: tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ], + ) -> dict[str, Any]: + """ + Generate QAs based on a given batch. 
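        Example of a parsable LLM response (content invented for illustration):
            Question: What landmark is shown in the image?
            Answer: The Eiffel Tower.

            Question: In which city is it located?
            Answer: Paris.
        parse_response() converts each blank-line-separated pair into
        {"question": ..., "answer": ...} keyed by a hash of the question, and
        generate() attaches the source node's img_path whenever image_data is present.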
+ :param batch + :return: QA pairs + """ + result = {} + prompt = self.build_prompt(batch) + response = await self.llm_client.generate_answer(prompt) + qa_pairs = self.parse_response(response) # generate one or more QA pairs + nodes, _ = batch + for node in nodes: + node_data = node[1] + if "image_data" in node_data and node_data["image_data"]: + img_path = node_data["image_data"]["img_path"] + for qa in qa_pairs.values(): + qa["img_path"] = img_path + result.update(qa_pairs) + return result + + @staticmethod + def format_generation_results( + results: list[dict], output_data_format: str + ) -> list[dict[str, Any]]: + if output_data_format == "Alpaca": + results = [ + { + "instruction": v["question"], + "input": "", + "output": v["answer"], + "image": v.get("img_path", ""), + } + for item in results + for k, v in item.items() + ] + elif output_data_format == "Sharegpt": + results = [ + { + "conversations": [ + { + "from": "human", + "value": [ + {"text": v["question"], "image": v.get("img_path", "")} + ], + }, + {"from": "gpt", "value": v["answer"]}, + ] + } + for item in results + for k, v in item.items() + ] + elif output_data_format == "ChatML": + results = [ + { + "messages": [ + { + "role": "user", + "content": [ + {"text": v["question"], "image": v.get("img_path", "")} + ], + }, + {"role": "assistant", "content": v["answer"]}, + ] + } + for item in results + for k, v in item.items() + ] + else: + raise ValueError(f"Unknown output data format: {output_data_format}") + return results diff --git a/graphgen/models/kg_builder/__init__.py b/graphgen/models/kg_builder/__init__.py new file mode 100644 index 00000000..aa0339c6 --- /dev/null +++ b/graphgen/models/kg_builder/__init__.py @@ -0,0 +1,3 @@ +from .light_rag_kg_builder import LightRAGKGBuilder +from .mm_kg_builder import MMKGBuilder +from .omics_kg_builder import OmicsKGBuilder diff --git a/graphgen/models/kg_builder/light_rag_kg_builder.py b/graphgen/models/kg_builder/light_rag_kg_builder.py new file mode 100644 index 00000000..a6185f44 --- /dev/null +++ b/graphgen/models/kg_builder/light_rag_kg_builder.py @@ -0,0 +1,218 @@ +import re +from collections import Counter, defaultdict +from typing import Dict, List, Tuple + +from graphgen.bases import BaseGraphStorage, BaseKGBuilder, BaseLLMWrapper, Chunk +from graphgen.templates import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT +from graphgen.utils import ( + detect_main_language, + handle_single_entity_extraction, + handle_single_relationship_extraction, + logger, + pack_history_conversations, + split_string_by_multi_markers, +) + + +class LightRAGKGBuilder(BaseKGBuilder): + def __init__(self, llm_client: BaseLLMWrapper, max_loop: int = 3): + super().__init__(llm_client) + self.max_loop = max_loop + + async def extract( + self, chunk: Chunk + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """ + Extract entities and relationships from a single chunk using the LLM client. 
+ :param chunk + :return: (nodes_data, edges_data) + """ + chunk_id = chunk.id + content = chunk.content + + # step 1: language_detection + language = detect_main_language(content) + + hint_prompt = KG_EXTRACTION_PROMPT[language]["TEMPLATE"].format( + **KG_EXTRACTION_PROMPT["FORMAT"], input_text=content + ) + + # step 2: initial glean + final_result = await self.llm_client.generate_answer(hint_prompt) + logger.debug("First extraction result: %s", final_result) + + # step3: iterative refinement + history = pack_history_conversations(hint_prompt, final_result) + for loop_idx in range(self.max_loop): + if_loop_result = await self.llm_client.generate_answer( + text=KG_EXTRACTION_PROMPT[language]["IF_LOOP"], history=history + ) + if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() + if if_loop_result != "yes": + break + + glean_result = await self.llm_client.generate_answer( + text=KG_EXTRACTION_PROMPT[language]["CONTINUE"], history=history + ) + logger.debug("Loop %s glean: %s", loop_idx + 1, glean_result) + + history += pack_history_conversations( + KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result + ) + final_result += glean_result + + # step 4: parse the final result + records = split_string_by_multi_markers( + final_result, + [ + KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], + KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"], + ], + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + + for record in records: + match = re.search(r"\((.*)\)", record) + if not match: + continue + inner = match.group(1) + + attributes = split_string_by_multi_markers( + inner, [KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] + ) + + entity = await handle_single_entity_extraction(attributes, chunk_id) + if entity is not None: + nodes[entity["entity_name"]].append(entity) + continue + + relation = await handle_single_relationship_extraction(attributes, chunk_id) + if relation is not None: + key = (relation["src_id"], relation["tgt_id"]) + edges[key].append(relation) + + return dict(nodes), dict(edges) + + async def merge_nodes( + self, + node_data: tuple[str, List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + entity_name, node_data = node_data + entity_types = [] + source_ids = [] + descriptions = [] + + node = kg_instance.get_node(entity_name) + if node is not None: + entity_types.append(node["entity_type"]) + source_ids.extend( + split_string_by_multi_markers(node["source_id"], [""]) + ) + descriptions.append(node["description"]) + + # take the most frequent entity_type + entity_type = sorted( + Counter([dp["entity_type"] for dp in node_data] + entity_types).items(), + key=lambda x: x[1], + reverse=True, + )[0][0] + + description = "".join( + sorted(set([dp["description"] for dp in node_data] + descriptions)) + ) + description = await self._handle_kg_summary(entity_name, description) + + source_id = "".join( + set([dp["source_id"] for dp in node_data] + source_ids) + ) + + node_data = { + "entity_type": entity_type, + "description": description, + "source_id": source_id, + } + kg_instance.upsert_node(entity_name, node_data=node_data) + + async def merge_edges( + self, + edges_data: tuple[Tuple[str, str], List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + (src_id, tgt_id), edge_data = edges_data + + source_ids = [] + descriptions = [] + + edge = kg_instance.get_edge(src_id, tgt_id) + if edge is not None: + source_ids.extend( + split_string_by_multi_markers(edge["source_id"], [""]) + ) + descriptions.append(edge["description"]) + + description 
= "".join( + sorted(set([dp["description"] for dp in edge_data] + descriptions)) + ) + source_id = "".join( + set([dp["source_id"] for dp in edge_data] + source_ids) + ) + + for insert_id in [src_id, tgt_id]: + if not kg_instance.has_node(insert_id): + kg_instance.upsert_node( + insert_id, + node_data={ + "source_id": source_id, + "description": description, + "entity_type": "UNKNOWN", + }, + ) + + description = await self._handle_kg_summary( + f"({src_id}, {tgt_id})", description + ) + + kg_instance.upsert_edge( + src_id, + tgt_id, + edge_data={"source_id": source_id, "description": description}, + ) + + async def _handle_kg_summary( + self, + entity_or_relation_name: str, + description: str, + max_summary_tokens: int = 200, + ) -> str: + """ + Handle knowledge graph summary + + :param entity_or_relation_name + :param description + :param max_summary_tokens + :return summary + """ + + tokenizer_instance = self.llm_client.tokenizer + language = detect_main_language(description) + + tokens = tokenizer_instance.encode(description) + if len(tokens) < max_summary_tokens: + return description + + use_description = tokenizer_instance.decode(tokens[:max_summary_tokens]) + prompt = KG_SUMMARIZATION_PROMPT[language]["TEMPLATE"].format( + entity_name=entity_or_relation_name, + description_list=use_description.split(""), + **KG_SUMMARIZATION_PROMPT["FORMAT"], + ) + new_description = await self.llm_client.generate_answer(prompt) + logger.info( + "Entity or relation %s summary: %s", + entity_or_relation_name, + new_description, + ) + return new_description diff --git a/graphgen/models/kg_builder/mm_kg_builder.py b/graphgen/models/kg_builder/mm_kg_builder.py new file mode 100644 index 00000000..f352cb2a --- /dev/null +++ b/graphgen/models/kg_builder/mm_kg_builder.py @@ -0,0 +1,91 @@ +import re +from collections import defaultdict +from typing import Dict, List, Tuple + +from graphgen.bases import Chunk +from graphgen.templates import MMKG_EXTRACTION_PROMPT +from graphgen.utils import ( + detect_main_language, + handle_single_entity_extraction, + handle_single_relationship_extraction, + logger, + split_string_by_multi_markers, +) + +from .light_rag_kg_builder import LightRAGKGBuilder + + +class MMKGBuilder(LightRAGKGBuilder): + async def extract( + self, chunk: Chunk + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """ + Extract entities and relationships from a single multi-modal chunk using the LLM client. + Expect to get a mini graph which contains a central multi-modal entity + and its related text entities and relationships. + Like: + (image: "image_of_eiffel_tower") --[located_in]--> (text: "Paris") + (image: "image_of_eiffel_tower") --[built_in]--> (text: "1889") + (text: "Eiffel Tower") --[height]--> (text: "324 meters") + :param chunk + """ + chunk_id = chunk.id + chunk_type = chunk.type # image | table | formula | ... 
+ metadata = chunk.metadata + + # choose different extraction strategies based on chunk type + if chunk_type == "image": + image_caption = "\n".join(metadata.get("image_caption", "")) + language = detect_main_language(image_caption) + prompt_template = MMKG_EXTRACTION_PROMPT[language].format( + **MMKG_EXTRACTION_PROMPT["FORMAT"], + chunk_type=chunk_type, + chunk_id=chunk_id, + chunk_text=image_caption, + ) + result = await self.llm_client.generate_answer(prompt_template) + logger.debug("Image chunk extraction result: %s", result) + + # parse the result + records = split_string_by_multi_markers( + result, + [ + MMKG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], + MMKG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"], + ], + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + + for record in records: + match = re.search(r"\((.*)\)", record) + if not match: + continue + inner = match.group(1) + + attributes = split_string_by_multi_markers( + inner, [MMKG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] + ) + + entity = await handle_single_entity_extraction(attributes, chunk_id) + if entity is not None: + nodes[entity["entity_name"]].append(entity) + continue + + relation = await handle_single_relationship_extraction( + attributes, chunk_id + ) + if relation is not None: + key = (relation["src_id"], relation["tgt_id"]) + edges[key].append(relation) + + return dict(nodes), dict(edges) + + if chunk_type == "table": + pass # TODO: implement table-based entity and relationship extraction + if chunk_type == "formula": + pass # TODO: implement formula-based entity and relationship extraction + + logger.error("Unsupported chunk type for MMKGBuilder: %s", chunk_type) + return defaultdict(list), defaultdict(list) diff --git a/graphgen/models/kg_builder/omics_kg_builder.py b/graphgen/models/kg_builder/omics_kg_builder.py new file mode 100644 index 00000000..6317a2b6 --- /dev/null +++ b/graphgen/models/kg_builder/omics_kg_builder.py @@ -0,0 +1,280 @@ +import re +from collections import Counter, defaultdict +from typing import Dict, List, Tuple + +from graphgen.bases import BaseGraphStorage, BaseKGBuilder, BaseLLMWrapper, Chunk +from graphgen.templates import KG_SUMMARIZATION_PROMPT, OMICS_KG_EXTRACTION_PROMPT +from graphgen.utils import ( + detect_main_language, + handle_single_entity_extraction, + handle_single_relationship_extraction, + logger, + pack_history_conversations, + split_string_by_multi_markers, +) + + +class OmicsKGBuilder(BaseKGBuilder): + """ + Knowledge graph builder for multi-omics data (DNA, RNA, protein). + Extracts entities and relationships from sequence chunks and their metadata. + """ + + def __init__(self, llm_client: BaseLLMWrapper, max_loop: int = 3): + super().__init__(llm_client) + self.max_loop = max_loop + + async def extract( + self, chunk: Chunk + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """ + Extract entities and relationships from a sequence chunk using the LLM client. 
+ + :param chunk: Sequence chunk with metadata + :return: (nodes_data, edges_data) + """ + chunk_id = chunk.id + content = chunk.content + metadata = chunk.metadata + + # Extract sequence and metadata information + sequence_chunk = content or metadata.get("sequence", "") + # molecule_type is used in _format_metadata indirectly via metadata dict + _ = metadata.get("molecule_type", "").lower() + + # Build metadata text for prompt + metadata_text = self._format_metadata(metadata) + + # Detect language from metadata text (defaults to English if no Chinese detected) + language = detect_main_language(metadata_text) + + # Build prompt with sequence and metadata + hint_prompt = OMICS_KG_EXTRACTION_PROMPT[language]["TEMPLATE"].format( + **OMICS_KG_EXTRACTION_PROMPT["FORMAT"], + metadata_text=metadata_text, + sequence_chunk=sequence_chunk[:500] if sequence_chunk else "", # Limit sequence length in prompt + ) + + # step 2: initial glean + final_result = await self.llm_client.generate_answer(hint_prompt) + logger.debug("First extraction result: %s", final_result) + + # step 3: iterative refinement + history = pack_history_conversations(hint_prompt, final_result) + for loop_idx in range(self.max_loop): + if_loop_result = await self.llm_client.generate_answer( + text=OMICS_KG_EXTRACTION_PROMPT[language]["IF_LOOP"], history=history + ) + if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() + if if_loop_result != "yes": + break + + glean_result = await self.llm_client.generate_answer( + text=OMICS_KG_EXTRACTION_PROMPT[language]["CONTINUE"], history=history + ) + logger.debug("Loop %s glean: %s", loop_idx + 1, glean_result) + + history += pack_history_conversations( + OMICS_KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result + ) + final_result += glean_result + + # step 4: parse the final result + records = split_string_by_multi_markers( + final_result, + [ + OMICS_KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], + OMICS_KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"], + ], + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + + for record in records: + match = re.search(r"\((.*)\)", record) + if not match: + continue + inner = match.group(1) + + attributes = split_string_by_multi_markers( + inner, [OMICS_KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] + ) + + entity = await handle_single_entity_extraction(attributes, chunk_id) + if entity is not None: + nodes[entity["entity_name"]].append(entity) + continue + + relation = await handle_single_relationship_extraction(attributes, chunk_id) + if relation is not None: + key = (relation["src_id"], relation["tgt_id"]) + edges[key].append(relation) + + return dict(nodes), dict(edges) + + @staticmethod + def _format_metadata(metadata: dict) -> str: + """ + Format metadata dictionary into a readable text string for the prompt. + + :param metadata: Metadata dictionary from chunk + :return: Formatted metadata text + """ + # Filter out internal fields and format + exclude_fields = { + "_doc_id", + "_chunk_id", + "chunk_index", + "total_chunks", + "length", + "type", + "content", + "sequence", + } + + metadata_items = [] + for key, value in metadata.items(): + if key in exclude_fields: + continue + if value is None or value == "": + continue + if isinstance(value, list): + value = ", ".join(str(v) for v in value) + metadata_items.append(f"{key}: {value}") + + return "\n".join(metadata_items) if metadata_items else "No additional metadata available." 
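As a quick illustration of the filtering above, here is a minimal sketch; the metadata values are invented, and the private helper is called directly only for demonstration:

```python
from graphgen.models.kg_builder import OmicsKGBuilder

# Hypothetical metadata for a protein chunk (values invented for illustration).
metadata = {
    "_doc_id": "doc-001",            # internal field, filtered out
    "molecule_type": "protein",
    "organism": "Homo sapiens",
    "gene_names": ["TP53", "P53"],   # list values are joined with ", "
    "sequence": "MEEPQSDPSV",        # dropped here; the sequence is passed to the prompt separately
}

print(OmicsKGBuilder._format_metadata(metadata))
# molecule_type: protein
# organism: Homo sapiens
# gene_names: TP53, P53
```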
+ + async def merge_nodes( + self, + node_data: tuple[str, List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + """Merge extracted nodes into the knowledge graph.""" + entity_name, node_data = node_data + entity_types = [] + source_ids = [] + descriptions = [] + + node = kg_instance.get_node(entity_name) + if node is not None: + entity_types.append(node["entity_type"]) + source_ids.extend( + split_string_by_multi_markers(node["source_id"], [""]) + ) + descriptions.append(node["description"]) + + # Take the most frequent entity_type + entity_type = sorted( + Counter([dp["entity_type"] for dp in node_data] + entity_types).items(), + key=lambda x: x[1], + reverse=True, + )[0][0] + + description = "".join( + sorted(set([dp["description"] for dp in node_data] + descriptions)) + ) + description = await self._handle_kg_summary(entity_name, description) + + source_id = "".join( + set([dp["source_id"] for dp in node_data] + source_ids) + ) + + node_data_dict = { + "entity_type": entity_type, + "description": description, + "source_id": source_id, + } + + # Preserve sequence from existing node if present (e.g., added by partition_service) + if node is not None and "sequence" in node and node["sequence"]: + node_data_dict["sequence"] = node["sequence"] + + kg_instance.upsert_node(entity_name, node_data=node_data_dict) + + async def merge_edges( + self, + edges_data: tuple[Tuple[str, str], List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + """Merge extracted edges into the knowledge graph.""" + (src_id, tgt_id), edge_data = edges_data + + # Skip self-loops (edges where source and target are the same) + # This can happen when LLM extracts invalid relationships + if src_id == tgt_id: + logger.debug("Skipping self-loop edge: (%s, %s)", src_id, tgt_id) + return + + source_ids = [] + descriptions = [] + + edge = kg_instance.get_edge(src_id, tgt_id) + if edge is not None: + source_ids.extend( + split_string_by_multi_markers(edge["source_id"], [""]) + ) + descriptions.append(edge["description"]) + + description = "".join( + sorted(set([dp["description"] for dp in edge_data] + descriptions)) + ) + source_id = "".join( + set([dp["source_id"] for dp in edge_data] + source_ids) + ) + + for insert_id in [src_id, tgt_id]: + if not kg_instance.has_node(insert_id): + kg_instance.upsert_node( + insert_id, + node_data={ + "source_id": source_id, + "description": description, + "entity_type": "UNKNOWN", + }, + ) + + description = await self._handle_kg_summary( + f"({src_id}, {tgt_id})", description + ) + + kg_instance.upsert_edge( + src_id, + tgt_id, + edge_data={"source_id": source_id, "description": description}, + ) + + async def _handle_kg_summary( + self, + entity_or_relation_name: str, + description: str, + max_summary_tokens: int = 200, + ) -> str: + """ + Handle knowledge graph summary for omics entities/relations. 
+ + :param entity_or_relation_name + :param description + :param max_summary_tokens + :return summary + """ + tokenizer_instance = self.llm_client.tokenizer + language = detect_main_language(description) + + tokens = tokenizer_instance.encode(description) + if len(tokens) < max_summary_tokens: + return description + + use_description = tokenizer_instance.decode(tokens[:max_summary_tokens]) + prompt = KG_SUMMARIZATION_PROMPT[language]["TEMPLATE"].format( + entity_name=entity_or_relation_name, + description_list=use_description.split(""), + **KG_SUMMARIZATION_PROMPT["FORMAT"], + ) + new_description = await self.llm_client.generate_answer(prompt) + logger.info( + "Entity or relation %s summary: %s", + entity_or_relation_name, + new_description, + ) + return new_description diff --git a/graphgen/models/llm/__init__.py b/graphgen/models/llm/__init__.py index e69de29b..c70395d5 100644 --- a/graphgen/models/llm/__init__.py +++ b/graphgen/models/llm/__init__.py @@ -0,0 +1,4 @@ +from .api.http_client import HTTPClient +from .api.ollama_client import OllamaClient +from .api.openai_client import OpenAIClient +from .local.hf_wrapper import HuggingFaceWrapper diff --git a/graphgen/models/evaluate/__init__.py b/graphgen/models/llm/api/__init__.py similarity index 100% rename from graphgen/models/evaluate/__init__.py rename to graphgen/models/llm/api/__init__.py diff --git a/graphgen/models/llm/api/http_client.py b/graphgen/models/llm/api/http_client.py new file mode 100644 index 00000000..2c3b0acd --- /dev/null +++ b/graphgen/models/llm/api/http_client.py @@ -0,0 +1,197 @@ +import asyncio +import math +from typing import Any, Dict, List, Optional + +import aiohttp +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper +from graphgen.bases.datatypes import Token +from graphgen.models.llm.limitter import RPM, TPM + + +class HTTPClient(BaseLLMWrapper): + """ + A generic async HTTP client for LLMs compatible with OpenAI's chat/completions format. + It uses aiohttp for making requests and includes retry logic and token usage tracking. 
+ Usage example: + client = HTTPClient( + model_name="gpt-4o-mini", + base_url="http://localhost:8080", + api_key="your_api_key", + json_mode=True, + seed=42, + topk_per_token=5, + request_limit=True, + ) + + answer = await client.generate_answer("Hello, world!") + tokens = await client.generate_topk_per_token("Hello, world!") + """ + + _instance: Optional["HTTPClient"] = None + _lock = asyncio.Lock() + + def __new__(cls, **kwargs): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __init__( + self, + *, + model: str, + base_url: str, + api_key: Optional[str] = None, + json_mode: bool = False, + seed: Optional[int] = None, + topk_per_token: int = 5, + request_limit: bool = False, + rpm: Optional[RPM] = None, + tpm: Optional[TPM] = None, + **kwargs: Any, + ): + # Initialize only once in the singleton pattern + if getattr(self, "_initialized", False): + return + self._initialized: bool = True + super().__init__(**kwargs) + self.model_name = model + self.base_url = base_url.rstrip("/") + self.api_key = api_key + self.json_mode = json_mode + self.seed = seed + self.topk_per_token = topk_per_token + self.request_limit = request_limit + self.rpm = rpm or RPM() + self.tpm = tpm or TPM() + + self.token_usage: List[Dict[str, int]] = [] + self._session: Optional[aiohttp.ClientSession] = None + + @property + def session(self) -> aiohttp.ClientSession: + if self._session is None or self._session.closed: + headers = ( + {"Authorization": f"Bearer {self.api_key}"} if self.api_key else {} + ) + self._session = aiohttp.ClientSession(headers=headers) + return self._session + + async def close(self): + if self._session and not self._session.closed: + await self._session.close() + + def _build_body(self, text: str, history: List[str]) -> Dict[str, Any]: + messages = [] + if self.system_prompt: + messages.append({"role": "system", "content": self.system_prompt}) + + # chatml format: alternating user and assistant messages + if history and isinstance(history[0], dict): + messages.extend(history) + + messages.append({"role": "user", "content": text}) + + body = { + "model": self.model_name, + "messages": messages, + "temperature": self.temperature, + "top_p": self.top_p, + "max_tokens": self.max_tokens, + } + if self.seed: + body["seed"] = self.seed + if self.json_mode: + body["response_format"] = {"type": "json_object"} + return body + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)), + ) + async def generate_answer( + self, + text: str, + history: Optional[List[str]] = None, + **extra: Any, + ) -> str: + body = self._build_body(text, history or []) + prompt_tokens = sum( + len(self.tokenizer.encode(m["content"])) for m in body["messages"] + ) + est = prompt_tokens + body["max_tokens"] + + if self.request_limit: + await self.rpm.wait(silent=True) + await self.tpm.wait(est, silent=True) + + async with self.session.post( + f"{self.base_url}/chat/completions", + json=body, + timeout=aiohttp.ClientTimeout(total=60), + ) as resp: + resp.raise_for_status() + data = await resp.json() + + msg = data["choices"][0]["message"]["content"] + if "usage" in data: + self.token_usage.append( + { + "prompt_tokens": data["usage"]["prompt_tokens"], + "completion_tokens": data["usage"]["completion_tokens"], + "total_tokens": data["usage"]["total_tokens"], + } + ) + return self.filter_think_tags(msg) + + @retry( + stop=stop_after_attempt(5), + 
wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)), + ) + async def generate_topk_per_token( + self, + text: str, + history: Optional[List[str]] = None, + **extra: Any, + ) -> List[Token]: + body = self._build_body(text, history or []) + body["max_tokens"] = 1 + if self.topk_per_token > 0: + body["logprobs"] = True + body["top_logprobs"] = self.topk_per_token + + async with self.session.post( + f"{self.base_url}/chat/completions", + json=body, + timeout=aiohttp.ClientTimeout(total=60), + ) as resp: + resp.raise_for_status() + data = await resp.json() + + token_logprobs = data["choices"][0]["logprobs"]["content"] + tokens = [] + for item in token_logprobs: + candidates = [ + Token(t["token"], math.exp(t["logprob"])) for t in item["top_logprobs"] + ] + tokens.append( + Token( + item["token"], math.exp(item["logprob"]), top_candidates=candidates + ) + ) + return tokens + + async def generate_inputs_prob( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + raise NotImplementedError( + "generate_inputs_prob is not implemented in HTTPClient" + ) diff --git a/graphgen/models/llm/api/ollama_client.py b/graphgen/models/llm/api/ollama_client.py new file mode 100644 index 00000000..9a4946a6 --- /dev/null +++ b/graphgen/models/llm/api/ollama_client.py @@ -0,0 +1,105 @@ +from typing import Any, Dict, List, Optional + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper +from graphgen.bases.datatypes import Token +from graphgen.models.llm.limitter import RPM, TPM + + +class OllamaClient(BaseLLMWrapper): + """ + Requires a local or remote Ollama server to be running (default port 11434). + The top_logprobs field is not yet implemented by the official API. + """ + + def __init__( + self, + *, + model: str = "gemma3", + base_url: str = "http://localhost:11434", + json_mode: bool = False, + seed: Optional[int] = None, + topk_per_token: int = 5, + request_limit: bool = False, + rpm: Optional[RPM] = None, + tpm: Optional[TPM] = None, + **kwargs: Any, + ): + try: + import ollama + except ImportError as e: + raise ImportError( + "Ollama SDK is not installed." + "It is required to use OllamaClient." + "Please install it with `pip install ollama`." 
+ ) from e + super().__init__(**kwargs) + self.model_name = model + self.base_url = base_url + self.json_mode = json_mode + self.seed = seed + self.topk_per_token = topk_per_token + self.request_limit = request_limit + self.rpm = rpm or RPM() + self.tpm = tpm or TPM() + self.token_usage: List[Dict[str, int]] = [] + + self.client = ollama.AsyncClient(host=self.base_url) + + async def generate_answer( + self, + text: str, + history: Optional[List[Dict[str, str]]] = None, + **extra: Any, + ) -> str: + messages = [] + if self.system_prompt: + messages.append({"role": "system", "content": self.system_prompt}) + if history: + messages.extend(history) + messages.append({"role": "user", "content": text}) + + options = { + "temperature": self.temperature, + "top_p": self.top_p, + "num_predict": self.max_tokens, + } + if self.seed is not None: + options["seed"] = self.seed + + prompt_tokens = sum(len(self.tokenizer.encode(m["content"])) for m in messages) + est = prompt_tokens + self.max_tokens + if self.request_limit: + await self.rpm.wait(silent=True) + await self.tpm.wait(est, silent=True) + + response = await self.client.chat( + model=self.model_name, + messages=messages, + format="json" if self.json_mode else "", + options=options, + stream=False, + ) + + usage = response.get("prompt_eval_count", 0), response.get("eval_count", 0) + self.token_usage.append( + { + "prompt_tokens": usage[0], + "completion_tokens": usage[1], + "total_tokens": sum(usage), + } + ) + content = response["message"]["content"] + return self.filter_think_tags(content) + + async def generate_topk_per_token( + self, + text: str, + history: Optional[List[Dict[str, str]]] = None, + **extra: Any, + ) -> List[Token]: + raise NotImplementedError("Ollama API does not support per-token top-k yet.") + + async def generate_inputs_prob( + self, text: str, history: Optional[List[Dict[str, str]]] = None, **extra: Any + ) -> List[Token]: + raise NotImplementedError("Ollama API does not support per-token logprobs yet.") diff --git a/graphgen/models/llm/api/openai_client.py b/graphgen/models/llm/api/openai_client.py new file mode 100644 index 00000000..532b981c --- /dev/null +++ b/graphgen/models/llm/api/openai_client.py @@ -0,0 +1,182 @@ +import math +from typing import Any, Dict, List, Optional + +import openai +from openai import APIConnectionError, APITimeoutError, AsyncOpenAI, AsyncAzureOpenAI, RateLimitError +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper +from graphgen.bases.datatypes import Token +from graphgen.models.llm.limitter import RPM, TPM + + +def get_top_response_tokens(response: openai.ChatCompletion) -> List[Token]: + token_logprobs = response.choices[0].logprobs.content + tokens = [] + for token_prob in token_logprobs: + prob = math.exp(token_prob.logprob) + candidate_tokens = [ + Token(t.token, math.exp(t.logprob)) for t in token_prob.top_logprobs + ] + token = Token(token_prob.token, prob, top_candidates=candidate_tokens) + tokens.append(token) + return tokens + + +class OpenAIClient(BaseLLMWrapper): + def __init__( + self, + *, + model: str = "gpt-4o-mini", + api_key: Optional[str] = None, + base_url: Optional[str] = None, + api_version: Optional[str] = None, + json_mode: bool = False, + seed: Optional[int] = None, + topk_per_token: int = 5, # number of topk tokens to generate for each token + request_limit: bool = False, + rpm: Optional[RPM] = None, + tpm: Optional[TPM] = None, + backend: str = 
"openai_api", + **kwargs: Any, + ): + super().__init__(**kwargs) + self.model = model + self.api_key = api_key + self.api_version = api_version # required for Azure OpenAI + self.base_url = base_url + self.json_mode = json_mode + self.seed = seed + self.topk_per_token = topk_per_token + + self.token_usage: list = [] + self.request_limit = request_limit + self.rpm = rpm or RPM() + self.tpm = tpm or TPM() + + assert ( + backend in ("openai_api", "azure_openai_api") + ), f"Unsupported backend '{backend}'. Use 'openai_api' or 'azure_openai_api'." + self.backend = backend + + self.__post_init__() + + def __post_init__(self): + + api_name = self.backend.replace("_", " ") + assert self.api_key is not None, f"Please provide api key to access {api_name}." + if self.backend == "openai_api": + self.client = AsyncOpenAI( + api_key=self.api_key or "dummy", base_url=self.base_url + ) + elif self.backend == "azure_openai_api": + assert self.api_version is not None, f"Please provide api_version for {api_name}." + assert self.base_url is not None, f"Please provide base_url for {api_name}." + self.client = AsyncAzureOpenAI( + api_key=self.api_key, + azure_endpoint=self.base_url, + api_version=self.api_version, + azure_deployment=self.model, + ) + else: + raise ValueError(f"Unsupported backend {self.backend}. Use 'openai_api' or 'azure_openai_api'.") + + def _pre_generate(self, text: str, history: List[str]) -> Dict: + kwargs = { + "temperature": self.temperature, + "top_p": self.top_p, + "max_tokens": self.max_tokens, + } + if self.seed: + kwargs["seed"] = self.seed + if self.json_mode: + kwargs["response_format"] = {"type": "json_object"} + + messages = [] + if self.system_prompt: + messages.append({"role": "system", "content": self.system_prompt}) + messages.append({"role": "user", "content": text}) + + if history: + assert len(history) % 2 == 0, "History should have even number of elements." 
+ messages = history + messages + + kwargs["messages"] = messages + return kwargs + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type( + (RateLimitError, APIConnectionError, APITimeoutError) + ), + ) + async def generate_topk_per_token( + self, + text: str, + history: Optional[List[str]] = None, + **extra: Any, + ) -> List[Token]: + kwargs = self._pre_generate(text, history) + if self.topk_per_token > 0: + kwargs["logprobs"] = True + kwargs["top_logprobs"] = self.topk_per_token + + # Limit max_tokens to 1 to avoid long completions + kwargs["max_tokens"] = 1 + + completion = await self.client.chat.completions.create( # pylint: disable=E1125 + model=self.model, **kwargs + ) + + tokens = get_top_response_tokens(completion) + + return tokens + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type( + (RateLimitError, APIConnectionError, APITimeoutError) + ), + ) + async def generate_answer( + self, + text: str, + history: Optional[List[str]] = None, + **extra: Any, + ) -> str: + kwargs = self._pre_generate(text, history) + + prompt_tokens = 0 + for message in kwargs["messages"]: + prompt_tokens += len(self.tokenizer.encode(message["content"])) + estimated_tokens = prompt_tokens + kwargs["max_tokens"] + + if self.request_limit: + await self.rpm.wait(silent=True) + await self.tpm.wait(estimated_tokens, silent=True) + + completion = await self.client.chat.completions.create( # pylint: disable=E1125 + model=self.model, **kwargs + ) + if hasattr(completion, "usage"): + self.token_usage.append( + { + "prompt_tokens": completion.usage.prompt_tokens, + "completion_tokens": completion.usage.completion_tokens, + "total_tokens": completion.usage.total_tokens, + } + ) + return self.filter_think_tags(completion.choices[0].message.content) + + async def generate_inputs_prob( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + """Generate probabilities for each token in the input.""" + raise NotImplementedError diff --git a/graphgen/models/llm/limitter.py b/graphgen/models/llm/limitter.py index 01cb1f70..5aee4501 100644 --- a/graphgen/models/llm/limitter.py +++ b/graphgen/models/llm/limitter.py @@ -1,17 +1,17 @@ +import asyncio import time from datetime import datetime, timedelta -import asyncio from graphgen.utils import logger class RPM: - def __init__(self, rpm: int = 1000): self.rpm = rpm - self.record = {'rpm_slot': self.get_minute_slot(), 'counter': 0} + self.record = {"rpm_slot": self.get_minute_slot(), "counter": 0} - def get_minute_slot(self): + @staticmethod + def get_minute_slot(): current_time = time.time() dt_object = datetime.fromtimestamp(current_time) total_minutes_since_midnight = dt_object.hour * 60 + dt_object.minute @@ -22,37 +22,35 @@ async def wait(self, silent=False): dt_object = datetime.fromtimestamp(current) minute_slot = self.get_minute_slot() - if self.record['rpm_slot'] == minute_slot: + if self.record["rpm_slot"] == minute_slot: # check RPM exceed - if self.record['counter'] >= self.rpm: + if self.record["counter"] >= self.rpm: # wait until next minute - next_minute = dt_object.replace( - second=0, microsecond=0) + timedelta(minutes=1) + next_minute = dt_object.replace(second=0, microsecond=0) + timedelta( + minutes=1 + ) _next = next_minute.timestamp() sleep_time = abs(_next - current) if not silent: - logger.info('RPM sleep %s', sleep_time) + logger.info("RPM sleep %s", sleep_time) await 
asyncio.sleep(sleep_time) - self.record = { - 'rpm_slot': self.get_minute_slot(), - 'counter': 0 - } + self.record = {"rpm_slot": self.get_minute_slot(), "counter": 0} else: - self.record = {'rpm_slot': self.get_minute_slot(), 'counter': 0} - self.record['counter'] += 1 + self.record = {"rpm_slot": self.get_minute_slot(), "counter": 0} + self.record["counter"] += 1 if not silent: logger.debug(self.record) class TPM: - def __init__(self, tpm: int = 20000): self.tpm = tpm - self.record = {'tpm_slot': self.get_minute_slot(), 'counter': 0} + self.record = {"tpm_slot": self.get_minute_slot(), "counter": 0} - def get_minute_slot(self): + @staticmethod + def get_minute_slot(): current_time = time.time() dt_object = datetime.fromtimestamp(current_time) total_minutes_since_midnight = dt_object.hour * 60 + dt_object.minute @@ -64,25 +62,25 @@ async def wait(self, token_count, silent=False): minute_slot = self.get_minute_slot() # get next slot, skip - if self.record['tpm_slot'] != minute_slot: - self.record = {'tpm_slot': minute_slot, 'counter': token_count} + if self.record["tpm_slot"] != minute_slot: + self.record = {"tpm_slot": minute_slot, "counter": token_count} return # check RPM exceed - self.record['counter'] += token_count - if self.record['counter'] > self.tpm: + old_counter = self.record["counter"] + self.record["counter"] += token_count + if self.record["counter"] > self.tpm: + logger.info("Current TPM: %s, limit: %s", old_counter, self.tpm) # wait until next minute - next_minute = dt_object.replace( - second=0, microsecond=0) + timedelta(minutes=1) + next_minute = dt_object.replace(second=0, microsecond=0) + timedelta( + minutes=1 + ) _next = next_minute.timestamp() sleep_time = abs(_next - current) - logger.info('TPM sleep %s', sleep_time) + logger.warning("TPM limit exceeded, wait %s seconds", sleep_time) await asyncio.sleep(sleep_time) - self.record = { - 'tpm_slot': self.get_minute_slot(), - 'counter': token_count - } + self.record = {"tpm_slot": self.get_minute_slot(), "counter": token_count} if not silent: logger.debug(self.record) diff --git a/graphgen/models/search/__init__.py b/graphgen/models/llm/local/__init__.py similarity index 100% rename from graphgen/models/search/__init__.py rename to graphgen/models/llm/local/__init__.py diff --git a/graphgen/models/llm/local/hf_wrapper.py b/graphgen/models/llm/local/hf_wrapper.py new file mode 100644 index 00000000..b0538aad --- /dev/null +++ b/graphgen/models/llm/local/hf_wrapper.py @@ -0,0 +1,147 @@ +from typing import Any, List, Optional + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper +from graphgen.bases.datatypes import Token + + +class HuggingFaceWrapper(BaseLLMWrapper): + """ + Async inference backend based on HuggingFace Transformers + """ + + def __init__( + self, + model: str, + torch_dtype="auto", + device_map="auto", + trust_remote_code=True, + temperature=0.0, + top_p=1.0, + topk=5, + **kwargs: Any, + ): + super().__init__(temperature=temperature, top_p=top_p, **kwargs) + + try: + import torch + from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + GenerationConfig, + ) + except ImportError as exc: + raise ImportError( + "HuggingFaceWrapper requires torch, transformers and accelerate. 
" + "Install them with: pip install torch transformers accelerate" + ) from exc + + self.torch = torch + self.AutoTokenizer = AutoTokenizer + self.AutoModelForCausalLM = AutoModelForCausalLM + self.GenerationConfig = GenerationConfig + + self.tokenizer = AutoTokenizer.from_pretrained( + model, trust_remote_code=trust_remote_code + ) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + self.model = AutoModelForCausalLM.from_pretrained( + model, + torch_dtype=torch_dtype, + device_map=device_map, + trust_remote_code=trust_remote_code, + ) + self.model.eval() + self.temperature = temperature + self.top_p = top_p + self.topk = topk + + @staticmethod + def _build_inputs(prompt: str, history: Optional[List[str]] = None) -> str: + msgs = history or [] + lines = [] + for m in msgs: + if isinstance(m, dict): + role = m.get("role", "") + content = m.get("content", "") + lines.append(f"{role}: {content}") + else: + lines.append(str(m)) + lines.append(prompt) + return "\n".join(lines) + + async def generate_answer( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> str: + full = self._build_inputs(text, history) + inputs = self.tokenizer(full, return_tensors="pt").to(self.model.device) + + gen_kwargs = { + "max_new_tokens": extra.get("max_new_tokens", 512), + "do_sample": self.temperature > 0, + "temperature": self.temperature if self.temperature > 0 else 1.0, + "pad_token_id": self.tokenizer.eos_token_id, + } + + # Add top_p and top_k only if temperature > 0 + if self.temperature > 0: + gen_kwargs.update(top_p=self.top_p, top_k=self.topk) + + gen_config = self.GenerationConfig(**gen_kwargs) + + with self.torch.no_grad(): + out = self.model.generate(**inputs, generation_config=gen_config) + + gen = out[0, inputs.input_ids.shape[-1] :] + return self.tokenizer.decode(gen, skip_special_tokens=True) + + async def generate_topk_per_token( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + full = self._build_inputs(text, history) + inputs = self.tokenizer(full, return_tensors="pt").to(self.model.device) + + with self.torch.no_grad(): + out = self.model.generate( + **inputs, + max_new_tokens=1, + do_sample=False, + temperature=1.0, + return_dict_in_generate=True, + output_scores=True, + pad_token_id=self.tokenizer.eos_token_id, + ) + + scores = out.scores[0][0] # (vocab,) + probs = self.torch.softmax(scores, dim=-1) + top_probs, top_idx = self.torch.topk(probs, k=self.topk) + + tokens = [] + for p, idx in zip(top_probs.cpu().numpy(), top_idx.cpu().numpy()): + tokens.append(Token(self.tokenizer.decode([idx]), float(p))) + return tokens + + async def generate_inputs_prob( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + full = self._build_inputs(text, history) + ids = self.tokenizer.encode(full) + logprobs = [] + + for i in range(1, len(ids) + 1): + trunc = ids[: i - 1] + ids[i:] if i < len(ids) else ids[:-1] + inputs = self.torch.tensor([trunc]).to(self.model.device) + + with self.torch.no_grad(): + logits = self.model(inputs).logits[0, -1, :] + probs = self.torch.softmax(logits, dim=-1) + + true_id = ids[i - 1] + logprobs.append( + Token( + self.tokenizer.decode([true_id]), + float(probs[true_id].cpu()), + ) + ) + return logprobs diff --git a/graphgen/models/llm/local/sglang_wrapper.py b/graphgen/models/llm/local/sglang_wrapper.py new file mode 100644 index 00000000..1918fc79 --- /dev/null +++ b/graphgen/models/llm/local/sglang_wrapper.py @@ -0,0 +1,140 @@ 
+import math +from typing import Any, Dict, List, Optional + +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper +from graphgen.bases.datatypes import Token + + +class SGLangWrapper(BaseLLMWrapper): + """ + Async inference backend based on SGLang offline engine. + """ + + def __init__( + self, + model: str, + tp_size: int = 1, + temperature: float = 0.0, + top_p: float = 1.0, + topk: int = 5, + **kwargs: Any, + ): + super().__init__(temperature=temperature, top_p=top_p, **kwargs) + try: + import sglang as sgl + from sglang.utils import async_stream_and_merge, stream_and_merge + except ImportError as exc: + raise ImportError( + "SGLangWrapper requires sglang. Install it with: " + "uv pip install sglang --prerelease=allow" + ) from exc + + self.model_path: str = model + self.temperature = temperature + self.top_p = top_p + self.topk = topk + self.tp_size = int(tp_size) + + # Initialise the offline engine + self.engine = sgl.Engine(model_path=self.model_path, tp_size=self.tp_size) + + # Keep helpers for streaming + self.async_stream_and_merge = async_stream_and_merge + self.stream_and_merge = stream_and_merge + + @staticmethod + def _build_sampling_params( + temperature: float, + top_p: float, + max_tokens: int, + topk: int, + logprobs: bool = False, + ) -> Dict[str, Any]: + """Build SGLang-compatible sampling-params dict.""" + params = { + "temperature": temperature, + "top_p": top_p, + "max_new_tokens": max_tokens, + } + if logprobs and topk > 0: + params["logprobs"] = topk + return params + + def _prep_prompt(self, text: str, history: Optional[List[dict]] = None) -> str: + """Convert raw text (+ optional history) into a single prompt string.""" + parts = [] + if self.system_prompt: + parts.append(self.system_prompt) + if history: + assert len(history) % 2 == 0, "History must have even length (u/a turns)." 
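        # Illustrative (content invented): with system_prompt="You are a helpful assistant",
        # one user/assistant turn in history, and text="Continue the answer.", the parts
        # collected below are joined with "\n" into a single flat prompt string:
        #   "You are a helpful assistant\n<user turn>\n<assistant turn>\nContinue the answer."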
+ parts.extend([item["content"] for item in history]) + parts.append(text) + return "\n".join(parts) + + def _tokens_from_output(self, output: Dict[str, Any]) -> List[Token]: + tokens: List[Token] = [] + + meta = output.get("meta_info", {}) + logprobs = meta.get("output_token_logprobs", []) + topks = meta.get("output_top_logprobs", []) + + tokenizer = self.engine.tokenizer_manager.tokenizer + + for idx, (lp, tid, _) in enumerate(logprobs): + prob = math.exp(lp) + tok_str = tokenizer.decode([tid]) + + top_candidates = [] + if self.topk > 0 and idx < len(topks): + for t_lp, t_tid, _ in topks[idx][: self.topk]: + top_candidates.append( + Token(text=tokenizer.decode([t_tid]), prob=math.exp(t_lp)) + ) + + tokens.append(Token(text=tok_str, prob=prob, top_candidates=top_candidates)) + + return tokens + + async def generate_answer( + self, + text: str, + history: Optional[List[str]] = None, + **extra: Any, + ) -> str: + prompt = self._prep_prompt(text, history) + sampling_params = self._build_sampling_params( + temperature=self.temperature, + top_p=self.top_p, + max_tokens=self.max_tokens, + topk=0, # no logprobs needed for simple generation + ) + + outputs = await self.engine.async_generate([prompt], sampling_params) + return self.filter_think_tags(outputs[0]["text"]) + + async def generate_topk_per_token( + self, + text: str, + history: Optional[List[str]] = None, + **extra: Any, + ) -> List[Token]: + prompt = self._prep_prompt(text, history) + sampling_params = self._build_sampling_params( + temperature=self.temperature, + top_p=self.top_p, + max_tokens=1, # keep short for token-level analysis + topk=self.topk, + ) + + outputs = await self.engine.async_generate( + [prompt], sampling_params, return_logprob=True, top_logprobs_num=5 + ) + print(outputs) + return self._tokens_from_output(outputs[0]) + + async def generate_inputs_prob( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + raise NotImplementedError( + "SGLangWrapper does not support per-token logprobs yet." + ) diff --git a/graphgen/models/llm/local/tgi_wrapper.py b/graphgen/models/llm/local/tgi_wrapper.py new file mode 100644 index 00000000..a722f6ea --- /dev/null +++ b/graphgen/models/llm/local/tgi_wrapper.py @@ -0,0 +1,36 @@ +from typing import Any, List, Optional + +from graphgen.bases import BaseLLMWrapper +from graphgen.bases.datatypes import Token + + +# TODO: implement TGIWrapper methods +class TGIWrapper(BaseLLMWrapper): + """ + Async inference backend based on TGI (Text-Generation-Inference) + """ + + def __init__( + self, + model_url: str, # e.g. 
"http://localhost:8080" + temperature: float = 0.0, + top_p: float = 1.0, + topk: int = 5, + **kwargs: Any + ): + super().__init__(temperature=temperature, top_p=top_p, **kwargs) + + async def generate_answer( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> str: + pass + + async def generate_topk_per_token( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + pass + + async def generate_inputs_prob( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + pass diff --git a/graphgen/models/llm/local/trt_wrapper.py b/graphgen/models/llm/local/trt_wrapper.py new file mode 100644 index 00000000..078f5ba9 --- /dev/null +++ b/graphgen/models/llm/local/trt_wrapper.py @@ -0,0 +1,26 @@ +from typing import Any, List, Optional + +from graphgen.bases import BaseLLMWrapper +from graphgen.bases.datatypes import Token + + +# TODO: implement TensorRTWrapper methods +class TensorRTWrapper(BaseLLMWrapper): + """ + Async inference backend based on TensorRT-LLM + """ + + async def generate_answer( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> str: + pass + + async def generate_topk_per_token( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + pass + + async def generate_inputs_prob( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + pass diff --git a/graphgen/models/llm/local/vllm_wrapper.py b/graphgen/models/llm/local/vllm_wrapper.py new file mode 100644 index 00000000..5d18e99c --- /dev/null +++ b/graphgen/models/llm/local/vllm_wrapper.py @@ -0,0 +1,126 @@ +import uuid +import math +from typing import Any, List, Optional +from graphgen.bases.base_llm_wrapper import BaseLLMWrapper +from graphgen.bases.datatypes import Token + +class VLLMWrapper(BaseLLMWrapper): + """ + Async inference backend based on vLLM. + """ + def __init__( + self, + model: str, + tensor_parallel_size: int = 1, + gpu_memory_utilization: float = 0.9, + temperature: float = 0.0, + top_p: float = 1.0, + topk: int = 5, + **kwargs: Any, + ): + super().__init__(temperature=temperature, top_p=top_p, **kwargs) + try: + from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams + except ImportError as exc: + raise ImportError( + "VLLMWrapper requires vllm. 
Install it with: uv pip install vllm" + ) from exc + + self.SamplingParams = SamplingParams + + engine_args = AsyncEngineArgs( + model=model, + tensor_parallel_size=int(tensor_parallel_size), + gpu_memory_utilization=float(gpu_memory_utilization), + trust_remote_code=kwargs.get("trust_remote_code", True), + disable_log_stats=False, + ) + self.engine = AsyncLLMEngine.from_engine_args(engine_args) + self.temperature = temperature + self.top_p = top_p + self.topk = topk + + @staticmethod + def _build_inputs(prompt: str, history: Optional[List[str]] = None) -> str: + msgs = history or [] + lines = [] + for m in msgs: + if isinstance(m, dict): + role = m.get("role", "") + content = m.get("content", "") + lines.append(f"{role}: {content}") + else: + lines.append(str(m)) + lines.append(prompt) + return "\n".join(lines) + + async def generate_answer( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> str: + full_prompt = self._build_inputs(text, history) + request_id = f"graphgen_req_{uuid.uuid4()}" + + sp = self.SamplingParams( + temperature=self.temperature if self.temperature > 0 else 1.0, + top_p=self.top_p if self.temperature > 0 else 1.0, + max_tokens=extra.get("max_new_tokens", 512), + ) + + result_generator = self.engine.generate(full_prompt, sp, request_id=request_id) + + final_output = None + async for request_output in result_generator: + final_output = request_output + + if not final_output or not final_output.outputs: + return "" + + return final_output.outputs[0].text + + async def generate_topk_per_token( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + full_prompt = self._build_inputs(text, history) + request_id = f"graphgen_topk_{uuid.uuid4()}" + + sp = self.SamplingParams( + temperature=0, + max_tokens=1, + logprobs=self.topk, + prompt_logprobs=1, + ) + + result_generator = self.engine.generate(full_prompt, sp, request_id=request_id) + + final_output = None + async for request_output in result_generator: + final_output = request_output + + if not final_output or not final_output.outputs or not final_output.outputs[0].logprobs: + return [] + + top_logprobs = final_output.outputs[0].logprobs[0] + + candidate_tokens = [] + for _, logprob_obj in top_logprobs.items(): + tok_str = logprob_obj.decoded_token.strip() if logprob_obj.decoded_token else "" + prob = float(math.exp(logprob_obj.logprob)) + candidate_tokens.append(Token(tok_str, prob)) + + candidate_tokens.sort(key=lambda x: -x.prob) + + if candidate_tokens: + main_token = Token( + text=candidate_tokens[0].text, + prob=candidate_tokens[0].prob, + top_candidates=candidate_tokens + ) + return [main_token] + return [] + + async def generate_inputs_prob( + self, text: str, history: Optional[List[str]] = None, **extra: Any + ) -> List[Token]: + raise NotImplementedError( + "VLLMWrapper does not support per-token logprobs yet." 
+ ) diff --git a/graphgen/models/llm/openai_model.py b/graphgen/models/llm/openai_model.py deleted file mode 100644 index 6973c1ce..00000000 --- a/graphgen/models/llm/openai_model.py +++ /dev/null @@ -1,130 +0,0 @@ -import math -from dataclasses import dataclass, field -from typing import List, Dict, Optional -import openai -from openai import AsyncOpenAI, RateLimitError, APIConnectionError, APITimeoutError -from tenacity import ( - retry, - stop_after_attempt, - wait_exponential, - retry_if_exception_type, -) - -from graphgen.models.llm.topk_token_model import TopkTokenModel, Token -from graphgen.models.llm.tokenizer import Tokenizer -from graphgen.models.llm.limitter import RPM, TPM - -def get_top_response_tokens(response: openai.ChatCompletion) -> List[Token]: - token_logprobs = response.choices[0].logprobs.content - tokens = [] - for token_prob in token_logprobs: - prob = math.exp(token_prob.logprob) - candidate_tokens = [ - Token(t.token, math.exp(t.logprob)) - for t in token_prob.top_logprobs - ] - token = Token(token_prob.token, prob, top_candidates=candidate_tokens) - tokens.append(token) - return tokens - -@dataclass -class OpenAIModel(TopkTokenModel): - model_name: str = "gpt-4o-mini" - api_key: str = None - base_url: str = None - - system_prompt: str = "" - json_mode: bool = False - seed: int = None - - token_usage: list = field(default_factory=list) - request_limit: bool = False - rpm: RPM = field(default_factory=lambda: RPM(rpm=1000)) - tpm: TPM = field(default_factory=lambda: TPM(tpm=50000)) - - - def __post_init__(self): - assert self.api_key is not None, "Please provide api key to access openai api." - if self.api_key == "": - self.api_key = "none" - self.client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url) - - def _pre_generate(self, text: str, history: List[str]) -> Dict: - kwargs = { - "temperature": self.temperature, - "top_p": self.topp, - "max_tokens": self.max_tokens, - } - if self.seed: - kwargs["seed"] = self.seed - if self.json_mode: - kwargs["response_format"] = {"type": "json_object"} - - messages = [] - if self.system_prompt: - messages.append({"role": "system", "content": self.system_prompt}) - messages.append({"role": "user", "content": text}) - - if history: - assert len(history) % 2 == 0, "History should have even number of elements." 
- messages = history + messages - - kwargs['messages']= messages - return kwargs - - - @retry( - stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_exception_type((RateLimitError, APIConnectionError, APITimeoutError)), - ) - async def generate_topk_per_token(self, text: str, history: Optional[List[str]] = None) -> List[Token]: - kwargs = self._pre_generate(text, history) - if self.topk_per_token > 0: - kwargs["logprobs"] = True - kwargs["top_logprobs"] = self.topk_per_token - - # Limit max_tokens to 1 to avoid long completions - kwargs["max_tokens"] = 1 - - completion = await self.client.chat.completions.create( # pylint: disable=E1125 - model=self.model_name, - **kwargs - ) - - tokens = get_top_response_tokens(completion) - - return tokens - - @retry( - stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_exception_type((RateLimitError, APIConnectionError, APITimeoutError)), - ) - async def generate_answer(self, text: str, history: Optional[List[str]] = None, temperature: int = 0) -> str: - kwargs = self._pre_generate(text, history) - kwargs["temperature"] = temperature - - prompt_tokens = 0 - for message in kwargs['messages']: - prompt_tokens += len(Tokenizer().encode_string(message['content'])) - estimated_tokens = prompt_tokens + kwargs['max_tokens'] - - if self.request_limit: - await self.rpm.wait(silent=True) - await self.tpm.wait(estimated_tokens, silent=True) - - completion = await self.client.chat.completions.create( # pylint: disable=E1125 - model=self.model_name, - **kwargs - ) - if hasattr(completion, "usage"): - self.token_usage.append({ - "prompt_tokens": completion.usage.prompt_tokens, - "completion_tokens": completion.usage.completion_tokens, - "total_tokens": completion.usage.total_tokens, - }) - return completion.choices[0].message.content - - async def generate_inputs_prob(self, text: str, history: Optional[List[str]] = None) -> List[Token]: - raise NotImplementedError diff --git a/graphgen/models/llm/tokenizer.py b/graphgen/models/llm/tokenizer.py deleted file mode 100644 index 6a1c4b22..00000000 --- a/graphgen/models/llm/tokenizer.py +++ /dev/null @@ -1,73 +0,0 @@ -from dataclasses import dataclass -from typing import List -import tiktoken - -try: - from transformers import AutoTokenizer - TRANSFORMERS_AVAILABLE = True -except ImportError: - AutoTokenizer = None - TRANSFORMERS_AVAILABLE = False - - -def get_tokenizer(tokenizer_name: str = "cl100k_base"): - """ - Get a tokenizer instance by name. 
- - :param tokenizer_name: tokenizer name, tiktoken encoding name or Hugging Face model name - :return: tokenizer instance - """ - if tokenizer_name in tiktoken.list_encoding_names(): - return tiktoken.get_encoding(tokenizer_name) - if TRANSFORMERS_AVAILABLE: - try: - return AutoTokenizer.from_pretrained(tokenizer_name) - except Exception as e: - raise ValueError(f"Failed to load tokenizer from Hugging Face: {e}") from e - else: - raise ValueError("Hugging Face Transformers is not available, please install it first.") - -@dataclass -class Tokenizer: - model_name: str = "cl100k_base" - - def __post_init__(self): - self.tokenizer = get_tokenizer(self.model_name) - - def encode_string(self, text: str) -> List[int]: - """ - Encode text to tokens - - :param text - :return: tokens - """ - return self.tokenizer.encode(text) - - def decode_tokens(self, tokens: List[int]) -> str: - """ - Decode tokens to text - - :param tokens - :return: text - """ - return self.tokenizer.decode(tokens) - - def chunk_by_token_size( - self, content: str, overlap_token_size=128, max_token_size=1024 - ): - tokens = self.encode_string(content) - results = [] - for index, start in enumerate( - range(0, len(tokens), max_token_size - overlap_token_size) - ): - chunk_content = self.decode_tokens( - tokens[start : start + max_token_size] - ) - results.append( - { - "tokens": min(max_token_size, len(tokens) - start), - "content": chunk_content.strip(), - "chunk_order_index": index, - } - ) - return results diff --git a/graphgen/models/llm/topk_token_model.py b/graphgen/models/llm/topk_token_model.py deleted file mode 100644 index b7595cb1..00000000 --- a/graphgen/models/llm/topk_token_model.py +++ /dev/null @@ -1,48 +0,0 @@ -import math -from dataclasses import dataclass, field -from typing import List, Union, Optional - - -@dataclass -class Token: - text: str - prob: float - top_candidates: List = field(default_factory=list) - ppl: Union[float, None] = field(default=None) - - @property - def logprob(self) -> float: - return math.log(self.prob) - - -@dataclass -class TopkTokenModel: - do_sample: bool = False - temperature: float = 0 - max_tokens: int = 4096 - repetition_penalty: float = 1.05 - num_beams: int = 1 - topk: int = 50 - topp: float = 0.95 - - topk_per_token: int = 5 # number of topk tokens to generate for each token - - async def generate_topk_per_token(self, text: str) -> List[Token]: - """ - Generate prob, text and candidates for each token of the model's output. - This function is used to visualize the inference process. - """ - raise NotImplementedError - - async def generate_inputs_prob(self, text: str, history: Optional[List[str]] = None) -> List[Token]: - """ - Generate prob and text for each token of the input text. - This function is used to visualize the ppl. - """ - raise NotImplementedError - - async def generate_answer(self, text: str, history: Optional[List[str]] = None) -> str: - """ - Generate answer from the model. 
- """ - raise NotImplementedError diff --git a/graphgen/models/partitioner/__init__.py b/graphgen/models/partitioner/__init__.py new file mode 100644 index 00000000..2e1bcb68 --- /dev/null +++ b/graphgen/models/partitioner/__init__.py @@ -0,0 +1,5 @@ +from .anchor_bfs_partitioner import AnchorBFSPartitioner +from .bfs_partitioner import BFSPartitioner +from .dfs_partitioner import DFSPartitioner +from .ece_partitioner import ECEPartitioner +from .leiden_partitioner import LeidenPartitioner diff --git a/graphgen/models/partitioner/anchor_bfs_partitioner.py b/graphgen/models/partitioner/anchor_bfs_partitioner.py new file mode 100644 index 00000000..d75dc2e8 --- /dev/null +++ b/graphgen/models/partitioner/anchor_bfs_partitioner.py @@ -0,0 +1,190 @@ +import random +from collections import deque +from typing import Any, Iterable, List, Literal, Set, Tuple, Union + +from graphgen.bases import BaseGraphStorage +from graphgen.bases.datatypes import Community + +from .bfs_partitioner import BFSPartitioner + +NODE_UNIT: str = "n" +EDGE_UNIT: str = "e" + + +class AnchorBFSPartitioner(BFSPartitioner): + """ + Anchor BFS partitioner that partitions the graph into communities of a fixed size. + 1. Randomly choose a node of a specified type as the anchor. + 2. Expand the community using BFS until the max unit size is reached.(A unit is a node or an edge.) + 3. Non-anchor units can only be "pulled" into a community and never become seeds themselves. + For example, for VQA tasks, we may want to use image nodes as anchors and expand to nearby text nodes and edges. + + Supports multiple anchor types for multi-omics data: anchor_type can be a single string or a list of strings. + When a list is provided, nodes matching any of the types in the list can serve as anchors. + """ + + def __init__( + self, + *, + anchor_type: Union[ + Literal["image", "dna", "rna", "protein"], + List[Literal["dna", "rna", "protein"]], + ] = "image", + anchor_ids: Set[str] | None = None, + ) -> None: + super().__init__() + # Normalize anchor_type to always be a list for internal processing + if isinstance(anchor_type, str): + self.anchor_types = [anchor_type] + else: + self.anchor_types = list(anchor_type) + self.anchor_ids = anchor_ids + + def partition( + self, + g: BaseGraphStorage, + max_units_per_community: int = 1, + **kwargs: Any, + ) -> Iterable[Community]: + nodes = g.get_all_nodes() # List[tuple[id, meta]] + edges = g.get_all_edges() # List[tuple[u, v, meta]] + + adj, _ = self._build_adjacency_list(nodes, edges) + + anchors: Set[str] = self._pick_anchor_ids(nodes) + if not anchors: + return # if no anchors, return nothing + + used_n: set[str] = set() + used_e: set[frozenset[str]] = set() + + seeds = list(anchors) + random.shuffle(seeds) + + for seed_node in seeds: + if seed_node in used_n: + continue + comm_n, comm_e = self._grow_community( + seed_node, adj, max_units_per_community, used_n, used_e + ) + if comm_n or comm_e: + yield Community(id=seed_node, nodes=comm_n, edges=comm_e) + + def _pick_anchor_ids( + self, + nodes: List[tuple[str, dict]], + ) -> Set[str]: + if self.anchor_ids is not None: + return self.anchor_ids + + anchor_ids: Set[str] = set() + anchor_types_lower = [at.lower() for at in self.anchor_types] + + for node_id, meta in nodes: + # Check if node matches any of the anchor types + matched = False + + # Check 1: entity_type (for image, etc.) 
+ node_type = str(meta.get("entity_type", "")).lower() + for anchor_type_lower in anchor_types_lower: + if anchor_type_lower in node_type: + anchor_ids.add(node_id) + matched = True + break + + if matched: + continue + + # Check 2: molecule_type (for omics data: dna, rna, protein) + molecule_type = str(meta.get("molecule_type", "")).lower() + if molecule_type in anchor_types_lower: + anchor_ids.add(node_id) + continue + + # Check 3: source_id prefix (for omics data: dna-, rna-, protein-) + source_id = str(meta.get("source_id", "")).lower() + for anchor_type_lower in anchor_types_lower: + if source_id.startswith(f"{anchor_type_lower}-"): + anchor_ids.add(node_id) + matched = True + break + + if matched: + continue + + # Check 4: Check if source_id contains multiple IDs separated by + if "" in source_id: + source_ids = source_id.split("") + for sid in source_ids: + sid = sid.strip() + for anchor_type_lower in anchor_types_lower: + if sid.startswith(f"{anchor_type_lower}-"): + anchor_ids.add(node_id) + matched = True + break + if matched: + break + + return anchor_ids + + @staticmethod + def _grow_community( + seed: str, + adj: dict[str, List[str]], + max_units: int, + used_n: set[str], + used_e: set[frozenset[str]], + ) -> Tuple[List[str], List[Tuple[str, str]]]: + """ + Grow a community from the seed node using BFS. + :param seed: seed node id + :param adj: adjacency list + :param max_units: maximum number of units (nodes + edges) in the community + :param used_n: set of used node ids + :param used_e: set of used edge keys + :return: (list of node ids, list of edge tuples) + """ + comm_n: List[str] = [] + comm_e: List[Tuple[str, str]] = [] + queue: deque[tuple[str, Any]] = deque([(NODE_UNIT, seed)]) + cnt = 0 + + while queue and cnt < max_units: + k, it = queue.popleft() + + if k == NODE_UNIT: + if it in used_n: + continue + used_n.add(it) + comm_n.append(it) + cnt += 1 + for nei in adj[it]: + e_key = frozenset((it, nei)) + if e_key not in used_e: + queue.append((EDGE_UNIT, e_key)) + else: # EDGE_UNIT + if it in used_e: + continue + used_e.add(it) + # Convert frozenset to tuple for edge representation + # Note: Self-loops should be filtered during graph construction, + # but we handle edge cases defensively + try: + u, v = tuple(it) + except ValueError: + # Handle edge case: frozenset with unexpected number of elements + # This should not happen if graph construction is correct + edge_nodes = list(it) + if len(edge_nodes) == 1: + # Self-loop edge (should have been filtered during graph construction) + u, v = edge_nodes[0], edge_nodes[0] + else: + # Invalid edge, skip it + continue + comm_e.append((u, v)) + cnt += 1 + for n in it: + if n not in used_n: + queue.append((NODE_UNIT, n)) + + return comm_n, comm_e diff --git a/graphgen/models/partitioner/bfs_partitioner.py b/graphgen/models/partitioner/bfs_partitioner.py new file mode 100644 index 00000000..994e08e8 --- /dev/null +++ b/graphgen/models/partitioner/bfs_partitioner.py @@ -0,0 +1,76 @@ +import random +from collections import deque +from typing import Any, Iterable, List + +from graphgen.bases import BaseGraphStorage, BasePartitioner +from graphgen.bases.datatypes import Community + +NODE_UNIT: str = "n" +EDGE_UNIT: str = "e" + + +class BFSPartitioner(BasePartitioner): + """ + BFS partitioner that partitions the graph into communities of a fixed size. + 1. Randomly choose a unit. + 2. Expand the community using BFS until the max unit size is reached. + (A unit is a node or an edge.) 
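+
+    A rough usage sketch (the concrete BaseGraphStorage backend and the size
+    limit are placeholders, not prescriptions):
+
+        partitioner = BFSPartitioner()
+        for community in partitioner.partition(g, max_units_per_community=10):
+            print(community.id, len(community.nodes), len(community.edges))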
+ """ + + def partition( + self, + g: BaseGraphStorage, + max_units_per_community: int = 1, + **kwargs: Any, + ) -> Iterable[Community]: + nodes = g.get_all_nodes() + edges = g.get_all_edges() + + adj, _ = self._build_adjacency_list(nodes, edges) + + used_n: set[str] = set() + used_e: set[frozenset[str]] = set() + + units = [(NODE_UNIT, n[0]) for n in nodes] + [ + (EDGE_UNIT, frozenset((u, v))) for u, v, _ in edges + ] + random.shuffle(units) + + for kind, seed in units: + if (kind == NODE_UNIT and seed in used_n) or ( + kind == EDGE_UNIT and seed in used_e + ): + continue + + comm_n: List[str] = [] + comm_e: List[tuple[str, str]] = [] + queue: deque[tuple[str, Any]] = deque([(kind, seed)]) + cnt = 0 + + while queue and cnt < max_units_per_community: + k, it = queue.popleft() + if k == NODE_UNIT: + if it in used_n: + continue + used_n.add(it) + comm_n.append(it) + cnt += 1 + for nei in adj[it]: + e_key = frozenset((it, nei)) + if e_key not in used_e: + queue.append((EDGE_UNIT, e_key)) + else: + if it in used_e: + continue + used_e.add(it) + + u, v = it + comm_e.append((u, v)) + cnt += 1 + # push nodes that are not visited + for n in it: + if n not in used_n: + queue.append((NODE_UNIT, n)) + + if comm_n or comm_e: + yield Community(id=seed, nodes=comm_n, edges=comm_e) diff --git a/graphgen/models/partitioner/dfs_partitioner.py b/graphgen/models/partitioner/dfs_partitioner.py new file mode 100644 index 00000000..4d93ad7f --- /dev/null +++ b/graphgen/models/partitioner/dfs_partitioner.py @@ -0,0 +1,74 @@ +import random +from collections.abc import Iterable +from typing import Any + +from graphgen.bases import BaseGraphStorage, BasePartitioner +from graphgen.bases.datatypes import Community + +NODE_UNIT: str = "n" +EDGE_UNIT: str = "e" + + +class DFSPartitioner(BasePartitioner): + """ + DFS partitioner that partitions the graph into communities of a fixed size. + 1. Randomly choose a unit. + 2. Random walk using DFS until the community reaches the max unit size. + (In GraphGen, a unit is defined as a node or an edge.) 
+ """ + + def partition( + self, + g: BaseGraphStorage, + max_units_per_community: int = 1, + **kwargs: Any, + ) -> Iterable[Community]: + nodes = g.get_all_nodes() + edges = g.get_all_edges() + + adj, _ = self._build_adjacency_list(nodes, edges) + + used_n: set[str] = set() + used_e: set[frozenset[str]] = set() + + units = [(NODE_UNIT, n[0]) for n in nodes] + [ + (EDGE_UNIT, frozenset((u, v))) for u, v, _ in edges + ] + random.shuffle(units) + + for kind, seed in units: + if (kind == NODE_UNIT and seed in used_n) or ( + kind == EDGE_UNIT and seed in used_e + ): + continue + + comm_n, comm_e = [], [] + stack = [(kind, seed)] + cnt = 0 + + while stack and cnt < max_units_per_community: + k, it = stack.pop() + if k == NODE_UNIT: + if it in used_n: + continue + used_n.add(it) + comm_n.append(it) + cnt += 1 + for nei in adj[it]: + e_key = frozenset((it, nei)) + if e_key not in used_e: + stack.append((EDGE_UNIT, e_key)) + break + else: + if it in used_e: + continue + used_e.add(it) + comm_e.append(tuple(it)) + cnt += 1 + # push neighboring nodes + for n in it: + if n not in used_n: + stack.append((NODE_UNIT, n)) + + if comm_n or comm_e: + yield Community(id=seed, nodes=comm_n, edges=comm_e) diff --git a/graphgen/models/partitioner/ece_partitioner.py b/graphgen/models/partitioner/ece_partitioner.py new file mode 100644 index 00000000..fcf776c7 --- /dev/null +++ b/graphgen/models/partitioner/ece_partitioner.py @@ -0,0 +1,156 @@ +import random +from collections import deque +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple + +from tqdm import tqdm + +from graphgen.bases import BaseGraphStorage +from graphgen.bases.datatypes import Community +from graphgen.models.partitioner.bfs_partitioner import BFSPartitioner + +NODE_UNIT: str = "n" +EDGE_UNIT: str = "e" + + +class ECEPartitioner(BFSPartitioner): + """ + ECE partitioner that partitions the graph into communities based on Expected Calibration Error (ECE). + We calculate ECE for units in KG (represented as 'comprehension loss') + and group units with similar ECE values into the same community. + 1. Select a sampling strategy. + 2. Choose a unit based on the sampling strategy. + 2. Expand the community using BFS. + 3. When expending, prefer to add units with the sampling strategy. + 4. Stop when the max unit size is reached or the max input length is reached. + (A unit is a node or an edge.) 
+ """ + + @staticmethod + def _sort_units(units: list, edge_sampling: str) -> list: + """ + Sort units with edge sampling strategy + + :param units: total units + :param edge_sampling: edge sampling strategy (random, min_loss, max_loss) + :return: sorted units + """ + if edge_sampling == "random": + random.shuffle(units) + elif edge_sampling == "min_loss": + units = sorted( + units, + key=lambda x: x[-1]["loss"], + ) + elif edge_sampling == "max_loss": + units = sorted( + units, + key=lambda x: x[-1]["loss"], + reverse=True, + ) + else: + raise ValueError(f"Invalid edge sampling: {edge_sampling}") + return units + + def partition( + self, + g: BaseGraphStorage, + max_units_per_community: int = 10, + min_units_per_community: int = 1, + max_tokens_per_community: int = 10240, + unit_sampling: str = "random", + **kwargs: Any, + ) -> Iterable[Community]: + nodes: List[Tuple[str, dict]] = g.get_all_nodes() + edges: List[Tuple[str, str, dict]] = g.get_all_edges() + + adj, _ = self._build_adjacency_list(nodes, edges) + node_dict = dict(nodes) + edge_dict = {frozenset((u, v)): d for u, v, d in edges} + + all_units: List[Tuple[str, Any, dict]] = [ + (NODE_UNIT, nid, d) for nid, d in nodes + ] + [(EDGE_UNIT, frozenset((u, v)), d) for u, v, d in edges] + + used_n: Set[str] = set() + used_e: Set[frozenset[str]] = set() + + all_units = self._sort_units(all_units, unit_sampling) + + def _grow_community(seed_unit: Tuple[str, Any, dict]) -> Optional[Community]: + nonlocal used_n, used_e + + community_nodes: Dict[str, dict] = {} + community_edges: Dict[frozenset[str], dict] = {} + queue = deque() + token_sum = 0 + + def _add_unit(u): + nonlocal token_sum + t, i, d = u + if t == NODE_UNIT: # node + if i in used_n or i in community_nodes: + return False + community_nodes[i] = d + used_n.add(i) + else: # edge + if i in used_e or i in community_edges: + return False + community_edges[i] = d + used_e.add(i) + token_sum += d.get("length", 0) + return True + + _add_unit(seed_unit) + queue.append(seed_unit) + + # BFS + while queue: + if ( + len(community_nodes) + len(community_edges) + >= max_units_per_community + or token_sum >= max_tokens_per_community + ): + break + + cur_type, cur_id, _ = queue.popleft() + + neighbors: List[Tuple[str, Any, dict]] = [] + if cur_type == NODE_UNIT: + for nb_id in adj.get(cur_id, []): + e_key = frozenset((cur_id, nb_id)) + if e_key not in used_e and e_key not in community_edges: + neighbors.append((EDGE_UNIT, e_key, edge_dict[e_key])) + else: + for n_id in cur_id: + if n_id not in used_n and n_id not in community_nodes: + neighbors.append((NODE_UNIT, n_id, node_dict[n_id])) + + neighbors = self._sort_units(neighbors, unit_sampling) + for nb in neighbors: + if ( + len(community_nodes) + len(community_edges) + >= max_units_per_community + or token_sum >= max_tokens_per_community + ): + break + if _add_unit(nb): + queue.append(nb) + + if len(community_nodes) + len(community_edges) < min_units_per_community: + return None + + return Community( + id=seed_unit[1], + nodes=list(community_nodes.keys()), + edges=[(u, v) for (u, v), _ in community_edges.items()], + ) + + for unit in tqdm(all_units, desc="ECE partition"): + utype, uid, _ = unit + if (utype == NODE_UNIT and uid in used_n) or ( + utype == EDGE_UNIT and uid in used_e + ): + continue + comm = _grow_community(unit) + if comm: + yield comm diff --git a/graphgen/models/partitioner/leiden_partitioner.py b/graphgen/models/partitioner/leiden_partitioner.py new file mode 100644 index 00000000..b62b8544 --- /dev/null +++ 
b/graphgen/models/partitioner/leiden_partitioner.py @@ -0,0 +1,114 @@ +from collections import defaultdict +from typing import Any, Dict, List, Set, Tuple + +import igraph as ig +from leidenalg import ModularityVertexPartition, find_partition + +from graphgen.bases import BaseGraphStorage, BasePartitioner +from graphgen.bases.datatypes import Community + + +class LeidenPartitioner(BasePartitioner): + """ + Leiden partitioner that partitions the graph into communities using the Leiden algorithm. + """ + + def partition( + self, + g: BaseGraphStorage, + max_size: int = 20, + use_lcc: bool = False, + random_seed: int = 42, + **kwargs: Any, + ) -> List[Community]: + """ + Leiden Partition follows these steps: + 1. export the graph from graph storage + 2. use the leiden algorithm to detect communities, get {node: community_id} + 3. split large communities if max_size is given + 4. convert {node: community_id} to List[Community] + :param g + :param max_size: maximum size of each community, if None or <=0, no limit + :param use_lcc: whether to use the largest connected component only + :param random_seed + :param kwargs: other parameters for the leiden algorithm + :return: + """ + nodes = g.get_all_nodes() # List[Tuple[str, dict]] + edges = g.get_all_edges() # List[Tuple[str, str, dict]] + + node2cid: Dict[str, int] = self._run_leiden(nodes, edges, use_lcc, random_seed) + + if max_size is not None and max_size > 0: + node2cid = self._split_communities(node2cid, max_size) + + cid2nodes: Dict[int, List[str]] = defaultdict(list) + for n, cid in node2cid.items(): + cid2nodes[cid].append(n) + + communities: List[Community] = [] + for cid, nodes in cid2nodes.items(): + node_set: Set[str] = set(nodes) + comm_edges: List[Tuple[str, str]] = [ + (u, v) for u, v, _ in edges if u in node_set and v in node_set + ] + communities.append(Community(id=cid, nodes=nodes, edges=comm_edges)) + return communities + + @staticmethod + def _run_leiden( + nodes: List[Tuple[str, dict]], + edges: List[Tuple[str, str, dict]], + use_lcc: bool = False, + random_seed: int = 42, + ) -> Dict[str, int]: + # build igraph + ig_graph = ig.Graph.TupleList(((u, v) for u, v, _ in edges), directed=False) + + # remove isolated nodes + ig_graph.delete_vertices(ig_graph.vs.select(_degree_eq=0)) + + node2cid: Dict[str, int] = {} + if use_lcc: + lcc = ig_graph.components().giant() + partition = find_partition(lcc, ModularityVertexPartition, seed=random_seed) + for part_id, cluster in enumerate(partition): + for v in cluster: + node2cid[lcc.vs[v]["name"]] = part_id + else: + offset = 0 + for component in ig_graph.components(): + subgraph = ig_graph.induced_subgraph(component) + partition = find_partition( + subgraph, ModularityVertexPartition, seed=random_seed + ) + for part_id, cluster in enumerate(partition): + for v in cluster: + original_node = subgraph.vs[v]["name"] + node2cid[original_node] = part_id + offset + offset += len(partition) + return node2cid + + @staticmethod + def _split_communities(node2cid: Dict[str, int], max_size: int) -> Dict[str, int]: + """ + Split communities larger than max_size into smaller sub-communities. 
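+        For example, a 45-node community with max_size=20 is split into chunks of
+        20, 20 and 5 nodes, each assigned a fresh community id.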
+ """ + cid2nodes: Dict[int, List[str]] = defaultdict(list) + for n, cid in node2cid.items(): + cid2nodes[cid].append(n) + + new_mapping: Dict[str, int] = {} + new_cid = 0 + for nodes in cid2nodes.values(): + if len(nodes) <= max_size: + for n in nodes: + new_mapping[n] = new_cid + new_cid += 1 + else: + for start in range(0, len(nodes), max_size): + chunk = nodes[start : start + max_size] + for n in chunk: + new_mapping[n] = new_cid + new_cid += 1 + return new_mapping diff --git a/graphgen/models/reader/__init__.py b/graphgen/models/reader/__init__.py new file mode 100644 index 00000000..220460c3 --- /dev/null +++ b/graphgen/models/reader/__init__.py @@ -0,0 +1,7 @@ +from .csv_reader import CSVReader +from .json_reader import JSONReader +from .parquet_reader import ParquetReader +from .pdf_reader import PDFReader +from .pickle_reader import PickleReader +from .rdf_reader import RDFReader +from .txt_reader import TXTReader diff --git a/graphgen/models/reader/csv_reader.py b/graphgen/models/reader/csv_reader.py new file mode 100644 index 00000000..a0343d97 --- /dev/null +++ b/graphgen/models/reader/csv_reader.py @@ -0,0 +1,28 @@ +from typing import List, Union + +import ray +from ray.data import Dataset + +from graphgen.bases.base_reader import BaseReader + + +class CSVReader(BaseReader): + """ + Reader for CSV files. + Columns: + - type: The type of the document (e.g., "text", "image", etc.) + - if type is "text", "content" column must be present. + """ + + def read(self, input_path: Union[str, List[str]]) -> Dataset: + """ + Read CSV files and return Ray Dataset. + + :param input_path: Path to CSV file or list of CSV files. + :return: Ray Dataset containing validated and filtered data. + """ + + ds = ray.data.read_csv(input_path) + ds = ds.map_batches(self._validate_batch, batch_format="pandas") + ds = ds.filter(self._should_keep_item) + return ds diff --git a/graphgen/models/reader/json_reader.py b/graphgen/models/reader/json_reader.py new file mode 100644 index 00000000..a25abee8 --- /dev/null +++ b/graphgen/models/reader/json_reader.py @@ -0,0 +1,111 @@ +import json +import os +from typing import Any, Dict, Iterator, List, Union + +import ray +import ray.data + +from graphgen.bases.base_reader import BaseReader +from graphgen.utils import logger + + +class JSONReader(BaseReader): + """ + Reader for JSON and JSONL files. + Columns: + - type: The type of the document (e.g., "text", "image", etc.) + - if type is "text", "content" column must be present. + """ + + def read(self, input_path: Union[str, List[str]]) -> ray.data.Dataset: + """ + Read JSON file and return Ray Dataset. + :param input_path: Path to JSON/JSONL file or list of JSON/JSONL files. + :return: Ray Dataset containing validated and filtered data. 
+ """ + if self.modalities and len(self.modalities) >= 2: + ds: ray.data.Dataset = ray.data.from_items([]) + for file in input_path if isinstance(input_path, list) else [input_path]: + data = [] + if file.endswith(".jsonl"): + with open(file, "r", encoding="utf-8") as f: + for line in f: + item = json.loads(line) + data.append(item) + else: + with open(file, "r", encoding="utf-8") as f: + data = json.load(f) + data = self._unify_schema(data) + file_ds: ray.data.Dataset = ray.data.from_items(data) + ds = ds.union(file_ds) # type: ignore + else: + ds = ray.data.read_json(input_path) + ds = ds.map_batches(self._validate_batch, batch_format="pandas") + ds = ds.filter(self._should_keep_item) + return ds + + def read_stream(self, file_path: str) -> Iterator[Dict[str, Any]]: + """ + Stream read JSONL files line by line without loading entire file into memory. + Returns an iterator that yields filtered documents. + + :param file_path: Path to the JSONL file. + :return: Iterator of dictionaries containing the data. + """ + if not file_path.endswith(".jsonl"): + raise ValueError("read_stream only supports JSONL files, not JSON files") + + with open(file_path, "r", encoding="utf-8") as f: + for line in f: + try: + doc = json.loads(line) + assert "type" in doc, f"Missing 'type' in document: {doc}" + if doc.get("type") == "text" and self.text_column not in doc: + raise ValueError( + f"Missing '{self.text_column}' in document: {doc}" + ) + + # Apply filtering logic inline (similar to BaseReader.filter) + if doc.get("type") == "text": + content = doc.get(self.text_column, "").strip() + if content: + yield doc + elif doc.get("type") in ("image", "table", "equation"): + img_path = doc.get("img_path") + if self._image_exists(img_path): + yield doc + else: + yield doc + except json.JSONDecodeError as e: + logger.error("Error decoding JSON line: %s. Error: %s", line, e) + + @staticmethod + def _image_exists(path_or_url: str, timeout: int = 3) -> bool: + """ + Check if an image exists at the given local path or URL. + :param path_or_url: Local file path or remote URL of the image. + :param timeout: Timeout for remote URL requests in seconds. + :return: True if the image exists, False otherwise. + """ + if not path_or_url: + return False + if not path_or_url.startswith(("http://", "https://", "ftp://")): + path = path_or_url.replace("file://", "", 1) + path = os.path.abspath(path) + return os.path.isfile(path) + try: + import requests + resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout) + return resp.status_code == 200 + except Exception: + return False + + @staticmethod + def _unify_schema(data): + """ + Unify schema for JSON data. + """ + for item in data: + if "content" in item and isinstance(item["content"], dict): + item["content"] = json.dumps(item["content"]) + return data diff --git a/graphgen/models/reader/jsonl_reader.py b/graphgen/models/reader/jsonl_reader.py new file mode 100644 index 00000000..c4979ad3 --- /dev/null +++ b/graphgen/models/reader/jsonl_reader.py @@ -0,0 +1,87 @@ +import json +import os +from typing import Any, Dict, Iterator, List, Union + +from graphgen.bases.base_reader import BaseReader +from graphgen.utils import logger + + +class JSONLReader(BaseReader): + """ + Reader for JSONL files. + Columns: + - type: The type of the document (e.g., "text", "image", etc.) + - if type is "text", "content" column must be present. 
+ """ + + def read(self, input_path: Union[str, List[str]]) -> List[Dict[str, Any]]: + docs = [] + # Handle both single file and list of files + file_paths = input_path if isinstance(input_path, list) else [input_path] + for file_path in file_paths: + with open(file_path, "r", encoding="utf-8") as f: + for line in f: + try: + doc = json.loads(line) + assert "type" in doc, f"Missing 'type' in document: {doc}" + if doc.get("type") == "text" and self.text_column not in doc: + raise ValueError( + f"Missing '{self.text_column}' in document: {doc}" + ) + docs.append(doc) + except json.JSONDecodeError as e: + logger.error("Error decoding JSON line: %s. Error: %s", line, e) + return self.filter(docs) + + def read_stream(self, file_path: str) -> Iterator[Dict[str, Any]]: + """ + Stream read JSONL files line by line without loading entire file into memory. + Returns an iterator that yields filtered documents. + + :param file_path: Path to the JSONL file. + :return: Iterator of dictionaries containing the data. + """ + with open(file_path, "r", encoding="utf-8") as f: + for line in f: + try: + doc = json.loads(line) + assert "type" in doc, f"Missing 'type' in document: {doc}" + if doc.get("type") == "text" and self.text_column not in doc: + raise ValueError( + f"Missing '{self.text_column}' in document: {doc}" + ) + + # Apply filtering logic inline (similar to BaseReader.filter) + if doc.get("type") == "text": + content = doc.get(self.text_column, "").strip() + if content: + yield doc + elif doc.get("type") in ("image", "table", "equation"): + img_path = doc.get("img_path") + if self._image_exists(img_path): + yield doc + else: + yield doc + except json.JSONDecodeError as e: + logger.error("Error decoding JSON line: %s. Error: %s", line, e) + + @staticmethod + def _image_exists(path_or_url: str, timeout: int = 3) -> bool: + """ + Check if an image exists at the given local path or URL. + :param path_or_url: Local file path or remote URL of the image. + :param timeout: Timeout for remote URL requests in seconds. + :return: True if the image exists, False otherwise. + """ + if not path_or_url: + return False + if not path_or_url.startswith(("http://", "https://", "ftp://")): + path = path_or_url.replace("file://", "", 1) + path = os.path.abspath(path) + return os.path.isfile(path) + try: + import requests + resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout) + return resp.status_code == 200 + except Exception: + return False diff --git a/graphgen/models/reader/parquet_reader.py b/graphgen/models/reader/parquet_reader.py new file mode 100644 index 00000000..dd289e31 --- /dev/null +++ b/graphgen/models/reader/parquet_reader.py @@ -0,0 +1,30 @@ +from typing import List, Union + +import ray +from ray.data import Dataset + +from graphgen.bases.base_reader import BaseReader + + +class ParquetReader(BaseReader): + """ + Read parquet files, requiring the schema to be restored to List[Dict[str, Any]]. + Columns: + - type: The type of the document (e.g., "text", "image", etc.) + - if type is "text", "content" column must be present. + """ + + def read(self, input_path: Union[str, List[str]]) -> Dataset: + """ + Read Parquet files using Ray Data. + + :param input_path: Path to Parquet file or list of Parquet files. + :return: Ray Dataset containing validated documents. 
+ """ + if not ray.is_initialized(): + ray.init() + + ds = ray.data.read_parquet(input_path) + ds = ds.map_batches(self._validate_batch, batch_format="pandas") + ds = ds.filter(self._should_keep_item) + return ds diff --git a/graphgen/models/reader/pdf_reader.py b/graphgen/models/reader/pdf_reader.py new file mode 100644 index 00000000..55dab30b --- /dev/null +++ b/graphgen/models/reader/pdf_reader.py @@ -0,0 +1,248 @@ +import json +import os +import subprocess +import tempfile +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +import ray +from ray.data import Dataset + +from graphgen.bases.base_reader import BaseReader +from graphgen.models.reader.txt_reader import TXTReader +from graphgen.utils import logger, pick_device + + +class PDFReader(BaseReader): + """ + PDF files are converted using MinerU, see [MinerU](https://github.com/opendatalab/MinerU). + After conversion, the resulting markdown file is parsed into text, images, tables, and formulas which can be used + for multi-modal graph generation. + """ + + def __init__( + self, + *, + output_dir: Optional[Union[str, Path]] = None, + method: str = "auto", # auto | txt | ocr + lang: Optional[str] = None, # ch / en / ja / ... + backend: Optional[ + str + ] = None, # pipeline | vlm-transformers | vlm-sglang-engine | vlm-sglang-client + device: Optional[str] = "auto", # cpu | cuda | cuda:0 | npu | mps | auto + source: Optional[str] = None, # huggingface | modelscope | local + vlm_url: Optional[str] = None, # 当 backend=vlm-sglang-client 时必填 + start_page: Optional[int] = None, # 0-based + end_page: Optional[int] = None, # 0-based, inclusive + formula: bool = True, + table: bool = True, + return_assets: bool = True, + **other_mineru_kwargs: Any, + ): + super().__init__() + self.output_dir = os.path.join(output_dir, "mineru") if output_dir else None + + if device == "auto": + device = pick_device() + + self._default_kwargs: Dict[str, Any] = { + "method": method, + "lang": lang, + "backend": backend, + "device": device, + "source": source, + "vlm_url": vlm_url, + "start_page": start_page, + "end_page": end_page, + "formula": formula, + "table": table, + **other_mineru_kwargs, + } + self._default_kwargs = { + k: v for k, v in self._default_kwargs.items() if v is not None + } + self.return_assets = return_assets + self.parser = MinerUParser() + self.txt_reader = TXTReader() + + def read( + self, + input_path: Union[str, List[str]], + **override, + ) -> Dataset: + + # Ensure input_path is a list + if isinstance(input_path, str): + input_path = [input_path] + + paths_ds = ray.data.from_items(input_path) + + def process_pdf(row: Dict[str, Any]) -> List[Dict[str, Any]]: + try: + pdf_path = row["item"] + kwargs = {**self._default_kwargs, **override} + return self._call_mineru(Path(pdf_path), kwargs) + except Exception as e: + logger.error("Failed to process %s: %s", row, e) + return [] + + docs_ds = paths_ds.flat_map(process_pdf) + docs_ds = docs_ds.filter(self._should_keep_item) + + return docs_ds + + def _call_mineru( + self, pdf_path: Path, kwargs: Dict[str, Any] + ) -> List[Dict[str, Any]]: + output_dir: Optional[str] = None + if self.output_dir: + output_dir = str(self.output_dir) + + return self.parser.parse_pdf(pdf_path, output_dir=output_dir, **kwargs) + + def _locate_md(self, pdf_path: Path, kwargs: Dict[str, Any]) -> Optional[Path]: + out_dir = ( + Path(self.output_dir) if self.output_dir else Path(tempfile.gettempdir()) + ) + method = kwargs.get("method", "auto") + backend = kwargs.get("backend", "") + if 
backend.startswith("vlm-"): + method = "vlm" + + candidate = Path( + os.path.join(out_dir, pdf_path.stem, method, f"{pdf_path.stem}.md") + ) + if candidate.exists(): + return candidate + candidate = Path(os.path.join(out_dir, f"{pdf_path.stem}.md")) + if candidate.exists(): + return candidate + return None + + +class MinerUParser: + def __init__(self) -> None: + self._check_bin() + + @staticmethod + def parse_pdf( + pdf_path: Union[str, Path], + output_dir: Optional[Union[str, Path]] = None, + method: str = "auto", + device: str = "cpu", + **kw: Any, + ) -> List[Dict[str, Any]]: + pdf = Path(pdf_path).expanduser().resolve() + if not pdf.is_file(): + raise FileNotFoundError(pdf) + + out = ( + Path(output_dir) if output_dir else Path(tempfile.mkdtemp(prefix="mineru_")) + ) + out.mkdir(parents=True, exist_ok=True) + + cached = MinerUParser._try_load_cached_result(str(out), pdf.stem, method) + if cached is not None: + return cached + + MinerUParser._run_mineru(pdf, out, method, device, **kw) + + cached = MinerUParser._try_load_cached_result(str(out), pdf.stem, method) + return cached if cached is not None else [] + + @staticmethod + def _try_load_cached_result( + out_dir: str, pdf_stem: str, method: str + ) -> Optional[List[Dict[str, Any]]]: + """ + try to load cached json result from MinerU output. + :param out_dir: + :param pdf_stem: + :param method: + :return: + """ + json_file = os.path.join( + out_dir, pdf_stem, method, f"{pdf_stem}_content_list.json" + ) + if not os.path.exists(json_file): + return None + + try: + with open(json_file, encoding="utf-8") as f: + data = json.load(f) + except Exception as exc: # pylint: disable=broad-except + logger.warning("Failed to load cached MinerU result: %s", exc) + return None + + base = os.path.dirname(json_file) + results = [] + for it in data: + for key in ("img_path", "table_img_path", "equation_img_path"): + rel_path = it.get(key) + if rel_path: + it[key] = str(Path(base).joinpath(rel_path).resolve()) + if it["type"] == "text": + it["content"] = it["text"] + del it["text"] + for key in ("page_idx", "bbox", "text_level"): + if it.get(key) is not None: + del it[key] + results.append(it) + return results + + @staticmethod + def _run_mineru( + pdf: Path, + out: Path, + method: str, + device: str, + **kw: Any, + ) -> None: + cmd = [ + "mineru", + "-p", + str(pdf), + "-o", + str(out), + "-m", + method, + "-d", + device, + ] + for k, v in kw.items(): + if v is None: + continue + if isinstance(v, bool): + cmd += [f"--{k}", str(v).lower()] + else: + cmd += [f"--{k}", str(v)] + + logger.info("Parsing PDF with MinerU: %s", pdf) + logger.debug("Running MinerU command: %s", " ".join(cmd)) + + proc = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + errors="ignore", + check=False, + ) + if proc.returncode != 0: + raise RuntimeError(f"MinerU failed: {proc.stderr or proc.stdout}") + + @staticmethod + def _check_bin() -> None: + try: + subprocess.run( + ["mineru", "--version"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=True, + ) + except (subprocess.CalledProcessError, FileNotFoundError) as exc: + raise RuntimeError( + "MinerU is not installed or not found in PATH. 
Please install it from pip: \n" + "pip install -U 'mineru[core]'" + ) from exc diff --git a/graphgen/models/reader/pickle_reader.py b/graphgen/models/reader/pickle_reader.py new file mode 100644 index 00000000..6e3d1949 --- /dev/null +++ b/graphgen/models/reader/pickle_reader.py @@ -0,0 +1,78 @@ +import pickle +from typing import List, Union + +import pandas as pd +import ray +from ray.data import Dataset + +from graphgen.bases.base_reader import BaseReader +from graphgen.utils import logger + + +class PickleReader(BaseReader): + """ + Read pickle files, requiring the schema to be restored to List[Dict[str, Any]]. + Each pickle file should contain a list of dictionaries with at least: + - type: The type of the document (e.g., "text", "image", etc.) + - if type is "text", "content" column must be present. + + Note: Uses ray.data.read_binary_files as ray.data.read_pickle is not available. + For Ray >= 2.5, consider using read_pickle if available in your version. + """ + + def read( + self, + input_path: Union[str, List[str]], + ) -> Dataset: + """ + Read Pickle files using Ray Data. + + :param input_path: Path to pickle file or list of pickle files. + :return: Ray Dataset containing validated documents. + """ + if not ray.is_initialized(): + ray.init() + + # Use read_binary_files as a reliable alternative to read_pickle + ds = ray.data.read_binary_files(input_path, include_paths=True) + + # Deserialize pickle files and flatten into individual records + def deserialize_batch(batch: pd.DataFrame) -> pd.DataFrame: + all_records = [] + for _, row in batch.iterrows(): + try: + # Load pickle data from bytes + data = pickle.loads(row["bytes"]) + + # Validate structure + if not isinstance(data, list): + logger.error( + "Pickle file {row['path']} must contain a list, got {type(data)}" + ) + continue + + if not all(isinstance(item, dict) for item in data): + logger.error( + "Pickle file {row['path']} must contain a list of dictionaries" + ) + continue + + # Flatten: each dict in the list becomes a separate row + all_records.extend(data) + except Exception as e: + logger.error( + "Failed to deserialize pickle file %s: %s", row["path"], str(e) + ) + continue + + return pd.DataFrame(all_records) + + # Apply deserialization and flattening + ds = ds.map_batches(deserialize_batch, batch_format="pandas") + + # Validate the schema + ds = ds.map_batches(self._validate_batch, batch_format="pandas") + + # Filter valid items + ds = ds.filter(self._should_keep_item) + return ds diff --git a/graphgen/models/reader/rdf_reader.py b/graphgen/models/reader/rdf_reader.py new file mode 100644 index 00000000..9670107a --- /dev/null +++ b/graphgen/models/reader/rdf_reader.py @@ -0,0 +1,128 @@ +from pathlib import Path +from typing import Any, Dict, List, Union + +import ray +import rdflib +from ray.data import Dataset +from rdflib import Literal +from rdflib.util import guess_format + +from graphgen.bases.base_reader import BaseReader +from graphgen.utils import logger + + +class RDFReader(BaseReader): + """ + Reader for RDF files that extracts triples and represents them as dictionaries. + + Uses Ray Data for distributed processing of multiple RDF files. + """ + + def __init__(self, *, text_column: str = "content", **kwargs): + """ + Initialize RDFReader. + + :param text_column: The column name for text content (default: "content"). + """ + super().__init__(**kwargs) + self.text_column = text_column + + def read( + self, + input_path: Union[str, List[str]], + ) -> Dataset: + """ + Read RDF file(s) using Ray Data. 
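+        Each distinct RDF subject becomes one document: its literal objects are
+        joined into the text column and every predicate is preserved under
+        "properties".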
+ + :param input_path: Path to RDF file or list of RDF files. + :return: Ray Dataset containing extracted documents. + """ + if not ray.is_initialized(): + ray.init() + + # Ensure input_path is a list to prevent Ray from splitting string into characters + if isinstance(input_path, str): + input_path = [input_path] + + # Create dataset from file paths + paths_ds = ray.data.from_items(input_path) + + def process_rdf(row: Dict[str, Any]) -> List[Dict[str, Any]]: + """Process a single RDF file and return list of documents.""" + try: + file_path = row["item"] + return self._parse_rdf_file(Path(file_path)) + except Exception as e: + logger.error( + "Failed to process RDF file %s: %s", row.get("item", "unknown"), e + ) + return [] + + # Process files in parallel and flatten results + docs_ds = paths_ds.flat_map(process_rdf) + + # Filter valid documents + docs_ds = docs_ds.filter(self._should_keep_item) + + return docs_ds + + def _parse_rdf_file(self, file_path: Path) -> List[Dict[str, Any]]: + """ + Parse a single RDF file and extract documents. + + :param file_path: Path to RDF file. + :return: List of document dictionaries. + """ + if not file_path.is_file(): + raise FileNotFoundError(f"RDF file not found: {file_path}") + + g = rdflib.Graph() + fmt = guess_format(str(file_path)) + + try: + g.parse(str(file_path), format=fmt) + except Exception as e: + raise ValueError(f"Cannot parse RDF file {file_path}: {e}") from e + + docs: List[Dict[str, Any]] = [] + + # Process each unique subject in the RDF graph + for subj in set(g.subjects()): + literals = [] + props = {} + + # Extract all triples for this subject + for _, pred, obj in g.triples((subj, None, None)): + pred_str = str(pred) + obj_str = str(obj) + + # Collect literal values as text content + if isinstance(obj, Literal): + literals.append(obj_str) + + # Store all properties (including non-literals) + props.setdefault(pred_str, []).append(obj_str) + + # Join all literal values as the text content + text = " ".join(literals).strip() + if not text: + logger.warning( + "Subject %s in %s has no literal values; document will have empty '%s' field.", + subj, + file_path, + self.text_column, + ) + + # Create document dictionary + doc = { + "id": str(subj), + self.text_column: text, + "properties": props, + "source_file": str(file_path), + } + docs.append(doc) + + if not docs: + logger.warning("RDF file %s contains no valid documents.", file_path) + + return docs diff --git a/graphgen/models/reader/txt_reader.py b/graphgen/models/reader/txt_reader.py new file mode 100644 index 00000000..51a47de2 --- /dev/null +++ b/graphgen/models/reader/txt_reader.py @@ -0,0 +1,32 @@ +from typing import List, Union + +import ray +from ray.data import Dataset + +from graphgen.bases.base_reader import BaseReader + + +class TXTReader(BaseReader): + def read( + self, + input_path: Union[str, List[str]], + ) -> Dataset: + """ + Read text files from the specified input path. + :param input_path: Path to the input text file or list of text files. + :return: Ray Dataset containing the read text data. 
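+
+        Each input file becomes a single row of the form
+        {"type": "text", <text_column>: <UTF-8 decoded file contents>}, which is
+        then passed through the shared _should_keep_item filter.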
+ """ + docs_ds = ray.data.read_binary_files( + input_path, + include_paths=False, + ) + + docs_ds = docs_ds.map( + lambda row: { + "type": "text", + self.text_column: row["bytes"].decode("utf-8"), + } + ) + + docs_ds = docs_ds.filter(self._should_keep_item) + return docs_ds diff --git a/graphgen/models/strategy/__init__.py b/graphgen/models/searcher/__init__.py similarity index 100% rename from graphgen/models/strategy/__init__.py rename to graphgen/models/searcher/__init__.py diff --git a/graphgen/models/text/__init__.py b/graphgen/models/searcher/db/__init__.py similarity index 100% rename from graphgen/models/text/__init__.py rename to graphgen/models/searcher/db/__init__.py diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py new file mode 100644 index 00000000..efaacf4b --- /dev/null +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -0,0 +1,577 @@ +import asyncio +import os +import re +import subprocess +import tempfile +from concurrent.futures import ThreadPoolExecutor +from functools import lru_cache +from http.client import IncompleteRead +from typing import Dict, Optional + +from Bio import Entrez, SeqIO +from Bio.Blast import NCBIWWW, NCBIXML +from requests.exceptions import RequestException +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases import BaseSearcher + + +@lru_cache(maxsize=None) +def _get_pool(): + return ThreadPoolExecutor(max_workers=20) # NOTE:can increase for better parallelism + + +# ensure only one NCBI request at a time +_blast_lock = asyncio.Lock() + + +class NCBISearch(BaseSearcher): + """ + NCBI Search client to search DNA/GenBank/Entrez databases. + 1) Get the gene/DNA by accession number or gene ID. + 2) Search with keywords or gene names (fuzzy search). + 3) Search with FASTA sequence (BLAST search for DNA sequences). + + API Documentation: https://www.ncbi.nlm.nih.gov/home/develop/api/ + Note: NCBI has rate limits (max 3 requests per second), delays are required between requests. + """ + + def __init__( + self, + use_local_blast: bool = False, + local_blast_db: str = "nt_db", + email: str = "email@example.com", + api_key: str = "", + tool: str = "GraphGen", + blast_num_threads: int = 4, + working_dir: str = "cache", + ): + """ + Initialize the NCBI Search client. + + Args: + use_local_blast (bool): Whether to use local BLAST database. + local_blast_db (str): Path to the local BLAST database. + email (str): Email address for NCBI API requests. + api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/. + tool (str): Tool name for NCBI API requests. + blast_num_threads (int): Number of threads for BLAST search. + working_dir (str): Working directory for log files. + """ + super().__init__(working_dir=working_dir) + Entrez.timeout = 60 # 60 seconds timeout + Entrez.email = email + Entrez.tool = tool + if api_key: + Entrez.api_key = api_key + Entrez.max_tries = 10 if api_key else 3 + Entrez.sleep_between_tries = 5 + self.use_local_blast = use_local_blast + self.local_blast_db = local_blast_db + self.blast_num_threads = blast_num_threads + if self.use_local_blast: + # Check for single-file database (.nhr) or multi-file database (.00.nhr) + db_exists = ( + os.path.isfile(f"{self.local_blast_db}.nhr") or + os.path.isfile(f"{self.local_blast_db}.00.nhr") + ) + if not db_exists: + self.logger.error("Local BLAST database files not found. 
Please check the path.") + self.logger.error("Expected: %s.nhr or %s.00.nhr", self.local_blast_db, self.local_blast_db) + self.use_local_blast = False + + @staticmethod + def _nested_get(data: dict, *keys, default=None): + """Safely traverse nested dictionaries.""" + for key in keys: + if not isinstance(data, dict): + return default + data = data.get(key, default) + return data + + @staticmethod + def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]: + """Infer molecule_type_detail from accession prefix or gene type.""" + if accession: + # Map accession prefixes to molecule types + prefix_map = { + ("NM_", "XM_"): "mRNA", + ("NC_", "NT_"): "genomic DNA", + ("NR_", "XR_"): "RNA", + ("NG_",): "genomic region", + } + for prefixes, mol_type in prefix_map.items(): + if accession.startswith(prefixes): + return mol_type + # Fallback: infer from gene type if available + if gene_type is not None: + gene_type_map = { + 3: "rRNA", + 4: "tRNA", + 5: "snRNA", + 6: "ncRNA", + } + return gene_type_map.get(gene_type) + return None + + def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: + """ + Convert an Entrez gene record to a dictionary. + All extraction logic is inlined for maximum clarity and performance. + """ + if not gene_record: + raise ValueError("Empty gene record") + + data = gene_record[0] + locus = (data.get("Entrezgene_locus") or [{}])[0] + + # Extract common nested paths once + gene_ref = self._nested_get(data, "Entrezgene_gene", "Gene-ref", default={}) + biosource = self._nested_get(data, "Entrezgene_source", "BioSource", default={}) + + # Process synonyms + synonyms_raw = gene_ref.get("Gene-ref_syn", []) + gene_synonyms = [] + if isinstance(synonyms_raw, list): + for syn in synonyms_raw: + gene_synonyms.append(syn.get("Gene-ref_syn_E") if isinstance(syn, dict) else str(syn)) + elif synonyms_raw: + gene_synonyms.append(str(synonyms_raw)) + + # Extract location info + label = locus.get("Gene-commentary_label", "") + chromosome_match = re.search(r"Chromosome\s+(\S+)", str(label)) if label else None + + seq_interval = self._nested_get( + locus, "Gene-commentary_seqs", 0, "Seq-loc_int", "Seq-interval", default={} + ) + genomic_location = ( + f"{seq_interval.get('Seq-interval_from')}-{seq_interval.get('Seq-interval_to')}" + if seq_interval.get('Seq-interval_from') and seq_interval.get('Seq-interval_to') + else None + ) + + # Extract representative accession (prefer type 3 = mRNA/transcript) + representative_accession = next( + ( + product.get("Gene-commentary_accession") + for product in locus.get("Gene-commentary_products", []) + if product.get("Gene-commentary_type") == "3" + ), + None, + ) + # Fallback: if no type 3 accession, try any available accession + if not representative_accession: + representative_accession = next( + ( + product.get("Gene-commentary_accession") + for product in locus.get("Gene-commentary_products", []) + if product.get("Gene-commentary_accession") + ), + None, + ) + + # Extract function + function = data.get("Entrezgene_summary") or next( + ( + comment.get("Gene-commentary_comment") + for comment in data.get("Entrezgene_comments", []) + if isinstance(comment, dict) + and "function" in str(comment.get("Gene-commentary_heading", "")).lower() + ), + None, + ) + + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": gene_id, + "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), + "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), + "organism": self._nested_get( + biosource, 
"BioSource_org", "Org-ref", "Org-ref_taxname", default="N/A" + ), + "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", + "gene_synonyms": gene_synonyms or None, + "gene_type": { + "1": "protein-coding", + "2": "pseudo", + "3": "rRNA", + "4": "tRNA", + "5": "snRNA", + "6": "ncRNA", + "7": "other", + }.get(str(data.get("Entrezgene_type")), f"type_{data.get('Entrezgene_type')}"), + "chromosome": chromosome_match.group(1) if chromosome_match else None, + "genomic_location": genomic_location, + "function": function, + # Fields from accession-based queries + "title": None, + "sequence": None, + "sequence_length": None, + "gene_id": gene_id, + "molecule_type_detail": self._infer_molecule_type_detail( + representative_accession, data.get("Entrezgene_type") + ), + "_representative_accession": representative_accession, + } + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) + def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: + """Get gene information by Gene ID.""" + def _extract_metadata_from_genbank(result: dict, accession: str): + """Extract metadata from GenBank format (title, features, organism, etc.).""" + with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle: + record = SeqIO.read(handle, "genbank") + + result["title"] = record.description + result["molecule_type_detail"] = self._infer_molecule_type_detail(accession) or "N/A" + + for feature in record.features: + if feature.type == "source": + if 'chromosome' in feature.qualifiers: + result["chromosome"] = feature.qualifiers['chromosome'][0] + + if feature.location: + start = int(feature.location.start) + 1 + end = int(feature.location.end) + result["genomic_location"] = f"{start}-{end}" + + break + + if not result.get("organism") and 'organism' in record.annotations: + result["organism"] = record.annotations['organism'] + + return result + + def _extract_sequence_from_fasta(result: dict, accession: str): + """Extract sequence from FASTA format (more reliable than GenBank for CON-type records).""" + try: + with Entrez.efetch(db="nuccore", id=accession, rettype="fasta", retmode="text") as fasta_handle: + fasta_record = SeqIO.read(fasta_handle, "fasta") + result["sequence"] = str(fasta_record.seq) + result["sequence_length"] = len(fasta_record.seq) + except Exception as fasta_exc: + self.logger.warning( + "Failed to extract sequence from accession %s using FASTA format: %s", + accession, fasta_exc + ) + result["sequence"] = None + result["sequence_length"] = None + return result + + def _extract_sequence(result: dict, accession: str): + """ + Extract sequence using the appropriate method based on configuration. + If use_local_blast=True, use local database. Otherwise, use NCBI API. + Always fetches sequence (no option to skip). + """ + # If using local BLAST, use local database + if self.use_local_blast: + sequence = self._extract_sequence_from_local_db(accession) + + if sequence: + result["sequence"] = sequence + result["sequence_length"] = len(sequence) + else: + # Failed to extract from local DB, set to None (no fallback to API) + result["sequence"] = None + result["sequence_length"] = None + self.logger.warning( + "Failed to extract sequence from local DB for accession %s. 
" + "Not falling back to NCBI API as use_local_blast=True.", + accession + ) + else: + # Use NCBI API to fetch sequence + result = _extract_sequence_from_fasta(result, accession) + + return result + + try: + with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle: + gene_record = Entrez.read(handle) + + if not gene_record: + return None + + result = self._gene_record_to_dict(gene_record, gene_id) + + if accession := (preferred_accession or result.get("_representative_accession")): + result = _extract_metadata_from_genbank(result, accession) + # Extract sequence using appropriate method + result = _extract_sequence(result, accession) + + result.pop("_representative_accession", None) + return result + except (RequestException, IncompleteRead): + raise + except Exception as exc: + self.logger.error("Gene ID %s not found: %s", gene_id, exc) + return None + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) + def get_by_accession(self, accession: str) -> Optional[dict]: + """Get sequence information by accession number.""" + def _extract_gene_id(link_handle): + """Extract GeneID from elink results.""" + links = Entrez.read(link_handle) + if not links or "LinkSetDb" not in links[0]: + return None + + for link_set in links[0]["LinkSetDb"]: + if link_set.get("DbTo") != "gene": + continue + + link = (link_set.get("Link") or link_set.get("IdList", [{}]))[0] + return str(link.get("Id") if isinstance(link, dict) else link) + + try: + # TODO: support accession number with version number (e.g., NM_000546.3) + with Entrez.elink(dbfrom="nuccore", db="gene", id=accession) as link_handle: + gene_id = _extract_gene_id(link_handle) + + if not gene_id: + self.logger.warning("Accession %s has no associated GeneID", accession) + return None + + result = self.get_by_gene_id(gene_id, preferred_accession=accession) + + if result: + result["id"] = accession + result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" + + return result + except (RequestException, IncompleteRead): + raise + except Exception as exc: + self.logger.error("Accession %s not found: %s", accession, exc) + return None + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) + def get_best_hit(self, keyword: str) -> Optional[dict]: + """Search NCBI Gene database with a keyword and return the best hit.""" + if not keyword.strip(): + return None + + try: + for search_term in [f"{keyword}[Gene] OR {keyword}[All Fields]", keyword]: + with Entrez.esearch(db="gene", term=search_term, retmax=1, sort="relevance") as search_handle: + search_results = Entrez.read(search_handle) + + if len(gene_id := search_results.get("IdList", [])) > 0: + result = self.get_by_gene_id(gene_id[0]) + return result + except (RequestException, IncompleteRead): + raise + except Exception as e: + self.logger.error("Keyword %s not found: %s", keyword, e) + return None + + def _extract_sequence_from_local_db(self, accession: str) -> Optional[str]: + """Extract sequence from local BLAST database using blastdbcmd.""" + try: + cmd = [ + "blastdbcmd", + "-db", self.local_blast_db, + "-entry", accession, + "-outfmt", "%s" # Only sequence, no header + ] + sequence = subprocess.check_output( + cmd, + text=True, + timeout=10, # 10 second timeout for local extraction + stderr=subprocess.DEVNULL + ).strip() + 
return sequence if sequence else None + except subprocess.TimeoutExpired: + self.logger.warning("Timeout extracting sequence from local DB for accession %s", accession) + return None + except Exception as exc: + self.logger.warning("Failed to extract sequence from local DB for accession %s: %s", accession, exc) + return None + + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: + """ + Perform local BLAST search using local BLAST database. + Optimized with multi-threading and faster output format. + """ + try: + with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: + tmp.write(f">query\n{seq}\n") + tmp_name = tmp.name + + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance + cmd = [ + "blastn", "-db", self.local_blast_db, "-query", tmp_name, + "-evalue", str(threshold), + "-max_target_seqs", "1", + "-num_threads", str(self.blast_num_threads), + "-outfmt", "6 sacc" # Only accession, tab-separated + ] + self.logger.debug("Running local blastn (threads=%d): %s", + self.blast_num_threads, " ".join(cmd)) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + self.logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + + os.remove(tmp_name) + return out.split("\n", maxsplit=1)[0] if out else None + except Exception as exc: + self.logger.error("Local blastn failed: %s", exc) + # Clean up temp file if it still exists + try: + if 'tmp_name' in locals(): + os.remove(tmp_name) + except Exception: + pass + return None + + def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: + """Search NCBI with a DNA sequence using BLAST.""" + + def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: + """Extract and normalize DNA sequence from input.""" + if sequence.startswith(">"): + seq = "".join(sequence.strip().split("\n")[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + return seq if re.fullmatch(r"[ATCGN]+", seq, re.I) else None + + + def _process_network_blast_result(blast_record, seq: str, threshold: float) -> Optional[dict]: + """Process network BLAST result and return dictionary or None.""" + if not blast_record.alignments: + self.logger.info("No BLAST hits found for the given sequence.") + return None + + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + if best_hsp.expect > threshold: + self.logger.info("No BLAST hits below the threshold E-value.") + return None + + hit_id = best_alignment.hit_id + if accession_match := re.search(r"ref\|([^|]+)", hit_id): + return self.get_by_accession(accession_match.group(1).split(".")[0]) + + # If unable to extract accession, return basic information + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": hit_id, + "title": best_alignment.title, + "sequence_length": len(seq), + "e_value": best_hsp.expect, + "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", + } + + try: + if not (seq := _extract_and_normalize_sequence(sequence)): + self.logger.error("Empty or 
invalid DNA sequence provided.") + return None + + # Try local BLAST first if enabled + if self.use_local_blast: + accession = self._local_blast(seq, threshold) + + if accession: + self.logger.debug("Local BLAST found accession: %s", accession) + # When using local BLAST, skip sequence fetching by default (faster, fewer API calls) + # Sequence is already known from the query, so we only need metadata + result = self.get_by_accession(accession) + return result + + self.logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." + ) + return None + + # Fall back to network BLAST only if local BLAST is not enabled + self.logger.debug("Falling back to NCBIWWW.qblast") + with NCBIWWW.qblast("blastn", "nr", seq, hitlist_size=1, expect=threshold) as result_handle: + result = _process_network_blast_result(NCBIXML.read(result_handle), seq, threshold) + return result + except (RequestException, IncompleteRead): + raise + except Exception as e: + self.logger.error("BLAST search failed: %s", e) + return None + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) + async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optional[Dict]: + """Search NCBI with either a gene ID, accession number, keyword, or DNA sequence.""" + if not query or not isinstance(query, str): + self.logger.error("Empty or non-string input.") + return None + + query = query.strip() + self.logger.debug("NCBI search query: %s", query) + + loop = asyncio.get_running_loop() + + # Auto-detect query type and execute in thread pool + # All methods need lock because they all call NCBI API (rate limit: max 3 requests per second) + # Even if get_by_fasta uses local BLAST, it still calls get_by_accession which needs API + async def _execute_with_lock(func, *args): + """Execute function with lock for NCBI API calls.""" + async with _blast_lock: + return await loop.run_in_executor(_get_pool(), func, *args) + + if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): + # FASTA sequence: always use lock (even with local BLAST, get_by_accession needs API) + result = await _execute_with_lock(self.get_by_fasta, query, threshold) + elif re.fullmatch(r"^\d+$", query): + # Gene ID: always use lock (network API call) + result = await _execute_with_lock(self.get_by_gene_id, query) + elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): + # Accession: always use lock (network API call) + result = await _execute_with_lock(self.get_by_accession, query) + else: + # Keyword: always use lock (network API call) + result = await _execute_with_lock(self.get_best_hit, query) + + if result: + result["_search_query"] = query + return result diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py new file mode 100644 index 00000000..25dbc73c --- /dev/null +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -0,0 +1,395 @@ +import asyncio +import os +import re +import subprocess +from concurrent.futures import ThreadPoolExecutor +from functools import lru_cache +import tempfile +from typing import Dict, Optional, List, Any, Set + +import hashlib +import requests +import aiohttp +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases import BaseSearcher + + +@lru_cache(maxsize=None) +def _get_pool(): + return 
ThreadPoolExecutor(max_workers=20) # NOTE:can increase for better parallelism + +class RNACentralSearch(BaseSearcher): + """ + RNAcentral Search client to search RNA databases. + 1) Get RNA by RNAcentral ID. + 2) Search with keywords or RNA names (fuzzy search). + 3) Search with RNA sequence. + + API Documentation: https://rnacentral.org/api/v1 + """ + + def __init__( + self, + use_local_blast: bool = False, + local_blast_db: str = "rna_db", + api_timeout: int = 30, + blast_num_threads: int = 4, + working_dir: str = "cache", + ): + super().__init__(working_dir=working_dir) + self.base_url = "https://rnacentral.org/api/v1" + self.headers = {"Accept": "application/json"} + self.use_local_blast = use_local_blast + self.local_blast_db = local_blast_db + self.api_timeout = api_timeout + self.blast_num_threads = blast_num_threads # Number of threads for BLAST search + + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): + self.logger.error("Local BLAST database files not found. Please check the path.") + self.use_local_blast = False + + @staticmethod + def _rna_data_to_dict( + rna_id: str, + rna_data: Dict[str, Any], + xrefs_data: Optional[List[Dict[str, Any]]] = None + ) -> Dict[str, Any]: + organisms, gene_names, so_terms = set(), set(), set() + modifications: List[Any] = [] + + for xref in xrefs_data or []: + acc = xref.get("accession", {}) + if s := acc.get("species"): + organisms.add(s) + gene_value = acc.get("gene") + if isinstance(gene_value, str) and (g := gene_value.strip()): + gene_names.add(g) + if m := xref.get("modifications"): + modifications.extend(m) + if b := acc.get("biotype"): + so_terms.add(b) + + def format_unique_values(values: Set[str]) -> Optional[str]: + if not values: + return None + if len(values) == 1: + return next(iter(values)) + return ", ".join(sorted(values)) + + xrefs_info = { + "organism": format_unique_values(organisms), + "gene_name": format_unique_values(gene_names), + "related_genes": list(gene_names) if gene_names else None, + "modifications": modifications or None, + "so_term": format_unique_values(so_terms), + } + + fallback_rules = { + "organism": ["organism", "species"], + "related_genes": ["related_genes", "genes"], + "gene_name": ["gene_name", "gene"], + "so_term": ["so_term"], + "modifications": ["modifications"], + } + + def resolve_field(field_name: str) -> Any: + if (value := xrefs_info.get(field_name)) is not None: + return value + + for key in fallback_rules[field_name]: + if (value := rna_data.get(key)) is not None: + return value + + return None + + organism = resolve_field("organism") + gene_name = resolve_field("gene_name") + so_term = resolve_field("so_term") + modifications = resolve_field("modifications") + + related_genes = resolve_field("related_genes") + if not related_genes and (single_gene := rna_data.get("gene_name")): + related_genes = [single_gene] + + sequence = rna_data.get("sequence", "") + + return { + "molecule_type": "RNA", + "database": "RNAcentral", + "id": rna_id, + "rnacentral_id": rna_data.get("rnacentral_id", rna_id), + "sequence": sequence, + "sequence_length": rna_data.get("length", len(sequence)), + "rna_type": rna_data.get("rna_type", "N/A"), + "description": rna_data.get("description", "N/A"), + "url": f"https://rnacentral.org/rna/{rna_id}", + "organism": organism, + "related_genes": related_genes or None, + "gene_name": gene_name, + "so_term": so_term, + "modifications": modifications, + } + + @staticmethod + def _calculate_md5(sequence: str) -> str: + """ + Calculate MD5 hash for RNA 
sequence as per RNAcentral spec. + - Replace U with T + - Convert to uppercase + - Encode as ASCII + """ + # Normalize sequence + normalized_seq = sequence.replace("U", "T").replace("u", "t").upper() + if not re.fullmatch(r"[ATCGN]+", normalized_seq): + raise ValueError(f"Invalid sequence characters after normalization: {normalized_seq[:50]}...") + + return hashlib.md5(normalized_seq.encode("ascii")).hexdigest() + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((requests.Timeout, requests.RequestException)), + reraise=False, + ) + def get_by_rna_id(self, rna_id: str) -> Optional[dict]: + """ + Get RNA information by RNAcentral ID. + :param rna_id: RNAcentral ID (e.g., URS0000000001). + :return: A dictionary containing RNA information or None if not found. + """ + try: + url = f"{self.base_url}/rna/{rna_id}" + url += "?flat=true" + + resp = requests.get(url, headers=self.headers, timeout=self.api_timeout) + resp.raise_for_status() + + rna_data = resp.json() + xrefs_data = rna_data.get("xrefs", []) + result = self._rna_data_to_dict(rna_id, rna_data, xrefs_data) + return result + except requests.Timeout as e: + self.logger.warning("Timeout getting RNA ID %s (timeout=%ds): %s", rna_id, self.api_timeout, e) + return None + except requests.RequestException as e: + self.logger.error("Network error getting RNA ID %s: %s", rna_id, e) + return None + except Exception as e: # pylint: disable=broad-except + self.logger.error("Unexpected error getting RNA ID %s: %s", rna_id, e) + return None + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((requests.Timeout, requests.RequestException)), + reraise=False, + ) + def get_best_hit(self, keyword: str) -> Optional[dict]: + """ + Search RNAcentral with a keyword and return the best hit. + :param keyword: The search keyword (e.g., miRNA name, RNA name). + :return: Dictionary with RNA information or None. + """ + keyword = keyword.strip() + if not keyword: + self.logger.warning("Empty keyword provided to get_best_hit") + return None + + try: + url = f"{self.base_url}/rna" + params = {"search": keyword, "format": "json"} + resp = requests.get(url, params=params, headers=self.headers, timeout=self.api_timeout) + resp.raise_for_status() + + data = resp.json() + results = data.get("results", []) + + if not results: + self.logger.info("No search results for keyword: %s", keyword) + return None + + first_result = results[0] + rna_id = first_result.get("rnacentral_id") + + if rna_id: + detailed = self.get_by_rna_id(rna_id) + if detailed: + return detailed + self.logger.debug("Using search result data for %s", rna_id or "unknown") + return self._rna_data_to_dict(rna_id or "", first_result) + + except requests.RequestException as e: + self.logger.error("Network error searching keyword '%s': %s", keyword, e) + return None + except Exception as e: + self.logger.error("Unexpected error searching keyword '%s': %s", keyword, e) + return None + + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: + """ + Perform local BLAST search using local BLAST database. + Optimized with multi-threading and faster output format. 
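+        :param seq: Raw nucleotide sequence; it is written to a temporary FASTA file
+                    and used as the blastn query.
+        :param threshold: E-value threshold passed to blastn via -evalue.
+        :return: Accession of the best hit, or None if there is no hit or BLAST fails.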
+ """ + try: + # Use temporary file for query sequence + with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: + tmp.write(f">query\n{seq}\n") + tmp_name = tmp.name + + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance + cmd = [ + "blastn", "-db", self.local_blast_db, "-query", tmp_name, + "-evalue", str(threshold), + "-max_target_seqs", "1", + "-num_threads", str(self.blast_num_threads), + "-outfmt", "6 sacc" # Only accession, tab-separated + ] + self.logger.debug("Running local blastn for RNA (threads=%d): %s", + self.blast_num_threads, " ".join(cmd)) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + self.logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + + os.remove(tmp_name) + return out.split("\n", maxsplit=1)[0] if out else None + except Exception as exc: + self.logger.error("Local blastn failed: %s", exc) + # Clean up temp file if it still exists + try: + if 'tmp_name' in locals(): + os.remove(tmp_name) + except Exception: + pass + return None + + def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: + """ + Search RNAcentral with an RNA sequence. + Tries local BLAST first if enabled, falls back to RNAcentral API. + Unified approach: Find RNA ID from sequence search, then call get_by_rna_id() for complete information. + :param sequence: RNA sequence (FASTA format or raw sequence). + :param threshold: E-value threshold for BLAST search. + :return: A dictionary containing complete RNA information or None if not found. + """ + def _extract_sequence(sequence: str) -> Optional[str]: + """Extract and normalize RNA sequence from input.""" + if sequence.startswith(">"): + seq_lines = sequence.strip().split("\n") + seq = "".join(seq_lines[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + # Accept both U (original RNA) and T + return seq if seq and re.fullmatch(r"[AUCGTN\s]+", seq, re.I) else None + + try: + seq = _extract_sequence(sequence) + if not seq: + self.logger.error("Empty or invalid RNA sequence provided.") + return None + + # Try local BLAST first if enabled + if self.use_local_blast: + accession = self._local_blast(seq, threshold) + if accession: + self.logger.debug("Local BLAST found accession: %s", accession) + detailed = self.get_by_rna_id(accession) + if detailed: + return detailed + self.logger.info( + "Local BLAST found accession %s but could not retrieve metadata from API.", + accession + ) + return None + self.logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." 
+ ) + return None + + # Fall back to RNAcentral API only if local BLAST is not enabled + self.logger.debug("Falling back to RNAcentral API.") + + md5_hash = self._calculate_md5(seq) + search_url = f"{self.base_url}/rna" + params = {"md5": md5_hash, "format": "json"} + + resp = requests.get(search_url, params=params, headers=self.headers, timeout=60) + resp.raise_for_status() + + search_results = resp.json() + results = search_results.get("results", []) + + if not results: + self.logger.info("No exact match found in RNAcentral for sequence") + return None + + rna_id = results[0].get("rnacentral_id") + if rna_id: + detailed = self.get_by_rna_id(rna_id) + if detailed: + return detailed + # Fallback: use search result data if get_by_rna_id returns None + self.logger.debug("Using search result data for %s (get_by_rna_id returned None)", rna_id) + return self._rna_data_to_dict(rna_id, results[0]) + + self.logger.error("No RNAcentral ID found in search results.") + return None + except Exception as e: + self.logger.error("Sequence search failed: %s", e) + return None + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)), + reraise=True, + ) + async def search(self, query: str, threshold: float = 0.1, **kwargs) -> Optional[Dict]: + """Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence.""" + if not query or not isinstance(query, str): + self.logger.error("Empty or non-string input.") + return None + + query = query.strip() + self.logger.debug("RNAcentral search query: %s", query) + + loop = asyncio.get_running_loop() + + # check if RNA sequence (AUCG or ATCG characters, contains U or T) + # Note: Sequences with T are also RNA sequences + is_rna_sequence = query.startswith(">") or ( + re.fullmatch(r"[AUCGTN\s]+", query, re.I) and + ("U" in query.upper() or "T" in query.upper()) + ) + if is_rna_sequence: + result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) + # check if RNAcentral ID (typically starts with URS) + elif re.fullmatch(r"URS\d+", query, re.I): + result = await loop.run_in_executor(_get_pool(), self.get_by_rna_id, query) + else: + # otherwise treat as keyword + result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) + + if result: + result["_search_query"] = query + return result diff --git a/graphgen/models/searcher/db/uniprot_searcher.py b/graphgen/models/searcher/db/uniprot_searcher.py new file mode 100644 index 00000000..32115e34 --- /dev/null +++ b/graphgen/models/searcher/db/uniprot_searcher.py @@ -0,0 +1,298 @@ +import asyncio +import os +import re +import subprocess +import tempfile +from concurrent.futures import ThreadPoolExecutor +from functools import lru_cache +from io import StringIO +from typing import Dict, Optional + +from Bio import ExPASy, SeqIO, SwissProt, UniProt +from Bio.Blast import NCBIWWW, NCBIXML +from requests.exceptions import RequestException +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases import BaseSearcher + + +@lru_cache(maxsize=None) +def _get_pool(): + return ThreadPoolExecutor(max_workers=20) # NOTE:can increase for better parallelism + + +# ensure only one BLAST searcher at a time +_blast_lock = asyncio.Lock() + + +class UniProtSearch(BaseSearcher): + """ + UniProt Search client to searcher with UniProt. + 1) Get the protein by accession number. 
+ 2) Search with keywords or protein names (fuzzy searcher). + 3) Search with FASTA sequence (BLAST searcher). Note that NCBIWWW does not support async. + """ + + def __init__( + self, + use_local_blast: bool = False, + local_blast_db: str = "sp_db", + blast_num_threads: int = 4, + working_dir: str = "cache", + ): + super().__init__(working_dir=working_dir) + self.use_local_blast = use_local_blast + self.local_blast_db = local_blast_db + self.blast_num_threads = blast_num_threads # Number of threads for BLAST search + + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.phr"): + self.logger.error("Local BLAST database files not found. Please check the path.") + self.use_local_blast = False + + def get_by_accession(self, accession: str) -> Optional[dict]: + try: + handle = ExPASy.get_sprot_raw(accession) + record = SwissProt.read(handle) + handle.close() + return self._swissprot_to_dict(record) + except RequestException: # network-related errors + raise + except Exception as exc: # pylint: disable=broad-except + self.logger.error("Accession %s not found: %s", accession, exc) + return None + + @staticmethod + def _swissprot_to_dict(record: SwissProt.Record) -> dict: + """error + Convert a SwissProt.Record to a dictionary. + """ + functions = [] + for line in record.comments: + if line.startswith("FUNCTION:"): + functions.append(line[9:].strip()) + + return { + "molecule_type": "protein", + "database": "UniProt", + "id": record.accessions[0], + "entry_name": record.entry_name, + "gene_names": record.gene_name, + "protein_name": record.description.split(";")[0].split("=")[-1], + "organism": record.organism.split(" (")[0], + "sequence": str(record.sequence), + "function": functions, + "url": f"https://www.uniprot.org/uniprot/{record.accessions[0]}", + } + + def get_best_hit(self, keyword: str) -> Optional[Dict]: + """ + Search UniProt with a keyword and return the best hit. + :param keyword: The searcher keyword. + :return: A dictionary containing the best hit information or None if not found. + """ + if not keyword.strip(): + return None + + try: + iterator = UniProt.search(keyword, fields=None, batch_size=1) + hit = next(iterator, None) + if hit is None: + return None + return self.get_by_accession(hit["primaryAccession"]) + + except RequestException: + raise + except Exception as e: # pylint: disable=broad-except + self.logger.error("Keyword %s not found: %s", keyword, e) + return None + + def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: + """ + Search UniProt with a FASTA sequence and return the best hit. + :param fasta_sequence: The FASTA sequence. + :param threshold: E-value threshold for BLAST searcher. + :return: A dictionary containing the best hit information or None if not found. + """ + try: + if fasta_sequence.startswith(">"): + seq = str(list(SeqIO.parse(StringIO(fasta_sequence), "fasta"))[0].seq) + else: + seq = fasta_sequence.strip() + except Exception as e: # pylint: disable=broad-except + self.logger.error("Invalid FASTA sequence: %s", e) + return None + + if not seq: + self.logger.error("Empty FASTA sequence provided.") + return None + + if self.use_local_blast: + accession = self._local_blast(seq, threshold) + if accession: + self.logger.debug("Local BLAST found accession: %s", accession) + return self.get_by_accession(accession) + self.logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." 
+ ) + return None + + # Fall back to network BLAST only if local BLAST is not enabled + self.logger.debug("Falling back to NCBIWWW.qblast.") + + # UniProtKB/Swiss-Prot BLAST API + try: + self.logger.debug( + "Performing BLAST searcher for the given sequence: %s", seq + ) + result_handle = NCBIWWW.qblast( + program="blastp", + database="swissprot", + sequence=seq, + hitlist_size=1, + expect=threshold, + ) + blast_record = NCBIXML.read(result_handle) + except RequestException: + raise + except Exception as e: # pylint: disable=broad-except + self.logger.error("BLAST searcher failed: %s", e) + return None + + if not blast_record.alignments: + self.logger.info("No BLAST hits found for the given sequence.") + return None + + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + if best_hsp.expect > threshold: + self.logger.info("No BLAST hits below the threshold E-value.") + return None + + # like sp|P01308.1|INS_HUMAN + hit_id = best_alignment.hit_id + accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id + return self.get_by_accession(accession) + + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: + """ + Perform local BLAST search using local BLAST database. + Optimized with multi-threading and faster output format. + :param seq: The protein sequence. + :param threshold: E-value threshold for BLAST searcher. + :return: The accession number of the best hit or None if not found. + """ + try: + with tempfile.NamedTemporaryFile( + mode="w+", suffix=".fa", delete=False + ) as tmp: + tmp.write(f">query\n{seq}\n") + tmp_name = tmp.name + + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance + cmd = [ + "blastp", + "-db", + self.local_blast_db, + "-query", + tmp_name, + "-evalue", + str(threshold), + "-max_target_seqs", + "1", + "-num_threads", + str(self.blast_num_threads), + "-outfmt", + "6 sacc", # Only accession, tab-separated + ] + self.logger.debug("Running local blastp (threads=%d): %s", + self.blast_num_threads, " ".join(cmd)) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + self.logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + + os.remove(tmp_name) + if out: + return out.split("\n", maxsplit=1)[0] + return None + except Exception as exc: # pylint: disable=broad-except + self.logger.error("Local blastp failed: %s", exc) + return None + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type(RequestException), + reraise=True, + ) + async def search( + self, query: str, threshold: float = 0.7, **kwargs + ) -> Optional[Dict]: + """ + Search UniProt with either an accession number, keyword, or FASTA sequence. + :param query: The searcher query (accession number, keyword, or FASTA sequence). + :param threshold: E-value threshold for BLAST searcher. + :return: A dictionary containing the best hit information or None if not found. 
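+
+        Hypothetical usage sketch (the accession is illustrative and a network
+        connection to UniProt is assumed):
+            import asyncio
+            hit = asyncio.run(UniProtSearch().search("P01308"))
+            print(hit["protein_name"] if hit else "not found")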
+ """ + + # auto detect query type + if not query or not isinstance(query, str): + self.logger.error("Empty or non-string input.") + return None + query = query.strip() + + self.logger.debug("UniProt searcher query: %s", query) + + loop = asyncio.get_running_loop() + + # check if fasta sequence + if query.startswith(">") or re.fullmatch( + r"[ACDEFGHIKLMNPQRSTVWY\s]+", query, re.I + ): + # Only use lock for network BLAST (NCBIWWW), local BLAST can run in parallel + if self.use_local_blast: + # Local BLAST can run in parallel, no lock needed + result = await loop.run_in_executor( + _get_pool(), self.get_by_fasta, query, threshold + ) + else: + # Network BLAST needs lock to respect rate limits + async with _blast_lock: + result = await loop.run_in_executor( + _get_pool(), self.get_by_fasta, query, threshold + ) + + # check if accession number + # UniProt accession IDs: 6-10 characters, must start with a letter + # Format: [A-Z][A-Z0-9]{5,9} (6-10 chars total: 1 letter + 5-9 alphanumeric) + elif re.fullmatch(r"[A-Z][A-Z0-9]{5,9}", query, re.I): + result = await loop.run_in_executor( + _get_pool(), self.get_by_accession, query + ) + + else: + # otherwise treat as keyword + result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) + + if result: + result["_search_query"] = query + return result diff --git a/graphgen/models/searcher/kg/__init__.py b/graphgen/models/searcher/kg/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/models/search/wiki_search.py b/graphgen/models/searcher/kg/wiki_search.py similarity index 87% rename from graphgen/models/search/wiki_search.py rename to graphgen/models/searcher/kg/wiki_search.py index db312a2b..2d8686cf 100644 --- a/graphgen/models/search/wiki_search.py +++ b/graphgen/models/searcher/kg/wiki_search.py @@ -1,21 +1,20 @@ from typing import List, Union -from dataclasses import dataclass import wikipedia from wikipedia import set_lang + from graphgen.utils import detect_main_language, logger -@dataclass class WikiSearch: @staticmethod def set_language(language: str): assert language in ["en", "zh"], "Only support English and Chinese" set_lang(language) - async def search(self, query: str) -> Union[List[str], None]: + async def search(self, query: str, num_results: int = 1) -> Union[List[str], None]: self.set_language(detect_main_language(query)) - return wikipedia.search(query) + return wikipedia.search(query, results=num_results, suggestion=False) async def summary(self, query: str) -> Union[str, None]: self.set_language(detect_main_language(query)) diff --git a/graphgen/models/searcher/web/__init__.py b/graphgen/models/searcher/web/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/models/searcher/web/bing_search.py b/graphgen/models/searcher/web/bing_search.py new file mode 100644 index 00000000..77ae2110 --- /dev/null +++ b/graphgen/models/searcher/web/bing_search.py @@ -0,0 +1,41 @@ +import requests +from fastapi import HTTPException + +from graphgen.utils import logger + +BING_SEARCH_V7_ENDPOINT = "https://api.bing.microsoft.com/v7.0/search" +BING_MKT = "en-US" + + +class BingSearch: + """ + Bing Search client to searcher with Bing. + """ + + def __init__(self, subscription_key: str): + self.subscription_key = subscription_key + + def search(self, query: str, num_results: int = 1): + """ + Search with Bing and return the contexts. + :param query: The searcher query. + :param num_results: The number of results to return. + :return: A list of searcher results. 
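+
+        Each context is a Bing "webPages" entry (a dict with fields such as "name",
+        "url" and "snippet"). Hypothetical usage (the key is a placeholder):
+            contexts = BingSearch("your-bing-key").search("knowledge graph", num_results=3)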
+ """ + params = {"q": query, "mkt": BING_MKT, "count": num_results} + response = requests.get( + BING_SEARCH_V7_ENDPOINT, + headers={"Ocp-Apim-Subscription-Key": self.subscription_key}, + params=params, + timeout=10, + ) + if not response.ok: + logger.error("Search engine error: %s", response.text) + raise HTTPException(response.status_code, "Search engine error.") + json_content = response.json() + try: + contexts = json_content["webPages"]["value"][:num_results] + except KeyError: + logger.error("Error encountered: %s", json_content) + return [] + return contexts diff --git a/graphgen/models/searcher/web/google_search.py b/graphgen/models/searcher/web/google_search.py new file mode 100644 index 00000000..0d598f3a --- /dev/null +++ b/graphgen/models/searcher/web/google_search.py @@ -0,0 +1,42 @@ +import requests +from fastapi import HTTPException + +from graphgen.utils import logger + +GOOGLE_SEARCH_ENDPOINT = "https://customsearch.googleapis.com/customsearch/v1" + + +class GoogleSearch: + def __init__(self, subscription_key: str, cx: str): + """ + Initialize the Google Search client with the subscription key and custom searcher engine ID. + :param subscription_key: Your Google API subscription key. + :param cx: Your custom searcher engine ID. + """ + self.subscription_key = subscription_key + self.cx = cx + + def search(self, query: str, num_results: int = 1): + """ + Search with Google and return the contexts. + :param query: The searcher query. + :param num_results: The number of results to return. + :return: A list of searcher results. + """ + params = { + "key": self.subscription_key, + "cx": self.cx, + "q": query, + "num": num_results, + } + response = requests.get(GOOGLE_SEARCH_ENDPOINT, params=params, timeout=10) + if not response.ok: + logger.error("Search engine error: %s", response.text) + raise HTTPException(response.status_code, "Search engine error.") + json_content = response.json() + try: + contexts = json_content["items"][:num_results] + except KeyError: + logger.error("Error encountered: %s", json_content) + return [] + return contexts diff --git a/graphgen/models/splitter/__init__.py b/graphgen/models/splitter/__init__.py new file mode 100644 index 00000000..4f8a427c --- /dev/null +++ b/graphgen/models/splitter/__init__.py @@ -0,0 +1,4 @@ +from .recursive_character_splitter import ( + ChineseRecursiveTextSplitter, + RecursiveCharacterSplitter, +) diff --git a/graphgen/models/splitter/character_splitter.py b/graphgen/models/splitter/character_splitter.py new file mode 100644 index 00000000..8877c861 --- /dev/null +++ b/graphgen/models/splitter/character_splitter.py @@ -0,0 +1,26 @@ +import re +from typing import Any, List + +from graphgen.bases.base_splitter import BaseSplitter + + +class CharacterSplitter(BaseSplitter): + """Splitting text that looks at characters.""" + + def __init__( + self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any + ) -> None: + """Create a new TextSplitter.""" + super().__init__(**kwargs) + self._separator = separator + self._is_separator_regex = is_separator_regex + + def split_text(self, text: str) -> List[str]: + """Split incoming text and return chunks.""" + # First we naively chunk the large input into a bunch of smaller ones. 
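+        # The separator is matched literally unless is_separator_regex=True, in which
+        # case it is used as a regex pattern verbatim; the resulting splits are then
+        # merged back into chunks by the base class's _merge_splits.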
+ separator = ( + self._separator if self._is_separator_regex else re.escape(self._separator) + ) + splits = self._split_text_with_regex(text, separator, self.keep_separator) + _separator = "" if self.keep_separator else self._separator + return self._merge_splits(splits, _separator) diff --git a/graphgen/models/splitter/markdown_splitter.py b/graphgen/models/splitter/markdown_splitter.py new file mode 100644 index 00000000..40b6a44e --- /dev/null +++ b/graphgen/models/splitter/markdown_splitter.py @@ -0,0 +1,33 @@ +from typing import Any + +from graphgen.models.splitter.recursive_character_splitter import ( + RecursiveCharacterSplitter, +) + + +class MarkdownTextRefSplitter(RecursiveCharacterSplitter): + """Attempts to chunk the text along Markdown-formatted headings.""" + + def __init__(self, **kwargs: Any) -> None: + """Initialize a MarkdownTextRefSplitter.""" + separators = [ + # First, try to chunk along Markdown headings (starting with level 2) + "\n#{1,6} ", + # Note the alternative syntax for headings (below) is not handled here + # Heading level 2 + # --------------- + # End of code block + "```\n", + # Horizontal lines + "\n\\*\\*\\*+\n", + "\n---+\n", + "\n___+\n", + # Note: horizontal lines defined by three or more of ***, ---, or ___ + # are handled by the regexes above, but alternative syntaxes (e.g., with spaces) + # are not handled. + "\n\n", + "\n", + " ", + "", + ] + super().__init__(separators=separators, **kwargs) diff --git a/graphgen/models/splitter/recursive_character_splitter.py b/graphgen/models/splitter/recursive_character_splitter.py new file mode 100644 index 00000000..b1ee8e06 --- /dev/null +++ b/graphgen/models/splitter/recursive_character_splitter.py @@ -0,0 +1,149 @@ +import re +from typing import Any, List, Optional + +from graphgen.bases.base_splitter import BaseSplitter + + +class RecursiveCharacterSplitter(BaseSplitter): + """Splitting text by recursively look at characters. + + Recursively tries to chunk by different characters to find one that works. + """ + + def __init__( + self, + separators: Optional[List[str]] = None, + keep_separator: bool = True, + is_separator_regex: bool = False, + **kwargs: Any, + ) -> None: + """Create a new TextSplitter.""" + super().__init__(keep_separator=keep_separator, **kwargs) + self._separators = separators or ["\n\n", "\n", " ", ""] + self._is_separator_regex = is_separator_regex + + def _split_text(self, text: str, separators: List[str]) -> List[str]: + """Split incoming text and return chunks.""" + final_chunks = [] + # Get appropriate separator to use + separator = separators[-1] + new_separators = [] + for i, _s in enumerate(separators): + _separator = _s if self._is_separator_regex else re.escape(_s) + if _s == "": + separator = _s + break + if re.search(_separator, text): + separator = _s + new_separators = separators[i + 1 :] + break + + _separator = separator if self._is_separator_regex else re.escape(separator) + splits = self._split_text_with_regex(text, _separator, self.keep_separator) + + # Now go merging things, recursively splitting longer texts. 
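+        # Pieces that already fit within chunk_size are buffered in _good_splits and
+        # merged together; oversized pieces are recursively re-split with the
+        # remaining, finer-grained separators (or appended as-is if none are left).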
+ _good_splits = [] + _separator = "" if self.keep_separator else separator + for s in splits: + if self.length_function(s) < self.chunk_size: + _good_splits.append(s) + else: + if _good_splits: + merged_text = self._merge_splits(_good_splits, _separator) + final_chunks.extend(merged_text) + _good_splits = [] + if not new_separators: + final_chunks.append(s) + else: + other_info = self._split_text(s, new_separators) + final_chunks.extend(other_info) + if _good_splits: + merged_text = self._merge_splits(_good_splits, _separator) + final_chunks.extend(merged_text) + return final_chunks + + def split_text(self, text: str) -> List[str]: + return self._split_text(text, self._separators) + + +class ChineseRecursiveTextSplitter(RecursiveCharacterSplitter): + def __init__( + self, + separators: Optional[List[str]] = None, + keep_separator: bool = True, + is_separator_regex: bool = True, + **kwargs: Any, + ) -> None: + super().__init__(keep_separator=keep_separator, **kwargs) + self._separators = separators or [ + "\n\n", + "\n", + "。|!|?", + r"\.\s|\!\s|\?\s", + r";|;\s", + r",|,\s", + ] + self._is_separator_regex = is_separator_regex + + def _split_text_with_regex_from_end( + self, text: str, separator: str, keep_separator: bool + ) -> List[str]: + # Now that we have the separator, chunk the text + if separator: + if keep_separator: + # The parentheses in the pattern keep the delimiters in the result. + _splits = re.split(f"({separator})", text) + splits = ["".join(i) for i in zip(_splits[0::2], _splits[1::2])] + if len(_splits) % 2 == 1: + splits += _splits[-1:] + else: + splits = re.split(separator, text) + else: + splits = list(text) + return [s for s in splits if s != ""] + + def _split_text(self, text: str, separators: List[str]) -> List[str]: + """Split incoming text and return chunks.""" + final_chunks = [] + # Get appropriate separator to use + separator = separators[-1] + new_separators = [] + for i, _s in enumerate(separators): + _separator = _s if self._is_separator_regex else re.escape(_s) + if _s == "": + separator = _s + break + if re.search(_separator, text): + separator = _s + new_separators = separators[i + 1 :] + break + + _separator = separator if self._is_separator_regex else re.escape(separator) + splits = self._split_text_with_regex_from_end( + text, _separator, self.keep_separator + ) + + # Now go merging things, recursively splitting longer texts. 
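+        # Same merge-or-recurse strategy as the parent class, but the splits come from
+        # _split_text_with_regex_from_end, which keeps the Chinese/Western sentence
+        # punctuation attached to the end of the preceding fragment.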
+ _good_splits = [] + _separator = "" if self.keep_separator else separator + for s in splits: + if self.length_function(s) < self.chunk_size: + _good_splits.append(s) + else: + if _good_splits: + merged_text = self._merge_splits(_good_splits, _separator) + final_chunks.extend(merged_text) + _good_splits = [] + if not new_separators: + final_chunks.append(s) + else: + other_info = self._split_text(s, new_separators) + final_chunks.extend(other_info) + if _good_splits: + merged_text = self._merge_splits(_good_splits, _separator) + final_chunks.extend(merged_text) + return [ + re.sub(r"\n{2,}", "\n", chunk.strip()) + for chunk in final_chunks + if chunk.strip() != "" + ] diff --git a/graphgen/models/splitter/sequence_splitter.py b/graphgen/models/splitter/sequence_splitter.py new file mode 100644 index 00000000..e0c62bbc --- /dev/null +++ b/graphgen/models/splitter/sequence_splitter.py @@ -0,0 +1,119 @@ +import re +from typing import List, Optional + +from graphgen.bases.base_splitter import BaseSplitter +from graphgen.utils.log import logger + + +class SequenceSplitter(BaseSplitter): + """ + Splitter for biological sequences (DNA, RNA, protein). + Supports chunking by fixed length with overlap. + """ + + def __init__( + self, + chunk_size: int = 1000, + chunk_overlap: int = 100, + length_function=None, + **kwargs, + ): + """ + Initialize sequence splitter. + + :param chunk_size: Maximum length of each chunk (in sequence characters) + :param chunk_overlap: Number of characters to overlap between chunks + :param length_function: Function to calculate length (default: len) + """ + if length_function is None: + length_function = len + super().__init__( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + length_function=length_function, + **kwargs, + ) + + def split_text(self, text: str) -> List[str]: + """ + Split a sequence into chunks of fixed size with overlap. + + :param text: The sequence to split (may include FASTA header) + :return: List of sequence chunks + """ + # Remove FASTA header if present + sequence = self._extract_sequence(text) + + if not sequence: + logger.warning("Empty sequence provided to SequenceSplitter") + return [] + + # If sequence is shorter than chunk_size, return as single chunk + if len(sequence) <= self.chunk_size: + return [sequence] + + chunks = [] + start = 0 + step = self.chunk_size - self.chunk_overlap + + while start < len(sequence): + end = min(start + self.chunk_size, len(sequence)) + chunk = sequence[start:end] + chunks.append(chunk) + start += step + + # Avoid infinite loop if step is 0 or negative + if step <= 0: + break + + return chunks + + @staticmethod + def _extract_sequence(text: str) -> str: + """ + Extract sequence from text, removing FASTA headers and whitespace. + + :param text: Input text (may contain FASTA header) + :return: Clean sequence string + """ + # Remove FASTA header lines (lines starting with >) + lines = text.split("\n") + sequence_lines = [line for line in lines if not line.strip().startswith(">")] + + # Join and remove whitespace + sequence = "".join(sequence_lines) + sequence = re.sub(r"\s+", "", sequence) + + return sequence.upper() # Normalize to uppercase + + @staticmethod + def detect_sequence_type(sequence: str) -> Optional[str]: + """ + Detect the type of sequence (DNA, RNA, or protein). 
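+        Heuristic: letters that do not occur in the nucleotide alphabet used here
+        (E, F, I, L, P, Q, X, Z) imply protein, a U implies RNA, and a sequence made
+        only of A/T/G/C/N defaults to DNA; anything else falls back to protein.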
+ + :param sequence: The sequence string + :return: "dna", "rna", "protein", or None if cannot determine + """ + # Remove FASTA header and whitespace + clean_seq = SequenceSplitter._extract_sequence(sequence) + + if not clean_seq: + return None + + # Check for protein-specific amino acids + protein_chars = set("EFILPQXZ") # Amino acids not in DNA/RNA + if any(char in clean_seq for char in protein_chars): + return "protein" + + # Check for RNA-specific character (U) + if "U" in clean_seq.upper(): + return "rna" + + # Check if contains only DNA/RNA characters (A, T, G, C, N) + dna_rna_chars = set("ATGCUN") + if all(char.upper() in dna_rna_chars for char in clean_seq): + # Default to DNA if ambiguous (could be DNA or RNA without U) + return "dna" + + # If contains other characters, might be protein + return "protein" diff --git a/graphgen/models/storage/__init__.py b/graphgen/models/storage/__init__.py index e69de29b..889a074c 100644 --- a/graphgen/models/storage/__init__.py +++ b/graphgen/models/storage/__init__.py @@ -0,0 +1,6 @@ +from graphgen.models.storage.graph.kuzu_storage import KuzuStorage +from graphgen.models.storage.graph.networkx_storage import NetworkXStorage +from graphgen.models.storage.kv.json_storage import JsonKVStorage +from graphgen.models.storage.kv.rocksdb_storage import RocksDBKVStorage + +from .rocksdb_cache import RocksDBCache diff --git a/graphgen/models/storage/base_storage.py b/graphgen/models/storage/base_storage.py deleted file mode 100644 index 2e70a3cb..00000000 --- a/graphgen/models/storage/base_storage.py +++ /dev/null @@ -1,94 +0,0 @@ -from dataclasses import dataclass -from typing import Union, Generic, TypeVar -from graphgen.models.embed.embedding import EmbeddingFunc - -T = TypeVar("T") - -@dataclass -class StorageNameSpace: - working_dir: str = None - namespace: str = None - - async def index_done_callback(self): - """commit the storage operations after indexing""" - - async def query_done_callback(self): - """commit the storage operations after querying""" - - -@dataclass -class BaseKVStorage(Generic[T], StorageNameSpace): - embedding_func: EmbeddingFunc = None - - async def all_keys(self) -> list[str]: - raise NotImplementedError - - async def get_by_id(self, id: str) -> Union[T, None]: - raise NotImplementedError - - async def get_by_ids( - self, ids: list[str], fields: Union[set[str], None] = None - ) -> list[Union[T, None]]: - raise NotImplementedError - - async def filter_keys(self, data: list[str]) -> set[str]: - """return un-exist keys""" - raise NotImplementedError - - async def upsert(self, data: dict[str, T]): - raise NotImplementedError - - async def drop(self): - raise NotImplementedError - -@dataclass -class BaseGraphStorage(StorageNameSpace): - embedding_func: EmbeddingFunc = None - - async def has_node(self, node_id: str) -> bool: - raise NotImplementedError - - async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: - raise NotImplementedError - - async def node_degree(self, node_id: str) -> int: - raise NotImplementedError - - async def edge_degree(self, src_id: str, tgt_id: str) -> int: - raise NotImplementedError - - async def get_node(self, node_id: str) -> Union[dict, None]: - raise NotImplementedError - - async def update_node(self, node_id: str, node_data: dict[str, str]): - raise NotImplementedError - - async def get_all_nodes(self) -> Union[list[dict], None]: - raise NotImplementedError - - async def get_edge( - self, source_node_id: str, target_node_id: str - ) -> Union[dict, None]: - raise 
NotImplementedError - - async def update_edge(self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]): - raise NotImplementedError - - async def get_all_edges(self) -> Union[list[dict], None]: - raise NotImplementedError - - async def get_node_edges( - self, source_node_id: str - ) -> Union[list[tuple[str, str]], None]: - raise NotImplementedError - - async def upsert_node(self, node_id: str, node_data: dict[str, str]): - raise NotImplementedError - - async def upsert_edge( - self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] - ): - raise NotImplementedError - - async def delete_node(self, node_id: str): - raise NotImplementedError diff --git a/graphgen/models/storage/graph/__init__.py b/graphgen/models/storage/graph/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/models/storage/graph/kuzu_storage.py b/graphgen/models/storage/graph/kuzu_storage.py new file mode 100644 index 00000000..4a221b8e --- /dev/null +++ b/graphgen/models/storage/graph/kuzu_storage.py @@ -0,0 +1,256 @@ +import json +import os +import shutil +from dataclasses import dataclass +from typing import Any + +try: + import kuzu +except ImportError: + kuzu = None + +from graphgen.bases.base_storage import BaseGraphStorage + + +@dataclass +class KuzuStorage(BaseGraphStorage): + """ + Graph storage implementation based on KuzuDB. + Since KuzuDB is a structured graph database and GraphGen uses dynamic dictionaries for properties, + we map the data to a generic schema: + - Node Table 'Entity': {id: STRING, data: STRING (JSON)} + - Rel Table 'Relation': {FROM Entity TO Entity, data: STRING (JSON)} + """ + + working_dir: str = None + namespace: str = None + _db: Any = None + _conn: Any = None + + def __post_init__(self): + if kuzu is None: + raise ImportError( + "KuzuDB is not installed. Please install it via `pip install kuzu`." 
+            )
+
+        self.db_path = os.path.join(self.working_dir, f"{self.namespace}_kuzu")
+        self._init_db()
+
+    def _init_db(self):
+        # KuzuDB automatically creates the directory
+        self._db = kuzu.Database(self.db_path)
+        self._conn = kuzu.Connection(self._db)
+        self._init_schema()
+        print(f"KuzuDB initialized at {self.db_path}")
+
+    def _init_schema(self):
+        """Initialize the generic Node and Edge tables if they don't exist."""
+        # Check and create Node table
+        try:
+            # We use a generic table name "Entity" to store all nodes
+            self._conn.execute(
+                "CREATE NODE TABLE Entity(id STRING, data STRING, PRIMARY KEY(id))"
+            )
+            print("Created KuzuDB Node Table 'Entity'")
+        except RuntimeError as e:
+            # Kuzu raises RuntimeError if the table already exists; safe to ignore here
+            print("Node Table 'Entity' already exists or error:", e)
+
+        # Check and create Edge table
+        try:
+            # We use a generic table name "Relation" to store all edges
+            self._conn.execute(
+                "CREATE REL TABLE Relation(FROM Entity TO Entity, data STRING)"
+            )
+            print("Created KuzuDB Rel Table 'Relation'")
+        except RuntimeError as e:
+            print("Rel Table 'Relation' already exists or error:", e)
+
+    def index_done_callback(self):
+        """KuzuDB commits are ACID and applied immediately, so no explicit flush is needed here."""
+
+    def has_node(self, node_id: str) -> bool:
+        result = self._conn.execute(
+            "MATCH (a:Entity {id: $id}) RETURN count(a)", {"id": node_id}
+        )
+        count = result.get_next()[0]
+        return count > 0
+
+    def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
+        result = self._conn.execute(
+            "MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity {id: $dst}) RETURN count(e)",
+            {"src": source_node_id, "dst": target_node_id},
+        )
+        count = result.get_next()[0]
+        return count > 0
+
+    def node_degree(self, node_id: str) -> int:
+        # Calculate total degree (incoming + outgoing)
+        query = """
+            MATCH (a:Entity {id: $id})-[e:Relation]-(b:Entity)
+            RETURN count(e)
+        """
+        result = self._conn.execute(query, {"id": node_id})
+        if result.has_next():
+            return result.get_next()[0]
+        return 0
+
+    def edge_degree(self, src_id: str, tgt_id: str) -> int:
+        # Count the Relation edges between this specific pair of nodes.
+        # Kuzu supports multi-edges, so we count them.
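+        # Note: unlike node_degree, this only counts directed src -> tgt edges.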
+ query = """ + MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity {id: $dst}) + RETURN count(e) + """ + result = self._conn.execute(query, {"src": src_id, "dst": tgt_id}) + if result.has_next(): + return result.get_next()[0] + return 0 + + def get_node(self, node_id: str) -> Any: + result = self._conn.execute( + "MATCH (a:Entity {id: $id}) RETURN a.data", {"id": node_id} + ) + if result.has_next(): + data_str = result.get_next()[0] + return json.loads(data_str) if data_str else {} + return None + + def update_node(self, node_id: str, node_data: dict[str, str]): + current_data = self.get_node(node_id) + if current_data is None: + print(f"Node {node_id} not found for update.") + return + + # Merge existing data with new data + current_data.update(node_data) + json_data = json.dumps(current_data, ensure_ascii=False) + + self._conn.execute( + "MATCH (a:Entity {id: $id}) SET a.data = $data", + {"id": node_id, "data": json_data}, + ) + + def get_all_nodes(self) -> Any: + """Returns List[Tuple[id, data_dict]]""" + result = self._conn.execute("MATCH (a:Entity) RETURN a.id, a.data") + nodes = [] + while result.has_next(): + row = result.get_next() + nodes.append((row[0], json.loads(row[1]))) + return nodes + + def get_edge(self, source_node_id: str, target_node_id: str): + # Warning: If multiple edges exist, this returns the first one found + query = """ + MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity {id: $dst}) + RETURN e.data + """ + result = self._conn.execute( + query, {"src": source_node_id, "dst": target_node_id} + ) + if result.has_next(): + data_str = result.get_next()[0] + return json.loads(data_str) if data_str else {} + return None + + def update_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + current_data = self.get_edge(source_node_id, target_node_id) + if current_data is None: + print(f"Edge {source_node_id}->{target_node_id} not found for update.") + return + + current_data.update(edge_data) + json_data = json.dumps(current_data, ensure_ascii=False) + + query = """ + MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity {id: $dst}) + SET e.data = $data + """ + self._conn.execute( + query, {"src": source_node_id, "dst": target_node_id, "data": json_data} + ) + + def get_all_edges(self) -> Any: + """Returns List[Tuple[src, dst, data_dict]]""" + query = "MATCH (a:Entity)-[e:Relation]->(b:Entity) RETURN a.id, b.id, e.data" + result = self._conn.execute(query) + edges = [] + while result.has_next(): + row = result.get_next() + edges.append((row[0], row[1], json.loads(row[2]))) + return edges + + def get_node_edges(self, source_node_id: str) -> Any: + """Returns generic edges connected to this node (outgoing)""" + query = """ + MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity) + RETURN a.id, b.id, e.data + """ + result = self._conn.execute(query, {"src": source_node_id}) + edges = [] + while result.has_next(): + row = result.get_next() + edges.append((row[0], row[1], json.loads(row[2]))) + return edges + + def upsert_node(self, node_id: str, node_data: dict[str, str]): + """ + Insert or Update node. + Kuzu supports MERGE clause (similar to Neo4j) to handle upserts. + """ + json_data = json.dumps(node_data, ensure_ascii=False) + query = """ + MERGE (a:Entity {id: $id}) + ON MATCH SET a.data = $data + ON CREATE SET a.data = $data + """ + self._conn.execute(query, {"id": node_id, "data": json_data}) + + def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + """ + Insert or Update edge. 
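+        Uses Cypher MERGE with ON CREATE / ON MATCH, so repeated upserts are idempotent.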
+ Note: We explicitly ensure nodes exist before merging the edge to avoid errors, + although GraphGen generally creates nodes before edges. + """ + # Ensure source node exists + if not self.has_node(source_node_id): + self.upsert_node(source_node_id, {}) + # Ensure target node exists + if not self.has_node(target_node_id): + self.upsert_node(target_node_id, {}) + + json_data = json.dumps(edge_data, ensure_ascii=False) + query = """ + MATCH (a:Entity {id: $src}), (b:Entity {id: $dst}) + MERGE (a)-[e:Relation]->(b) + ON MATCH SET e.data = $data + ON CREATE SET e.data = $data + """ + self._conn.execute( + query, {"src": source_node_id, "dst": target_node_id, "data": json_data} + ) + + def delete_node(self, node_id: str): + # DETACH DELETE removes the node and all connected edges + query = "MATCH (a:Entity {id: $id}) DETACH DELETE a" + self._conn.execute(query, {"id": node_id}) + print(f"Node {node_id} deleted from KuzuDB.") + + def clear(self): + """Clear all data but keep schema (or drop tables).""" + self._conn.execute("MATCH (n) DETACH DELETE n") + print(f"Graph {self.namespace} cleared.") + + def reload(self): + """For databases that need reloading, KuzuDB auto-manages this.""" + + def drop(self): + """Completely remove the database folder.""" + if self.db_path and os.path.exists(self.db_path): + shutil.rmtree(self.db_path) + print(f"Dropped KuzuDB at {self.db_path}") diff --git a/graphgen/models/storage/networkx_storage.py b/graphgen/models/storage/graph/networkx_storage.py similarity index 63% rename from graphgen/models/storage/networkx_storage.py rename to graphgen/models/storage/graph/networkx_storage.py index 92643760..7fb73b79 100644 --- a/graphgen/models/storage/networkx_storage.py +++ b/graphgen/models/storage/graph/networkx_storage.py @@ -1,11 +1,12 @@ -import os import html -from typing import Any, Union, cast, Optional +import os from dataclasses import dataclass +from typing import Any, Optional, Union, cast + import networkx as nx -from graphgen.utils import logger -from .base_storage import BaseGraphStorage +from graphgen.bases.base_storage import BaseGraphStorage + @dataclass class NetworkXStorage(BaseGraphStorage): @@ -17,7 +18,6 @@ def load_nx_graph(file_name) -> Optional[nx.Graph]: @staticmethod def write_nx_graph(graph: nx.Graph, file_name): - logger.info("Writing graph with %d nodes, %d edges", graph.number_of_nodes(), graph.number_of_edges()) nx.write_graphml(graph, file_name) @staticmethod @@ -69,77 +69,81 @@ def _get_edge_key(source: Any, target: Any) -> str: def __post_init__(self): """ - 如果图文件存在,则加载图文件,否则创建一个新图 + Initialize the NetworkX graph storage by loading an existing graph from a GraphML file, + if it exists, or creating a new empty graph otherwise. 
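+        The graph is written back to the same GraphML file by index_done_callback().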
""" self._graphml_xml_file = os.path.join( self.working_dir, f"{self.namespace}.graphml" ) preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file) - if preloaded_graph is not None: - logger.info( - "Loaded graph from %s with %d nodes, %d edges", self._graphml_xml_file, - preloaded_graph.number_of_nodes(), preloaded_graph.number_of_edges() + if preloaded_graph: + print( + f"Loaded graph from {self._graphml_xml_file} with " + f"{preloaded_graph.number_of_nodes()} nodes, " + f"{preloaded_graph.number_of_edges()} edges" ) self._graph = preloaded_graph or nx.Graph() - async def index_done_callback(self): + def index_done_callback(self): NetworkXStorage.write_nx_graph(self._graph, self._graphml_xml_file) - async def has_node(self, node_id: str) -> bool: + def has_node(self, node_id: str) -> bool: return self._graph.has_node(node_id) - async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: + def has_edge(self, source_node_id: str, target_node_id: str) -> bool: return self._graph.has_edge(source_node_id, target_node_id) - async def get_node(self, node_id: str) -> Union[dict, None]: + def get_node(self, node_id: str) -> Union[dict, None]: return self._graph.nodes.get(node_id) - async def get_all_nodes(self) -> Union[list[dict], None]: - return self._graph.nodes(data=True) + def get_all_nodes(self) -> Union[list[tuple[str, dict]], None]: + return list(self._graph.nodes(data=True)) - async def node_degree(self, node_id: str) -> int: - return self._graph.degree(node_id) + def node_degree(self, node_id: str) -> int: + return int(self._graph.degree[node_id]) - async def edge_degree(self, src_id: str, tgt_id: str) -> int: - return self._graph.degree(src_id) + self._graph.degree(tgt_id) + def edge_degree(self, src_id: str, tgt_id: str) -> int: + return int(self._graph.degree[src_id] + self._graph.degree[tgt_id]) - async def get_edge( - self, source_node_id: str, target_node_id: str - ) -> Union[dict, None]: + def get_edge(self, source_node_id: str, target_node_id: str) -> Union[dict, None]: return self._graph.edges.get((source_node_id, target_node_id)) - async def get_all_edges(self) -> Union[list[dict], None]: - return self._graph.edges(data=True) + def get_all_edges(self) -> Union[list[tuple[str, str, dict]], None]: + return list(self._graph.edges(data=True)) - async def get_node_edges(self, source_node_id: str) -> Union[list[tuple[str, str]], None]: + def get_node_edges(self, source_node_id: str) -> Union[list[tuple[str, str]], None]: if self._graph.has_node(source_node_id): return list(self._graph.edges(source_node_id, data=True)) return None - async def get_graph(self) -> nx.Graph: + def get_graph(self) -> nx.Graph: return self._graph - async def upsert_node(self, node_id: str, node_data: dict[str, str]): + def upsert_node(self, node_id: str, node_data: dict[str, str]): self._graph.add_node(node_id, **node_data) - async def update_node(self, node_id: str, node_data: dict[str, str]): + def update_node(self, node_id: str, node_data: dict[str, str]): if self._graph.has_node(node_id): self._graph.nodes[node_id].update(node_data) else: - logger.warning("Node %s not found in the graph for update.", node_id) + print(f"Node {node_id} not found in the graph for update.") - async def upsert_edge( + def upsert_edge( self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] ): self._graph.add_edge(source_node_id, target_node_id, **edge_data) - async def update_edge(self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]): + def update_edge( + 
self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): if self._graph.has_edge(source_node_id, target_node_id): self._graph.edges[(source_node_id, target_node_id)].update(edge_data) else: - logger.warning("Edge %s -> %s not found in the graph for update.", source_node_id, target_node_id) + print( + f"Edge {source_node_id} -> {target_node_id} not found in the graph for update." + ) - async def delete_node(self, node_id: str): + def delete_node(self, node_id: str): """ Delete a node from the graph based on the specified node_id. @@ -147,13 +151,19 @@ async def delete_node(self, node_id: str): """ if self._graph.has_node(node_id): self._graph.remove_node(node_id) - logger.info("Node %s deleted from the graph.", node_id) + print(f"Node {node_id} deleted from the graph.") else: - logger.warning("Node %s not found in the graph for deletion.", node_id) + print(f"Node {node_id} not found in the graph for deletion.") - async def clear(self): + def clear(self): """ Clear the graph by removing all nodes and edges. """ self._graph.clear() - logger.info("Graph %s cleared.", self.namespace) + print(f"Graph {self.namespace} cleared.") + + def reload(self): + """ + Reload the graph from the GraphML file. + """ + self.__post_init__() diff --git a/graphgen/models/storage/json_storage.py b/graphgen/models/storage/json_storage.py deleted file mode 100644 index e4f21e6c..00000000 --- a/graphgen/models/storage/json_storage.py +++ /dev/null @@ -1,51 +0,0 @@ -import os - -from dataclasses import dataclass -from graphgen.utils import logger, load_json, write_json -from graphgen.models.storage.base_storage import BaseKVStorage - - -@dataclass -class JsonKVStorage(BaseKVStorage): - _data: dict[str, str] = None - - def __post_init__(self): - self._file_name = os.path.join(self.working_dir, f"{self.namespace}.json") - self._data = load_json(self._file_name) or {} - logger.info("Load KV %s with %d data", self.namespace, len(self._data)) - - @property - def data(self): - return self._data - - async def all_keys(self) -> list[str]: - return list(self._data.keys()) - - async def index_done_callback(self): - write_json(self._data, self._file_name) - - async def get_by_id(self, id): - return self._data.get(id, None) - - async def get_by_ids(self, ids, fields=None) -> list: - if fields is None: - return [self._data.get(id, None) for id in ids] - return [ - ( - {k: v for k, v in self._data[id].items() if k in fields} - if self._data.get(id, None) - else None - ) - for id in ids - ] - - async def filter_keys(self, data: list[str]) -> set[str]: - return {s for s in data if s not in self._data} - - async def upsert(self, data: dict): - left_data = {k: v for k, v in data.items() if k not in self._data} - self._data.update(left_data) - return left_data - - async def drop(self): - self._data = {} diff --git a/graphgen/models/storage/kv/__init__.py b/graphgen/models/storage/kv/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/models/storage/kv/json_storage.py b/graphgen/models/storage/kv/json_storage.py new file mode 100644 index 00000000..4af8d1e5 --- /dev/null +++ b/graphgen/models/storage/kv/json_storage.py @@ -0,0 +1,97 @@ +import os +from dataclasses import dataclass +from typing import Iterator, Tuple + +from graphgen.bases.base_storage import BaseKVStorage +from graphgen.utils import load_json, write_json + + +@dataclass +class JsonKVStorage(BaseKVStorage): + _data: dict[str, dict] = None + + def __post_init__(self): + self._file_name = os.path.join(self.working_dir, 
f"{self.namespace}.json") + self._data = load_json(self._file_name) or {} + print(f"Load KV {self.namespace} with {len(self._data)} data") + + @property + def data(self): + return self._data + + def all_keys(self) -> list[str]: + return list(self._data.keys()) + + def index_done_callback(self): + write_json(self._data, self._file_name) + + def get_by_id(self, id): + return self._data.get(id, None) + + def get_by_ids(self, ids, fields=None) -> list: + if fields is None: + return [self._data.get(id, None) for id in ids] + return [ + ( + {k: v for k, v in self._data[id].items() if k in fields} + if self._data.get(id, None) + else None + ) + for id in ids + ] + + def get_all(self) -> dict[str, dict]: + return self._data + + def iter_items(self) -> Iterator[Tuple[str, dict]]: + """ + Iterate over all items without loading everything into memory at once. + Returns an iterator of (key, value) tuples. + """ + for key, value in self._data.items(): + yield key, value + + def get_batch(self, keys: list[str]) -> dict[str, dict]: + """ + Get a batch of items by their keys. + + :param keys: List of keys to retrieve. + :return: Dictionary of {key: value} for the requested keys. + """ + return {key: self._data.get(key) for key in keys if key in self._data} + + def iter_batches(self, batch_size: int = 10000) -> Iterator[dict[str, dict]]: + """ + Iterate over items in batches to avoid loading everything into memory. + + :param batch_size: Number of items per batch. + :return: Iterator of dictionaries, each containing up to batch_size items. + """ + batch = {} + count = 0 + for key, value in self._data.items(): + batch[key] = value + count += 1 + if count >= batch_size: + yield batch + batch = {} + count = 0 + if batch: + yield batch + + def filter_keys(self, data: list[str]) -> set[str]: + return {s for s in data if s not in self._data} + + def upsert(self, data: dict): + left_data = {k: v for k, v in data.items() if k not in self._data} + if left_data: + self._data.update(left_data) + return left_data + + def drop(self): + if self._data: + self._data.clear() + + def reload(self): + self._data = load_json(self._file_name) or {} + print(f"Reload KV {self.namespace} with {len(self._data)} data") diff --git a/graphgen/models/storage/kv/rocksdb_storage.py b/graphgen/models/storage/kv/rocksdb_storage.py new file mode 100644 index 00000000..45055b93 --- /dev/null +++ b/graphgen/models/storage/kv/rocksdb_storage.py @@ -0,0 +1,82 @@ +import os +from dataclasses import dataclass +from typing import Any, Dict, List, Set + +# rocksdict is a lightweight C wrapper around RocksDB for Python, pylint may not recognize it +# pylint: disable=no-name-in-module +from rocksdict import Rdict + +from graphgen.bases.base_storage import BaseKVStorage + + +@dataclass +class RocksDBKVStorage(BaseKVStorage): + _db: Rdict = None + _db_path: str = None + + def __post_init__(self): + self._db_path = os.path.join(self.working_dir, f"{self.namespace}.db") + self._db = Rdict(self._db_path) + print( + f"RocksDBKVStorage initialized for namespace '{self.namespace}' at '{self._db_path}'" + ) + + @property + def data(self): + return self._db + + def all_keys(self) -> List[str]: + return list(self._db.keys()) + + def index_done_callback(self): + self._db.flush() + print(f"RocksDB flushed for {self.namespace}") + + def get_by_id(self, id: str) -> Any: + return self._db.get(id, None) + + def get_by_ids(self, ids: List[str], fields: List[str] = None) -> List[Any]: + result = [] + for index in ids: + item = self._db.get(index, None) + if item is 
None: + result.append(None) + continue + + if fields is None: + result.append(item) + else: + result.append({k: v for k, v in item.items() if k in fields}) + return result + + def get_all(self) -> Dict[str, Dict]: + return dict(self._db) + + def filter_keys(self, data: List[str]) -> Set[str]: + return {s for s in data if s not in self._db} + + def upsert(self, data: Dict[str, Any]): + left_data = {} + for k, v in data.items(): + if k not in self._db: + left_data[k] = v + + if left_data: + for k, v in left_data.items(): + self._db[k] = v + # if left_data is very large, it is recommended to use self._db.write_batch() for optimization + + return left_data + + def drop(self): + self._db.close() + Rdict.destroy(self._db_path) + self._db = Rdict(self._db_path) + print(f"Dropped RocksDB {self.namespace}") + + def close(self): + if self._db: + self._db.close() + + def reload(self): + """For databases that need reloading, RocksDB auto-manages this.""" diff --git a/graphgen/models/storage/rocksdb_cache.py b/graphgen/models/storage/rocksdb_cache.py new file mode 100644 index 00000000..2345b5b5 --- /dev/null +++ b/graphgen/models/storage/rocksdb_cache.py @@ -0,0 +1,43 @@ +from pathlib import Path +from typing import Any, Iterator, Optional + +# rocksdict is a lightweight C wrapper around RocksDB for Python, pylint may not recognize it +# pylint: disable=no-name-in-module +from rocksdict import Rdict + + +class RocksDBCache: + def __init__(self, cache_dir: str): + self.db_path = Path(cache_dir) + self.db = Rdict(str(self.db_path)) + + def get(self, key: str) -> Optional[Any]: + return self.db.get(key) + + def set(self, key: str, value: Any): + self.db[key] = value + + def delete(self, key: str): + try: + del self.db[key] + except KeyError: + # If the key does not exist, do nothing (deletion is idempotent for caches) + pass + + def close(self): + if hasattr(self, "db") and self.db is not None: + self.db.close() + self.db = None + + def __del__(self): + # Ensure the database is closed when the object is destroyed + self.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def __iter__(self) -> Iterator[str]: + return iter(self.db.keys()) diff --git a/graphgen/models/strategy/base_strategy.py b/graphgen/models/strategy/base_strategy.py deleted file mode 100644 index 70e0cc54..00000000 --- a/graphgen/models/strategy/base_strategy.py +++ /dev/null @@ -1,5 +0,0 @@ -from dataclasses import dataclass - -@dataclass -class BaseStrategy: - pass diff --git a/graphgen/models/strategy/travserse_strategy.py b/graphgen/models/strategy/travserse_strategy.py deleted file mode 100644 index 06882c5f..00000000 --- a/graphgen/models/strategy/travserse_strategy.py +++ /dev/null @@ -1,30 +0,0 @@ -from dataclasses import dataclass, fields - -from graphgen.models.strategy.base_strategy import BaseStrategy - - -@dataclass -class TraverseStrategy(BaseStrategy): - # 生成的QA形式:原子、多跳、聚合型 - qa_form: str = "atomic" # "atomic" or "multi_hop" or "aggregated" - # 最大边数和最大token数方法中选择一个生效 - expand_method: str = "max_tokens" # "max_width" or "max_tokens" - # 单向拓展还是双向拓展 - bidirectional: bool = True - # 每个方向拓展的最大边数 - max_extra_edges: int = 5 - # 最长token数 - max_tokens: int = 256 - # 每个方向拓展的最大深度 - max_depth: int = 2 - # 同一层中选边的策略(如果是双向拓展,同一层指的是两边连接的边的集合) - edge_sampling: str = "max_loss" # "max_loss" or "min_loss" or "random" - # 孤立节点的处理策略 - isolated_node_strategy: str = "add" # "add" or "ignore" - loss_strategy: str = "only_edge" # only_edge, both - - def to_yaml(self): - 
strategy_dict = {} - for f in fields(self): - strategy_dict[f.name] = getattr(self, f.name) - return {"traverse_strategy": strategy_dict} diff --git a/graphgen/models/text/chunk.py b/graphgen/models/text/chunk.py deleted file mode 100644 index 9678949f..00000000 --- a/graphgen/models/text/chunk.py +++ /dev/null @@ -1,7 +0,0 @@ -from dataclasses import dataclass - - -@dataclass -class Chunk: - id : str - content: str diff --git a/graphgen/models/text/text_pair.py b/graphgen/models/text/text_pair.py deleted file mode 100644 index f9a971f1..00000000 --- a/graphgen/models/text/text_pair.py +++ /dev/null @@ -1,9 +0,0 @@ -from dataclasses import dataclass - -@dataclass -class TextPair: - """ - A pair of input data. - """ - question: str - answer: str diff --git a/graphgen/models/tokenizer/__init__.py b/graphgen/models/tokenizer/__init__.py new file mode 100644 index 00000000..6712f918 --- /dev/null +++ b/graphgen/models/tokenizer/__init__.py @@ -0,0 +1,50 @@ +from typing import List + +from graphgen.bases import BaseTokenizer + +from .tiktoken_tokenizer import TiktokenTokenizer + +try: + from transformers import AutoTokenizer + + _HF_AVAILABLE = True +except ImportError: + _HF_AVAILABLE = False + + +def get_tokenizer_impl(tokenizer_name: str = "cl100k_base") -> BaseTokenizer: + import tiktoken + + if tokenizer_name in tiktoken.list_encoding_names(): + return TiktokenTokenizer(model_name=tokenizer_name) + + # 2. HuggingFace + if _HF_AVAILABLE: + from .hf_tokenizer import HFTokenizer + + return HFTokenizer(model_name=tokenizer_name) + + raise ValueError( + f"Unknown tokenizer {tokenizer_name} and HuggingFace not available." + ) + + +class Tokenizer(BaseTokenizer): + """ + Encapsulates different tokenization implementations based on the specified model name. 
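+    Falls back to a HuggingFace tokenizer when the model name is not a known tiktoken encoding.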
+ """ + + def __init__(self, model_name: str = "cl100k_base"): + super().__init__(model_name) + if not self.model_name: + raise ValueError("TOKENIZER_MODEL must be specified in the ENV variables.") + self._impl = get_tokenizer_impl(self.model_name) + + def encode(self, text: str) -> List[int]: + return self._impl.encode(text) + + def decode(self, token_ids: List[int]) -> str: + return self._impl.decode(token_ids) + + def count_tokens(self, text: str) -> int: + return self._impl.count_tokens(text) diff --git a/graphgen/models/tokenizer/hf_tokenizer.py b/graphgen/models/tokenizer/hf_tokenizer.py new file mode 100644 index 00000000..c43ddd7d --- /dev/null +++ b/graphgen/models/tokenizer/hf_tokenizer.py @@ -0,0 +1,17 @@ +from typing import List + +from transformers import AutoTokenizer + +from graphgen.bases import BaseTokenizer + + +class HFTokenizer(BaseTokenizer): + def __init__(self, model_name: str = "cl100k_base"): + super().__init__(model_name) + self.enc = AutoTokenizer.from_pretrained(self.model_name) + + def encode(self, text: str) -> List[int]: + return self.enc.encode(text, add_special_tokens=False) + + def decode(self, token_ids: List[int]) -> str: + return self.enc.decode(token_ids, skip_special_tokens=True) diff --git a/graphgen/models/tokenizer/tiktoken_tokenizer.py b/graphgen/models/tokenizer/tiktoken_tokenizer.py new file mode 100644 index 00000000..6145d070 --- /dev/null +++ b/graphgen/models/tokenizer/tiktoken_tokenizer.py @@ -0,0 +1,17 @@ +from typing import List + +import tiktoken + +from graphgen.bases import BaseTokenizer + + +class TiktokenTokenizer(BaseTokenizer): + def __init__(self, model_name: str = "cl100k_base"): + super().__init__(model_name) + self.enc = tiktoken.get_encoding(self.model_name) + + def encode(self, text: str) -> List[int]: + return self.enc.encode(text) + + def decode(self, token_ids: List[int]) -> str: + return self.enc.decode(token_ids) diff --git a/graphgen/models/vis/__init__.py b/graphgen/models/vis/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/models/vis/community_visualizer.py b/graphgen/models/vis/community_visualizer.py new file mode 100644 index 00000000..05551014 --- /dev/null +++ b/graphgen/models/vis/community_visualizer.py @@ -0,0 +1,48 @@ +from dataclasses import dataclass +from typing import Dict + +import matplotlib.pyplot as plt +import networkx as nx + + +@dataclass +class Visualizer: + """ + Class for visualizing graphs using NetworkX and Matplotlib. 
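+    Nodes are colored by community id; only the "spring" layout is currently supported.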
+ """ + + graph: nx.Graph = None + communities: Dict[str, int] = None + layout: str = "spring" + max_nodes: int = 1000 + node_size: int = 10 + alpha: float = 0.6 + + def visualize(self, save_path: str = None): + n = self.graph.number_of_nodes() + if self.layout == "spring": + k = max(0.1, 1.0 / (n**0.5)) + pos = nx.spring_layout(self.graph, k=k, seed=42) + else: + raise ValueError(f"Unknown layout: {self.layout}") + + plt.figure(figsize=(10, 10)) + + node_colors = [self.communities.get(node, 0) for node in self.graph.nodes()] + + nx.draw_networkx_nodes( + self.graph, + pos, + node_size=self.node_size, + node_color=node_colors, + cmap=plt.cm.tab20, + alpha=self.alpha, + ) + nx.draw_networkx_edges(self.graph, pos, alpha=0.3, width=0.2) + plt.axis("off") + + if save_path: + plt.savefig(save_path, dpi=300, bbox_inches="tight") + print("Saved to", save_path) + else: + plt.show() diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py index 8ef14fdc..d8e78927 100644 --- a/graphgen/operators/__init__.py +++ b/graphgen/operators/__init__.py @@ -1,16 +1,21 @@ -from .extract_kg import extract_kg -from .quiz import quiz -from .judge import judge_statement, skip_judge_statement -from .search_wikipedia import search_wikipedia -from .traverse_graph import traverse_graph_by_edge, traverse_graph_atomically, traverse_graph_for_multi_hop +from .build_kg import BuildKGService +from .chunk import ChunkService +from .extract import ExtractService +from .generate import GenerateService +from .judge import JudgeService +from .partition import PartitionService +from .quiz import QuizService +from .read import read, read_files +from .search import SearchService -__all__ = [ - "extract_kg", - "quiz", - "judge_statement", - "skip_judge_statement", - "search_wikipedia", - "traverse_graph_by_edge", - "traverse_graph_atomically", - "traverse_graph_for_multi_hop" -] +operators = { + "read": read, + "chunk": ChunkService, + "build_kg": BuildKGService, + "quiz": QuizService, + "judge": JudgeService, + "extract": ExtractService, + "search": SearchService, + "partition": PartitionService, + "generate": GenerateService, +} diff --git a/graphgen/operators/build_kg/__init__.py b/graphgen/operators/build_kg/__init__.py new file mode 100644 index 00000000..a8b22ce9 --- /dev/null +++ b/graphgen/operators/build_kg/__init__.py @@ -0,0 +1 @@ +from .build_kg_service import BuildKGService diff --git a/graphgen/operators/build_kg/build_kg_service.py b/graphgen/operators/build_kg/build_kg_service.py new file mode 100644 index 00000000..76dba8a9 --- /dev/null +++ b/graphgen/operators/build_kg/build_kg_service.py @@ -0,0 +1,78 @@ +from typing import List + +import pandas as pd + +from graphgen.bases import BaseGraphStorage, BaseLLMWrapper, BaseOperator +from graphgen.bases.datatypes import Chunk +from graphgen.common import init_llm, init_storage +from graphgen.utils import logger + +from .build_mm_kg import build_mm_kg +from .build_omics_kg import build_omics_kg +from .build_text_kg import build_text_kg + + +class BuildKGService(BaseOperator): + def __init__(self, working_dir: str = "cache"): + super().__init__(working_dir=working_dir, op_name="build_kg_service") + self.llm_client: BaseLLMWrapper = init_llm("synthesizer") + self.graph_storage: BaseGraphStorage = init_storage( + backend="kuzu", working_dir=working_dir, namespace="graph" + ) + + def process(self, batch: pd.DataFrame) -> pd.DataFrame: + docs = batch.to_dict(orient="records") + docs = [Chunk.from_dict(doc["_chunk_id"], doc) for doc in docs] + + # consume 
the chunks and build kg + self.build_kg(docs) + return pd.DataFrame([{"status": "kg_building_completed"}]) + + def build_kg(self, chunks: List[Chunk]) -> None: + """ + Build knowledge graph (KG) and merge into kg_instance + """ + text_chunks = [chunk for chunk in chunks if chunk.type == "text"] + mm_chunks = [ + chunk + for chunk in chunks + if chunk.type in ("image", "video", "table", "formula") + ] + omics_chunks = [ + chunk + for chunk in chunks + if chunk.type in ("dna", "rna", "protein") + ] + + if len(text_chunks) == 0: + logger.info("All text chunks are already in the storage") + else: + logger.info("[Text Entity and Relation Extraction] processing ...") + build_text_kg( + llm_client=self.llm_client, + kg_instance=self.graph_storage, + chunks=text_chunks, + ) + if len(mm_chunks) == 0: + logger.info("All multi-modal chunks are already in the storage") + else: + logger.info("[Multi-modal Entity and Relation Extraction] processing ...") + build_mm_kg( + llm_client=self.llm_client, + kg_instance=self.graph_storage, + chunks=mm_chunks, + ) + if len(omics_chunks) == 0: + logger.info("All omics chunks are already in the storage") + else: + logger.info( + "[Omics Entity and Relation Extraction] processing %d chunks (DNA/RNA/protein)...", + len(omics_chunks) + ) + build_omics_kg( + llm_client=self.llm_client, + kg_instance=self.graph_storage, + chunks=omics_chunks, + ) + + self.graph_storage.index_done_callback() diff --git a/graphgen/operators/build_kg/build_mm_kg.py b/graphgen/operators/build_kg/build_mm_kg.py new file mode 100644 index 00000000..ee0459ea --- /dev/null +++ b/graphgen/operators/build_kg/build_mm_kg.py @@ -0,0 +1,50 @@ +from collections import defaultdict +from typing import List + +from graphgen.bases import BaseLLMWrapper +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Chunk +from graphgen.models import MMKGBuilder +from graphgen.utils import run_concurrent + + +def build_mm_kg( + llm_client: BaseLLMWrapper, + kg_instance: BaseGraphStorage, + chunks: List[Chunk], +): + """ + Build multi-modal KG and merge into kg_instance + :param llm_client: Synthesizer LLM model to extract entities and relationships + :param kg_instance + :param chunks + :return: + """ + mm_builder = MMKGBuilder(llm_client=llm_client) + + results = run_concurrent( + mm_builder.extract, + chunks, + desc="[2/4] Extracting entities and relationships from multi-modal chunks", + unit="chunk", + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + for n, e in results: + for k, v in n.items(): + nodes[k].extend(v) + for k, v in e.items(): + edges[tuple(sorted(k))].extend(v) + + run_concurrent( + lambda kv: mm_builder.merge_nodes(kv, kg_instance=kg_instance), + list(nodes.items()), + desc="Inserting entities into storage", + ) + + run_concurrent( + lambda kv: mm_builder.merge_edges(kv, kg_instance=kg_instance), + list(edges.items()), + desc="Inserting relationships into storage", + ) diff --git a/graphgen/operators/build_kg/build_omics_kg.py b/graphgen/operators/build_kg/build_omics_kg.py new file mode 100644 index 00000000..503bb7eb --- /dev/null +++ b/graphgen/operators/build_kg/build_omics_kg.py @@ -0,0 +1,51 @@ +from collections import defaultdict +from typing import List + +from graphgen.bases import BaseLLMWrapper +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Chunk +from graphgen.models import OmicsKGBuilder +from graphgen.utils import run_concurrent + + +def build_omics_kg( + llm_client: 
BaseLLMWrapper, + kg_instance: BaseGraphStorage, + chunks: List[Chunk], +): + """ + Build knowledge graph from multi-omics chunks (DNA, RNA, protein). + + :param llm_client: Synthesizer LLM model to extract entities and relationships + :param kg_instance: Graph storage instance + :param chunks: List of omics chunks + :return: None + """ + kg_builder = OmicsKGBuilder(llm_client=llm_client, max_loop=3) + + results = run_concurrent( + kg_builder.extract, + chunks, + desc="[2/4] Extracting entities and relationships from omics chunks", + unit="chunk", + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + for n, e in results: + for k, v in n.items(): + nodes[k].extend(v) + for k, v in e.items(): + edges[tuple(sorted(k))].extend(v) + + run_concurrent( + lambda kv: kg_builder.merge_nodes(kv, kg_instance=kg_instance), + list(nodes.items()), + desc="Inserting omics entities into storage", + ) + + run_concurrent( + lambda kv: kg_builder.merge_edges(kv, kg_instance=kg_instance), + list(edges.items()), + desc="Inserting omics relationships into storage", + ) diff --git a/graphgen/operators/build_kg/build_text_kg.py b/graphgen/operators/build_kg/build_text_kg.py new file mode 100644 index 00000000..1b5a8762 --- /dev/null +++ b/graphgen/operators/build_kg/build_text_kg.py @@ -0,0 +1,50 @@ +from collections import defaultdict +from typing import List + +from graphgen.bases import BaseLLMWrapper +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Chunk +from graphgen.models import LightRAGKGBuilder +from graphgen.utils import run_concurrent + + +def build_text_kg( + llm_client: BaseLLMWrapper, + kg_instance: BaseGraphStorage, + chunks: List[Chunk], +): + """ + :param llm_client: Synthesizer LLM model to extract entities and relationships + :param kg_instance + :param chunks + :return: + """ + + kg_builder = LightRAGKGBuilder(llm_client=llm_client, max_loop=3) + + results = run_concurrent( + kg_builder.extract, + chunks, + desc="[2/4]Extracting entities and relationships from chunks", + unit="chunk", + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + for n, e in results: + for k, v in n.items(): + nodes[k].extend(v) + for k, v in e.items(): + edges[tuple(sorted(k))].extend(v) + + run_concurrent( + lambda kv: kg_builder.merge_nodes(kv, kg_instance=kg_instance), + list(nodes.items()), + desc="Inserting entities into storage", + ) + + run_concurrent( + lambda kv: kg_builder.merge_edges(kv, kg_instance=kg_instance), + list(edges.items()), + desc="Inserting relationships into storage", + ) diff --git a/graphgen/operators/chunk/__init__.py b/graphgen/operators/chunk/__init__.py new file mode 100644 index 00000000..f2f116f7 --- /dev/null +++ b/graphgen/operators/chunk/__init__.py @@ -0,0 +1 @@ +from .chunk_service import ChunkService diff --git a/graphgen/operators/chunk/chunk_service.py b/graphgen/operators/chunk/chunk_service.py new file mode 100644 index 00000000..b6775764 --- /dev/null +++ b/graphgen/operators/chunk/chunk_service.py @@ -0,0 +1,101 @@ +import os +from functools import lru_cache +from typing import Union + +import pandas as pd + +from graphgen.bases import BaseOperator +from graphgen.common import init_storage +from graphgen.models import ( + ChineseRecursiveTextSplitter, + RecursiveCharacterSplitter, + Tokenizer, +) +from graphgen.utils import compute_content_hash, detect_main_language + +_MAPPING = { + "en": RecursiveCharacterSplitter, + "zh": ChineseRecursiveTextSplitter, +} + +SplitterT = Union[RecursiveCharacterSplitter, 
ChineseRecursiveTextSplitter] + + +@lru_cache(maxsize=None) +def _get_splitter(language: str, frozen_kwargs: frozenset) -> SplitterT: + cls = _MAPPING[language] + kwargs = dict(frozen_kwargs) + return cls(**kwargs) + + +def split_chunks(text: str, language: str = "en", **kwargs) -> list: + if language not in _MAPPING: + raise ValueError( + f"Unsupported language: {language}. " + f"Supported languages are: {list(_MAPPING.keys())}" + ) + frozen_kwargs = frozenset( + (k, tuple(v) if isinstance(v, list) else v) for k, v in kwargs.items() + ) + splitter = _get_splitter(language, frozen_kwargs) + return splitter.split_text(text) + + +class ChunkService(BaseOperator): + def __init__(self, working_dir: str = "cache", **chunk_kwargs): + super().__init__(working_dir=working_dir, op_name="chunk_service") + tokenizer_model = os.getenv("TOKENIZER_MODEL", "cl100k_base") + self.tokenizer_instance: Tokenizer = Tokenizer(model_name=tokenizer_model) + self.chunk_storage = init_storage( + backend="rocksdb", + working_dir=working_dir, + namespace="chunk", + ) + self.chunk_kwargs = chunk_kwargs + + def process(self, batch: pd.DataFrame) -> pd.DataFrame: + docs = batch.to_dict(orient="records") + return pd.DataFrame(self.chunk_documents(docs)) + + def chunk_documents(self, new_docs: list) -> list: + chunks = [] + for doc in new_docs: + doc_id = doc.get("_doc_id") + doc_type = doc.get("type") + + if doc_type == "text": + doc_language = detect_main_language(doc["content"]) + text_chunks = split_chunks( + doc["content"], + language=doc_language, + **self.chunk_kwargs, + ) + + chunks.extend( + [ + { + "_chunk_id": compute_content_hash( + chunk_text, prefix="chunk-" + ), + "content": chunk_text, + "type": "text", + "_doc_id": doc_id, + "length": len(self.tokenizer_instance.encode(chunk_text)) + if self.tokenizer_instance + else len(chunk_text), + "language": doc_language, + } + for chunk_text in text_chunks + ] + ) + else: + # other types of documents(images, sequences) are not chunked + chunks.append( + { + "_chunk_id": doc_id.replace("doc-", f"{doc_type}-"), + **doc, + } + ) + self.chunk_storage.upsert({chunk["_chunk_id"]: chunk for chunk in chunks}) + self.chunk_storage.index_done_callback() + return chunks diff --git a/graphgen/operators/evaluate/__init__.py b/graphgen/operators/evaluate/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/operators/evaluate/evaluate.py b/graphgen/operators/evaluate/evaluate.py new file mode 100644 index 00000000..fdbfbf82 --- /dev/null +++ b/graphgen/operators/evaluate/evaluate.py @@ -0,0 +1,177 @@ +# TODO: this module needs refactoring to merge into GraphGen framework +"""Evaluate the quality of the generated text using various metrics""" + +import argparse +import json +import os + +import pandas as pd +from dotenv import load_dotenv + +from graphgen.bases.datatypes import QAPair +from graphgen.models import ( + LengthEvaluator, + MTLDEvaluator, + RewardEvaluator, + UniEvaluator, +) +from graphgen.utils import logger, set_logger + +sys_path = os.path.abspath(os.path.dirname(__file__)) +set_logger(os.path.join(sys_path, "cache", "logs", "evaluate.log")) + +load_dotenv() + + +def evaluate_length(corpus, tokenizer_name): + length_evaluator = LengthEvaluator(tokenizer_name=tokenizer_name) + logger.info("Length evaluator loaded") + scores = length_evaluator.get_average_score(corpus) + logger.info("Length scores: %s", scores) + return scores + + +def evaluate_mtld(corpus): + mtld_evaluator = MTLDEvaluator() + logger.info("MTLD evaluator loaded") + scores 
= mtld_evaluator.get_average_score(corpus) + logger.info("MTLD scores: %s", scores) + min_max_scores = mtld_evaluator.get_min_max_score(corpus) + logger.info("MTLD min max scores: %s", min_max_scores) + return scores, min_max_scores + + +def evaluate_reward(corpus, reward_model_names): + scores = [] + for reward_name in reward_model_names: + reward_evaluator = RewardEvaluator(reward_name=reward_name) + logger.info("Loaded reward model: %s", reward_name) + average_score = reward_evaluator.get_average_score(corpus) + logger.info("%s scores: %s", reward_name, average_score) + min_max_scores = reward_evaluator.get_min_max_score(corpus) + logger.info("%s min max scores: %s", reward_name, min_max_scores) + scores.append( + { + "reward_name": reward_name.split("/")[-1], + "score": average_score, + "min_max_scores": min_max_scores, + } + ) + del reward_evaluator + clean_gpu_cache() + return scores + + +def evaluate_uni(corpus, uni_model_name): + uni_evaluator = UniEvaluator(model_name=uni_model_name) + logger.info("Uni evaluator loaded with model %s", uni_model_name) + uni_scores = uni_evaluator.get_average_score(corpus) + for key, value in uni_scores.items(): + logger.info("Uni %s scores: %s", key, value) + min_max_scores = uni_evaluator.get_min_max_score(corpus) + for key, value in min_max_scores.items(): + logger.info("Uni %s min max scores: %s", key, value) + del uni_evaluator + clean_gpu_cache() + return ( + uni_scores["naturalness"], + uni_scores["coherence"], + uni_scores["understandability"], + min_max_scores["naturalness"], + min_max_scores["coherence"], + min_max_scores["understandability"], + ) + + +def clean_gpu_cache(): + import torch + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + +if __name__ == "__main__": + import torch.multiprocessing as mp + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--folder", type=str, default="cache/data", help="folder to load data" + ) + parser.add_argument( + "--output", type=str, default="cache/output", help="path to save output" + ) + + parser.add_argument( + "--tokenizer", type=str, default="cl100k_base", help="tokenizer name" + ) + parser.add_argument( + "--reward", + type=str, + default="OpenAssistant/reward-model-deberta-v3-large-v2", + help="Comma-separated list of reward models", + ) + parser.add_argument( + "--uni", type=str, default="MingZhong/unieval-sum", help="uni model name" + ) + + args = parser.parse_args() + + if not os.path.exists(args.folder): + raise ValueError(f"Folder {args.folder} does not exist") + + if not os.path.exists(args.output): + os.makedirs(args.output) + + reward_models = args.reward.split(",") + + results = [] + + logger.info("Data loaded from %s", args.folder) + mp.set_start_method("spawn") + + for file in os.listdir(args.folder): + if file.endswith(".json"): + logger.info("Processing %s", file) + with open(os.path.join(args.folder, file), "r", encoding="utf-8") as f: + data = json.load(f) + data = [ + QAPair(question=data[key]["question"], answer=data[key]["answer"]) + for key in data + ] + + length_scores = evaluate_length(data, args.tokenizer) + mtld_scores, min_max_mtld_scores = evaluate_mtld(data) + reward_scores = evaluate_reward(data, reward_models) + ( + uni_naturalness_scores, + uni_coherence_scores, + uni_understandability_scores, + min_max_uni_naturalness_scores, + min_max_uni_coherence_scores, + min_max_uni_understandability_scores, + ) = evaluate_uni(data, args.uni) + + result = { + "file": file, + "number": len(data), + "length": length_scores, + "mtld": mtld_scores, + 
"mtld_min_max": min_max_mtld_scores, + "uni_naturalness": uni_naturalness_scores, + "uni_coherence": uni_coherence_scores, + "uni_understandability": uni_understandability_scores, + "uni_naturalness_min_max": min_max_uni_naturalness_scores, + "uni_coherence_min_max": min_max_uni_coherence_scores, + "uni_understandability_min_max": min_max_uni_understandability_scores, + } + for reward_score in reward_scores: + result[reward_score["reward_name"]] = reward_score["score"] + result[f"{reward_score['reward_name']}_min_max"] = reward_score[ + "min_max_scores" + ] + + results.append(result) + + results = pd.DataFrame(results) + results.to_csv(os.path.join(args.output, "evaluation.csv"), index=False) diff --git a/graphgen/operators/extract/__init__.py b/graphgen/operators/extract/__init__.py new file mode 100644 index 00000000..6c7c2b94 --- /dev/null +++ b/graphgen/operators/extract/__init__.py @@ -0,0 +1 @@ +from .extract_service import ExtractService diff --git a/graphgen/operators/extract/extract_service.py b/graphgen/operators/extract/extract_service.py new file mode 100644 index 00000000..33987fcb --- /dev/null +++ b/graphgen/operators/extract/extract_service.py @@ -0,0 +1,45 @@ +import json + +import pandas as pd + +from graphgen.bases import BaseLLMWrapper, BaseOperator +from graphgen.common import init_llm +from graphgen.models.extractor import SchemaGuidedExtractor +from graphgen.utils import logger, run_concurrent + + +class ExtractService(BaseOperator): + def __init__(self, working_dir: str = "cache", **extract_kwargs): + super().__init__(working_dir=working_dir, op_name="extract_service") + self.llm_client: BaseLLMWrapper = init_llm("synthesizer") + self.extract_kwargs = extract_kwargs + self.method = self.extract_kwargs.get("method") + if self.method == "schema_guided": + schema_file = self.extract_kwargs.get("schema_path") + with open(schema_file, "r", encoding="utf-8") as f: + schema = json.load(f) + self.extractor = SchemaGuidedExtractor(self.llm_client, schema) + else: + raise ValueError(f"Unsupported extraction method: {self.method}") + + def process(self, batch: pd.DataFrame) -> pd.DataFrame: + items = batch.to_dict(orient="records") + return pd.DataFrame(self.extract(items)) + + def extract(self, items: list[dict]) -> list[dict]: + + logger.info("Start extracting information from %d items", len(items)) + + results = run_concurrent( + self.extractor.extract, + items, + desc="Extracting information", + unit="item", + ) + results = self.extractor.merge_extractions(results) + + results = [ + {"_extract_id": key, "extracted_data": value} + for key, value in results.items() + ] + return results diff --git a/graphgen/operators/extract_kg.py b/graphgen/operators/extract_kg.py deleted file mode 100644 index 3fad5525..00000000 --- a/graphgen/operators/extract_kg.py +++ /dev/null @@ -1,132 +0,0 @@ -import re -import asyncio -from typing import List -from collections import defaultdict - -import gradio as gr -from tqdm.asyncio import tqdm as tqdm_async -from graphgen.models import Chunk, OpenAIModel, Tokenizer -from graphgen.models.storage.base_storage import BaseGraphStorage -from graphgen.templates import KG_EXTRACTION_PROMPT -from graphgen.utils import (logger, pack_history_conversations, split_string_by_multi_markers, - handle_single_entity_extraction, handle_single_relationship_extraction, - detect_if_chinese) -from graphgen.operators.merge_kg import merge_nodes, merge_edges - - -# pylint: disable=too-many-statements -async def extract_kg( - llm_client: OpenAIModel, - kg_instance: 
BaseGraphStorage, - tokenizer_instance: Tokenizer, - chunks: List[Chunk], - progress_bar: gr.Progress = None, - max_concurrent: int = 1000 -): - """ - :param llm_client: Synthesizer LLM model to extract entities and relationships - :param kg_instance - :param tokenizer_instance - :param chunks - :param progress_bar: Gradio progress bar to show the progress of the extraction - :param max_concurrent - :return: - """ - - semaphore = asyncio.Semaphore(max_concurrent) - - async def _process_single_content(chunk: Chunk, max_loop: int = 3): - async with semaphore: - chunk_id = chunk.id - content = chunk.content - if detect_if_chinese(content): - language = "Chinese" - else: - language = "English" - KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language - - hint_prompt = KG_EXTRACTION_PROMPT[language]["TEMPLATE"].format( - **KG_EXTRACTION_PROMPT["FORMAT"], input_text=content - ) - - final_result = await llm_client.generate_answer(hint_prompt) - logger.info('First result: %s', final_result) - - history = pack_history_conversations(hint_prompt, final_result) - for loop_index in range(max_loop): - if_loop_result = await llm_client.generate_answer( - text=KG_EXTRACTION_PROMPT[language]["IF_LOOP"], - history=history - ) - if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() - if if_loop_result != "yes": - break - - glean_result = await llm_client.generate_answer( - text=KG_EXTRACTION_PROMPT[language]["CONTINUE"], - history=history - ) - logger.info('Loop %s glean: %s', loop_index, glean_result) - - history += pack_history_conversations(KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result) - final_result += glean_result - if loop_index == max_loop - 1: - break - - records = split_string_by_multi_markers( - final_result, - [ - KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], - KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"]], - ) - - nodes = defaultdict(list) - edges = defaultdict(list) - - for record in records: - record = re.search(r"\((.*)\)", record) - if record is None: - continue - record = record.group(1) # 提取括号内的内容 - record_attributes = split_string_by_multi_markers( - record, [KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] - ) - - entity = await handle_single_entity_extraction(record_attributes, chunk_id) - if entity is not None: - nodes[entity["entity_name"]].append(entity) - continue - relation = await handle_single_relationship_extraction(record_attributes, chunk_id) - if relation is not None: - edges[(relation["src_id"], relation["tgt_id"])].append(relation) - return dict(nodes), dict(edges) - - results = [] - chunk_number = len(chunks) - async for result in tqdm_async( - asyncio.as_completed([_process_single_content(c) for c in chunks]), - total=len(chunks), - desc="[3/4]Extracting entities and relationships from chunks", - unit="chunk", - ): - try: - if progress_bar is not None: - progress_bar(len(results) / chunk_number, desc="[3/4]Extracting entities and relationships from chunks") - results.append(await result) - if progress_bar is not None and len(results) == chunk_number: - progress_bar(1, desc="[3/4]Extracting entities and relationships from chunks") - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while extracting entities and relationships from chunks: %s", e) - - nodes = defaultdict(list) - edges = defaultdict(list) - for n, e in results: - for k, v in n.items(): - nodes[k].extend(v) - for k, v in e.items(): - edges[tuple(sorted(k))].extend(v) - - await merge_nodes(nodes, kg_instance, llm_client, 
tokenizer_instance) - await merge_edges(edges, kg_instance, llm_client, tokenizer_instance) - - return kg_instance diff --git a/graphgen/operators/generate/__init__.py b/graphgen/operators/generate/__init__.py new file mode 100644 index 00000000..04057ce6 --- /dev/null +++ b/graphgen/operators/generate/__init__.py @@ -0,0 +1 @@ +from .generate_service import GenerateService diff --git a/graphgen/operators/generate/generate_service.py b/graphgen/operators/generate/generate_service.py new file mode 100644 index 00000000..7ad52dec --- /dev/null +++ b/graphgen/operators/generate/generate_service.py @@ -0,0 +1,71 @@ +import pandas as pd + +from graphgen.bases import BaseLLMWrapper, BaseOperator +from graphgen.common import init_llm +from graphgen.models import ( + AggregatedGenerator, + AtomicGenerator, + CoTGenerator, + MultiHopGenerator, + OmicsQAGenerator, + VQAGenerator, +) +from graphgen.utils import logger, run_concurrent + + +class GenerateService(BaseOperator): + """ + Generate question-answer pairs based on nodes and edges. + """ + + def __init__( + self, + working_dir: str = "cache", + method: str = "aggregated", + data_format: str = "ChatML", + ): + super().__init__(working_dir=working_dir, op_name="generate_service") + self.llm_client: BaseLLMWrapper = init_llm("synthesizer") + + self.method = method + self.data_format = data_format + + if self.method == "atomic": + self.generator = AtomicGenerator(self.llm_client) + elif self.method == "aggregated": + self.generator = AggregatedGenerator(self.llm_client) + elif self.method == "multi_hop": + self.generator = MultiHopGenerator(self.llm_client) + elif self.method == "cot": + self.generator = CoTGenerator(self.llm_client) + elif self.method == "omics_qa": + self.generator = OmicsQAGenerator(self.llm_client) + elif self.method in ["vqa"]: + self.generator = VQAGenerator(self.llm_client) + else: + raise ValueError(f"Unsupported generation mode: {method}") + + def process(self, batch: pd.DataFrame) -> pd.DataFrame: + items = batch.to_dict(orient="records") + return pd.DataFrame(self.generate(items)) + + def generate(self, items: list[dict]) -> list[dict]: + """ + Generate question-answer pairs based on nodes and edges. 
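+        Each item must provide the subgraph "nodes" and "edges" consumed by the configured generator.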
+ :param items + :return: QA pairs + """ + logger.info("[Generation] mode: %s, batches: %d", self.method, len(items)) + items = [(item["nodes"], item["edges"]) for item in items] + results = run_concurrent( + self.generator.generate, + items, + desc="[4/4]Generating QAs", + unit="batch", + ) + + results = self.generator.format_generation_results( + results, output_data_format=self.data_format + ) + + return results diff --git a/graphgen/operators/judge.py b/graphgen/operators/judge.py deleted file mode 100644 index 0292e1e4..00000000 --- a/graphgen/operators/judge.py +++ /dev/null @@ -1,188 +0,0 @@ -import math -import asyncio -from tqdm.asyncio import tqdm as tqdm_async -from graphgen.models import NetworkXStorage, OpenAIModel, JsonKVStorage -from graphgen.utils import logger, yes_no_loss_entropy -from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT - - -async def judge_statement( # pylint: disable=too-many-statements - trainee_llm_client: OpenAIModel, - graph_storage: NetworkXStorage, - rephrase_storage: JsonKVStorage, - re_judge: bool = False, - max_concurrent: int = 1000) -> NetworkXStorage: - """ - Get all edges and nodes and judge them - - :param trainee_llm_client: judge the statements to get comprehension loss - :param graph_storage: graph storage instance - :param rephrase_storage: rephrase storage instance - :param re_judge: re-judge the relations - :param max_concurrent: max concurrent - :return: - """ - - semaphore = asyncio.Semaphore(max_concurrent) - - async def _judge_single_relation( - edge: tuple, - ): - async with semaphore: - source_id = edge[0] - target_id = edge[1] - edge_data = edge[2] - - if (not re_judge) and "loss" in edge_data and edge_data["loss"] is not None: - logger.info("Edge %s -> %s already judged, loss: %s, skip", source_id, target_id, edge_data["loss"]) - return source_id, target_id, edge_data - - description = edge_data["description"] - - try: - descriptions = await rephrase_storage.get_by_id(description) - assert descriptions is not None - - judgements = [] - gts = [gt for _, gt in descriptions] - for description, gt in descriptions: - judgement = await trainee_llm_client.generate_topk_per_token( - STATEMENT_JUDGEMENT_PROMPT['TEMPLATE'].format(statement=description) - ) - judgements.append(judgement[0].top_candidates) - - loss = yes_no_loss_entropy(judgements, gts) - - logger.info("Edge %s -> %s description: %s loss: %s", source_id, target_id, description, loss) - - edge_data["loss"] = loss - except Exception as e: # pylint: disable=broad-except - logger.error("Error in judging relation %s -> %s: %s", source_id, target_id, e) - logger.info("Use default loss 0.1") - edge_data["loss"] = -math.log(0.1) - - await graph_storage.update_edge(source_id, target_id, edge_data) - return source_id, target_id, edge_data - - edges = await graph_storage.get_all_edges() - - results = [] - for result in tqdm_async( - asyncio.as_completed([_judge_single_relation(edge) for edge in edges]), - total=len(edges), - desc="Judging relations" - ): - results.append(await result) - - async def _judge_single_entity( - node: tuple, - ): - async with semaphore: - node_id = node[0] - node_data = node[1] - - if (not re_judge) and "loss" in node_data and node_data["loss"] is not None: - logger.info("Node %s already judged, loss: %s, skip", node_id, node_data["loss"]) - return node_id, node_data - - description = node_data["description"] - - try: - descriptions = await rephrase_storage.get_by_id(description) - assert descriptions is not None - - judgements = [] - gts = [gt for _, gt 
in descriptions] - for description, gt in descriptions: - judgement = await trainee_llm_client.generate_topk_per_token( - STATEMENT_JUDGEMENT_PROMPT['TEMPLATE'].format(statement=description) - ) - judgements.append(judgement[0].top_candidates) - - loss = yes_no_loss_entropy(judgements, gts) - - logger.info("Node %s description: %s loss: %s", node_id, description, loss) - - node_data["loss"] = loss - except Exception as e: # pylint: disable=broad-except - logger.error("Error in judging entity %s: %s", node_id, e) - logger.info("Use default loss 0.1") - node_data["loss"] = -math.log(0.1) - - await graph_storage.update_node(node_id, node_data) - return node_id, node_data - - nodes = await graph_storage.get_all_nodes() - - results = [] - for result in tqdm_async( - asyncio.as_completed([_judge_single_entity(node) for node in nodes]), - total=len(nodes), - desc="Judging entities" - ): - results.append(await result) - - return graph_storage - -async def skip_judge_statement( - graph_storage: NetworkXStorage, - max_concurrent: int = 1000 -): - """ - Skip the judgement of the statement - :param graph_storage: graph storage instance - :param max_concurrent: max concurrent - :return: - """ - semaphore = asyncio.Semaphore(max_concurrent) - - async def _skip_single_relation( - edge: tuple, - ): - async with semaphore: - source_id = edge[0] - target_id = edge[1] - edge_data = edge[2] - - if "loss" in edge_data and edge_data["loss"] is not None: - logger.info("Edge %s -> %s already judged, loss: %s, skip", source_id, target_id, edge_data["loss"]) - return source_id, target_id, edge_data - - edge_data["loss"] = -math.log(0.1) - await graph_storage.update_edge(source_id, target_id, edge_data) - return source_id, target_id, edge_data - - edges = await graph_storage.get_all_edges() - results = [] - for result in tqdm_async( - asyncio.as_completed([_skip_single_relation(edge) for edge in edges]), - total=len(edges), - desc="Skipping judgement of relations" - ): - results.append(await result) - - async def _skip_single_entity( - node: tuple, - ): - async with semaphore: - node_id = node[0] - node_data = node[1] - - if "loss" in node_data and node_data["loss"] is not None: - logger.info("Node %s already judged, loss: %s, skip", node_id, node_data["loss"]) - return node_id, node_data - - node_data["loss"] = -math.log(0.1) - await graph_storage.update_node(node_id, node_data) - return node_id, node_data - - nodes = await graph_storage.get_all_nodes() - results = [] - for result in tqdm_async( - asyncio.as_completed([_skip_single_entity(node) for node in nodes]), - total=len(nodes), - desc="Skipping judgement of entities" - ): - results.append(await result) - - return graph_storage diff --git a/graphgen/operators/judge/__init__.py b/graphgen/operators/judge/__init__.py new file mode 100644 index 00000000..32ccf5c2 --- /dev/null +++ b/graphgen/operators/judge/__init__.py @@ -0,0 +1 @@ +from .judge_service import JudgeService diff --git a/graphgen/operators/judge/judge_service.py b/graphgen/operators/judge/judge_service.py new file mode 100644 index 00000000..35797084 --- /dev/null +++ b/graphgen/operators/judge/judge_service.py @@ -0,0 +1,70 @@ +import math + +import pandas as pd + +from graphgen.bases import BaseGraphStorage, BaseLLMWrapper, BaseOperator +from graphgen.common import init_llm, init_storage +from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT +from graphgen.utils import logger, run_concurrent, yes_no_loss_entropy + + +class JudgeService(BaseOperator): + """Service for judging graph edges and 
nodes using a trainee LLM.""" + + def __init__(self, working_dir: str = "cache"): + super().__init__(working_dir=working_dir, op_name="judge_service") + self.llm_client: BaseLLMWrapper = init_llm("trainee") + self.graph_storage: BaseGraphStorage = init_storage( + backend="kuzu", + working_dir=working_dir, + namespace="graph", + ) + + def process(self, batch: pd.DataFrame) -> pd.DataFrame: + items = batch.to_dict(orient="records") + self.graph_storage.reload() + self.judge(items) + return pd.DataFrame([{"status": "judging_completed"}]) + + async def _process_single_judge(self, item: dict) -> dict: + description = item["description"] + try: + judgement = await self.llm_client.generate_topk_per_token( + STATEMENT_JUDGEMENT_PROMPT["TEMPLATE"].format(statement=description) + ) + top_candidates = judgement[0].top_candidates + gt = item.get("ground_truth", "yes") + loss = yes_no_loss_entropy([top_candidates], [gt]) + logger.debug("Description: %s Loss: %s", description, loss) + item["loss"] = loss + except Exception as e: # pylint: disable=broad-except + logger.error("Error in judging description: %s", e) + logger.info("Use default loss 0.1") + item["loss"] = -math.log(0.1) + return item + + def judge(self, items: list[dict]) -> None: + """ + Judge the description in the item and compute the loss. + """ + results = run_concurrent( + self._process_single_judge, + items, + desc="Judging descriptions", + unit="description", + ) + # Update the graph storage with the computed losses + for item in results: + index = item["index"] + loss = item["loss"] + if isinstance(index, str): + node_id = index + node_data = self.graph_storage.get_node(node_id) + node_data["loss"] = loss + self.graph_storage.update_node(node_id, node_data) + elif isinstance(index, tuple): + edge_source, edge_target = index + edge_data = self.graph_storage.get_edge(edge_source, edge_target) + edge_data["loss"] = loss + self.graph_storage.update_edge(edge_source, edge_target, edge_data) + self.graph_storage.index_done_callback() diff --git a/graphgen/operators/merge_kg.py b/graphgen/operators/merge_kg.py deleted file mode 100644 index 33aa1395..00000000 --- a/graphgen/operators/merge_kg.py +++ /dev/null @@ -1,215 +0,0 @@ -from collections import Counter -import asyncio -from tqdm.asyncio import tqdm as tqdm_async - -from graphgen.utils.format import split_string_by_multi_markers -from graphgen.utils import logger, detect_main_language -from graphgen.models import TopkTokenModel, Tokenizer -from graphgen.models.storage.base_storage import BaseGraphStorage -from graphgen.templates import KG_SUMMARIZATION_PROMPT, KG_EXTRACTION_PROMPT - -async def _handle_kg_summary( - entity_or_relation_name: str, - description: str, - llm_client: TopkTokenModel, - tokenizer_instance: Tokenizer, - max_summary_tokens: int = 200 -) -> str: - """ - 处理实体或关系的描述信息 - - :param entity_or_relation_name - :param description - :param llm_client - :param tokenizer_instance - :param max_summary_tokens - :return: new description - """ - language = detect_main_language(description) - if language == "en": - language = "English" - else: - language = "Chinese" - KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language - - tokens = tokenizer_instance.encode_string(description) - if len(tokens) < max_summary_tokens: - return description - - use_description = tokenizer_instance.decode_tokens(tokens[:max_summary_tokens]) - prompt = KG_SUMMARIZATION_PROMPT[language]["TEMPLATE"].format( - entity_name=entity_or_relation_name, - description_list=use_description.split(''), - 
**KG_SUMMARIZATION_PROMPT["FORMAT"] - ) - new_description = await llm_client.generate_answer(prompt) - logger.info("Entity or relation %s summary: %s", entity_or_relation_name, new_description) - return new_description - - -async def merge_nodes( - nodes_data: dict, - kg_instance: BaseGraphStorage, - llm_client: TopkTokenModel, - tokenizer_instance: Tokenizer, - max_concurrent: int = 1000 -): - """ - Merge nodes - - :param nodes_data - :param kg_instance - :param llm_client - :param tokenizer_instance - :param max_concurrent - :return - """ - - semaphore = asyncio.Semaphore(max_concurrent) - - async def process_single_node(entity_name: str, node_data: list[dict]): - async with semaphore: - entity_types = [] - source_ids = [] - descriptions = [] - - node = await kg_instance.get_node(entity_name) - if node is not None: - entity_types.append(node["entity_type"]) - source_ids.extend( - split_string_by_multi_markers(node["source_id"], ['']) - ) - descriptions.append(node["description"]) - - # 统计当前节点数据和已有节点数据的entity_type出现次数,取出现次数最多的entity_type - entity_type = sorted( - Counter( - [dp["entity_type"] for dp in node_data] + entity_types - ).items(), - key=lambda x: x[1], - reverse=True, - )[0][0] - - description = ''.join( - sorted(set([dp["description"] for dp in node_data] + descriptions)) - ) - description = await _handle_kg_summary( - entity_name, description, llm_client, tokenizer_instance - ) - - source_id = ''.join( - set([dp["source_id"] for dp in node_data] + source_ids) - ) - - node_data = { - "entity_type": entity_type, - "description": description, - "source_id": source_id - } - await kg_instance.upsert_node( - entity_name, - node_data=node_data - ) - node_data["entity_name"] = entity_name - return node_data - - logger.info("Inserting entities into storage...") - entities_data = [] - for result in tqdm_async( - asyncio.as_completed( - [process_single_node(k, v) for k, v in nodes_data.items()] - ), - total=len(nodes_data), - desc="Inserting entities into storage", - unit="entity", - ): - try: - entities_data.append(await result) - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while inserting entities into storage: %s", e) - - -async def merge_edges( - edges_data: dict, - kg_instance: BaseGraphStorage, - llm_client: TopkTokenModel, - tokenizer_instance: Tokenizer, - max_concurrent: int = 1000 -): - """ - Merge edges - - :param edges_data - :param kg_instance - :param llm_client - :param tokenizer_instance - :param max_concurrent - :return - """ - - semaphore = asyncio.Semaphore(max_concurrent) - - async def process_single_edge(src_id: str, tgt_id: str, edge_data: list[dict]): - async with semaphore: - source_ids = [] - descriptions = [] - - edge = await kg_instance.get_edge(src_id, tgt_id) - if edge is not None: - source_ids.extend( - split_string_by_multi_markers(edge["source_id"], ['']) - ) - descriptions.append(edge["description"]) - - description = ''.join( - sorted(set([dp["description"] for dp in edge_data] + descriptions)) - ) - source_id = ''.join( - set([dp["source_id"] for dp in edge_data] + source_ids) - ) - - for insert_id in [src_id, tgt_id]: - if not await kg_instance.has_node(insert_id): - await kg_instance.upsert_node( - insert_id, - node_data={ - "source_id": source_id, - "description": description, - "entity_type": "UNKNOWN" - } - ) - - description = await _handle_kg_summary( - f"({src_id}, {tgt_id})", description, llm_client, tokenizer_instance - ) - - await kg_instance.upsert_edge( - src_id, - tgt_id, - edge_data={ - "source_id": 
source_id, - "description": description - } - ) - - edge_data = { - "src_id": src_id, - "tgt_id": tgt_id, - "description": description - } - return edge_data - - logger.info("Inserting relationships into storage...") - relationships_data = [] - for result in tqdm_async( - asyncio.as_completed( - [process_single_edge(src_id, tgt_id, v) for (src_id, tgt_id), v in edges_data.items()] - ), - total=len(edges_data), - desc="Inserting relationships into storage", - unit="relationship", - ): - try: - relationships_data.append(await result) - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while inserting relationships into storage: %s", e) diff --git a/graphgen/operators/partition/__init__.py b/graphgen/operators/partition/__init__.py new file mode 100644 index 00000000..8d586b95 --- /dev/null +++ b/graphgen/operators/partition/__init__.py @@ -0,0 +1 @@ +from .partition_service import PartitionService diff --git a/graphgen/operators/partition/partition_service.py b/graphgen/operators/partition/partition_service.py new file mode 100644 index 00000000..e56a093e --- /dev/null +++ b/graphgen/operators/partition/partition_service.py @@ -0,0 +1,292 @@ +import json +import os +from typing import Iterable + +import pandas as pd + +from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseOperator, BaseTokenizer +from graphgen.common import init_storage +from graphgen.models import ( + AnchorBFSPartitioner, + BFSPartitioner, + DFSPartitioner, + ECEPartitioner, + LeidenPartitioner, + Tokenizer, +) +from graphgen.utils import logger + + +class PartitionService(BaseOperator): + def __init__(self, working_dir: str = "cache", **partition_kwargs): + super().__init__(working_dir=working_dir, op_name="partition_service") + self.kg_instance: BaseGraphStorage = init_storage( + backend="kuzu", + working_dir=working_dir, + namespace="graph", + ) + self.chunk_storage: BaseKVStorage = init_storage( + backend="rocksdb", + working_dir=working_dir, + namespace="chunk", + ) + tokenizer_model = os.getenv("TOKENIZER_MODEL", "cl100k_base") + self.tokenizer_instance: BaseTokenizer = Tokenizer(model_name=tokenizer_model) + self.partition_kwargs = partition_kwargs + + def process(self, batch: pd.DataFrame) -> Iterable[pd.DataFrame]: + # this operator does not consume any batch data + # but for compatibility we keep the interface + _ = batch.to_dict(orient="records") + self.kg_instance.reload() + self.chunk_storage.reload() + + yield from self.partition() + + def partition(self) -> Iterable[pd.DataFrame]: + method = self.partition_kwargs["method"] + method_params = self.partition_kwargs["method_params"] + if method == "bfs": + logger.info("Partitioning knowledge graph using BFS method.") + partitioner = BFSPartitioner() + elif method == "dfs": + logger.info("Partitioning knowledge graph using DFS method.") + partitioner = DFSPartitioner() + elif method == "ece": + logger.info("Partitioning knowledge graph using ECE method.") + # TODO: before ECE partitioning, we need to: + # 1. 'quiz' and 'judge' to get the comprehension loss if unit_sampling is not random + # 2. 
pre-tokenize nodes and edges to get the token length + self._pre_tokenize() + partitioner = ECEPartitioner() + elif method == "leiden": + logger.info("Partitioning knowledge graph using Leiden method.") + partitioner = LeidenPartitioner() + elif method == "anchor_bfs": + logger.info("Partitioning knowledge graph using Anchor BFS method.") + anchor_type = method_params.get("anchor_type") + if isinstance(anchor_type, list): + logger.info("Using multiple anchor types: %s", anchor_type) + else: + logger.info("Using single anchor type: %s", anchor_type) + partitioner = AnchorBFSPartitioner( + anchor_type=anchor_type, + anchor_ids=set(method_params.get("anchor_ids", [])) + if method_params.get("anchor_ids") + else None, + ) + else: + raise ValueError(f"Unsupported partition method: {method}") + + communities = partitioner.partition(g=self.kg_instance, **method_params) + + for community in communities: + batch = partitioner.community2batch(community, g=self.kg_instance) + batch = self._attach_additional_data_to_node(batch) + + yield pd.DataFrame( + { + "nodes": [batch[0]], + "edges": [batch[1]], + } + ) + + def _pre_tokenize(self) -> None: + """Pre-tokenize all nodes and edges to add token length information.""" + logger.info("Starting pre-tokenization of nodes and edges...") + + nodes = self.kg_instance.get_all_nodes() + edges = self.kg_instance.get_all_edges() + + # Process nodes + for node_id, node_data in nodes: + if "length" not in node_data: + try: + description = node_data.get("description", "") + tokens = self.tokenizer_instance.encode(description) + node_data["length"] = len(tokens) + self.kg_instance.update_node(node_id, node_data) + except Exception as e: + logger.warning("Failed to tokenize node %s: %s", node_id, e) + node_data["length"] = 0 + + # Process edges + for u, v, edge_data in edges: + if "length" not in edge_data: + try: + description = edge_data.get("description", "") + tokens = self.tokenizer_instance.encode(description) + edge_data["length"] = len(tokens) + self.kg_instance.update_edge(u, v, edge_data) + except Exception as e: + logger.warning("Failed to tokenize edge %s-%s: %s", u, v, e) + edge_data["length"] = 0 + + # Persist changes + self.kg_instance.index_done_callback() + logger.info("Pre-tokenization completed.") + + def _attach_additional_data_to_node(self, batch: tuple) -> tuple: + """ + Attach additional data from chunk_storage to nodes in the batch. + :param batch: tuple of (nodes_data, edges_data) + :return: updated batch with additional data attached to nodes + """ + nodes_data, edges_data = batch + + for node_id, node_data in nodes_data: + entity_type = (node_data.get("entity_type") or "").lower() + + if not entity_type: + continue + + source_ids = [ + sid.strip() + for sid in node_data.get("source_id", "").split("") + if sid.strip() + ] + + if not source_ids: + continue + + # Handle images + if "image" in entity_type: + image_chunks = [ + data + for sid in source_ids + if "image" in sid.lower() + and (data := self.chunk_storage.get_by_id(sid)) + ] + if image_chunks: + # The generator expects a dictionary with an 'img_path' key, not a list of captions. + # We'll use the first image chunk found for this node. 
+ node_data["image_data"] = json.loads(image_chunks[0]["content"]) + logger.debug("Attached image data to node %s", node_id) + + # Handle omics data (protein/dna/rna) + molecule_type = None + if entity_type in ("protein", "dna", "rna"): + molecule_type = entity_type + else: + # Infer from source_id prefix + for sid in source_ids: + sid_lower = sid.lower() + if sid_lower.startswith("protein-"): + molecule_type = "protein" + break + if sid_lower.startswith("dna-"): + molecule_type = "dna" + break + if sid_lower.startswith("rna-"): + molecule_type = "rna" + break + + if molecule_type: + omics_chunks = [ + data + for sid in source_ids + if (data := self.chunk_storage.get_by_id(sid)) + ] + + if not omics_chunks: + logger.warning( + "No chunks found for node %s (type: %s) with source_ids: %s", + node_id, molecule_type, source_ids + ) + continue + + def get_chunk_value(chunk: dict, field: str): + # First check root level of chunk + if field in chunk: + return chunk[field] + # Then check metadata if it exists and is a dict + chunk_metadata = chunk.get("metadata") + if isinstance(chunk_metadata, dict) and field in chunk_metadata: + return chunk_metadata[field] + return None + + # Group chunks by molecule type to preserve all types of sequences + chunks_by_type = {"dna": [], "rna": [], "protein": []} + for chunk in omics_chunks: + chunk_id = chunk.get("_chunk_id", "").lower() + if chunk_id.startswith("dna-"): + chunks_by_type["dna"].append(chunk) + elif chunk_id.startswith("rna-"): + chunks_by_type["rna"].append(chunk) + elif chunk_id.startswith("protein-"): + chunks_by_type["protein"].append(chunk) + + # Field mappings for each molecule type + field_mapping = { + "protein": [ + "protein_name", "gene_names", "organism", "function", + "sequence", "id", "database", "entry_name", "uniprot_id" + ], + "dna": [ + "gene_name", "gene_description", "organism", "chromosome", + "genomic_location", "function", "gene_type", "sequence", + "id", "database" + ], + "rna": [ + "rna_type", "description", "organism", "related_genes", + "gene_name", "so_term", "sequence", "id", "database", + "rnacentral_id" + ], + } + + # Extract and store captions for each molecule type + for mol_type in ["dna", "rna", "protein"]: + type_chunks = chunks_by_type[mol_type] + if not type_chunks: + continue + + # Use the first chunk of this type + type_chunk = type_chunks[0] + caption = {} + + # Extract all relevant fields for this molecule type + for field in field_mapping.get(mol_type, []): + value = get_chunk_value(type_chunk, field) + if value: + caption[field] = value + + # Store caption if it has any data + if caption: + caption_key = f"{mol_type}_caption" + node_data[caption_key] = caption + logger.debug("Stored %s caption for node %s with %d fields", mol_type, node_id, len(caption)) + + # For backward compatibility, also attach sequence and other fields from the primary molecule type + # Use the detected molecule_type or default to the first available type + primary_chunk = None + if chunks_by_type.get(molecule_type): + primary_chunk = chunks_by_type[molecule_type][0] + elif chunks_by_type["dna"]: + primary_chunk = chunks_by_type["dna"][0] + elif chunks_by_type["rna"]: + primary_chunk = chunks_by_type["rna"][0] + elif chunks_by_type["protein"]: + primary_chunk = chunks_by_type["protein"][0] + else: + primary_chunk = omics_chunks[0] + + # Attach sequence if not already present (for backward compatibility) + if "sequence" not in node_data: + sequence = get_chunk_value(primary_chunk, "sequence") + if sequence: + node_data["sequence"] 
= sequence + + # Attach molecule_type if not present + if "molecule_type" not in node_data: + chunk_molecule_type = get_chunk_value(primary_chunk, "molecule_type") + if chunk_molecule_type: + node_data["molecule_type"] = chunk_molecule_type + + # Attach molecule-specific fields from primary chunk (for backward compatibility) + for field in field_mapping.get(molecule_type, []): + if field not in node_data: + value = get_chunk_value(primary_chunk, field) + if value: + node_data[field] = value + + return nodes_data, edges_data diff --git a/graphgen/operators/preprocess/__init__.py b/graphgen/operators/preprocess/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/operators/resolute_coreference.py b/graphgen/operators/preprocess/resolute_coreference.py similarity index 60% rename from graphgen/operators/resolute_coreference.py rename to graphgen/operators/preprocess/resolute_coreference.py index 4a1012fb..a4da6a86 100644 --- a/graphgen/operators/resolute_coreference.py +++ b/graphgen/operators/preprocess/resolute_coreference.py @@ -1,12 +1,14 @@ from typing import List -from graphgen.models import Chunk -from graphgen.models import OpenAIModel -from graphgen.templates import COREFERENCE_RESOLUTION_TEMPLATE + +from graphgen.bases.datatypes import Chunk +from graphgen.models import OpenAIClient +from graphgen.templates import COREFERENCE_RESOLUTION_PROMPT from graphgen.utils import detect_main_language + async def resolute_coreference( - llm_client: OpenAIModel, - chunks: List[Chunk]) -> List[Chunk]: + llm_client: OpenAIClient, chunks: List[Chunk] +) -> List[Chunk]: """ Resolute conference @@ -23,9 +25,8 @@ async def resolute_coreference( for _, chunk in enumerate(chunks[1:]): language = detect_main_language(chunk.content) result = await llm_client.generate_answer( - COREFERENCE_RESOLUTION_TEMPLATE[language].format( - reference = results[0].content, - input_sentence = chunk.content + COREFERENCE_RESOLUTION_PROMPT[language].format( + reference=results[0].content, input_sentence=chunk.content ) ) results.append(Chunk(id=chunk.id, content=result)) diff --git a/graphgen/operators/quiz.py b/graphgen/operators/quiz.py deleted file mode 100644 index 36edddb1..00000000 --- a/graphgen/operators/quiz.py +++ /dev/null @@ -1,109 +0,0 @@ -import asyncio -from collections import defaultdict - -from tqdm.asyncio import tqdm as tqdm_async -from graphgen.models import JsonKVStorage, OpenAIModel, NetworkXStorage -from graphgen.utils import logger, detect_main_language -from graphgen.templates import DESCRIPTION_REPHRASING_PROMPT - - -async def quiz( - synth_llm_client: OpenAIModel, - graph_storage: NetworkXStorage, - rephrase_storage: JsonKVStorage, - max_samples: int = 1, - max_concurrent: int = 1000) -> JsonKVStorage: - """ - Get all edges and quiz them - - :param synth_llm_client: generate statements - :param graph_storage: graph storage instance - :param rephrase_storage: rephrase storage instance - :param max_samples: max samples for each edge - :param max_concurrent: max concurrent - :return: - """ - - semaphore = asyncio.Semaphore(max_concurrent) - - async def _process_single_quiz( - des: str, - prompt: str, - gt: str - ): - async with semaphore: - try: - # 如果在rephrase_storage中已经存在,直接取出 - descriptions = await rephrase_storage.get_by_id(des) - if descriptions: - return None - - new_description = await synth_llm_client.generate_answer( - prompt, - temperature=1 - ) - return {des: [(new_description, gt)]} - - except Exception as e: # pylint: disable=broad-except - 
logger.error("Error when quizzing description %s: %s", des, e) - return None - - - edges = await graph_storage.get_all_edges() - nodes = await graph_storage.get_all_nodes() - - results = defaultdict(list) - tasks = [] - for edge in edges: - edge_data = edge[2] - - description = edge_data["description"] - language = "English" if detect_main_language(description) == "en" else "Chinese" - - results[description] = [(description, 'yes')] - - for i in range(max_samples): - if i > 0: - tasks.append( - _process_single_quiz(description, - DESCRIPTION_REPHRASING_PROMPT[language]['TEMPLATE'].format( - input_sentence=description), 'yes') - ) - tasks.append(_process_single_quiz(description, - DESCRIPTION_REPHRASING_PROMPT[language]['ANTI_TEMPLATE'].format( - input_sentence=description), 'no')) - - for node in nodes: - node_data = node[1] - description = node_data["description"] - language = "English" if detect_main_language(description) == "en" else "Chinese" - - results[description] = [(description, 'yes')] - - for i in range(max_samples): - if i > 0: - tasks.append( - _process_single_quiz(description, - DESCRIPTION_REPHRASING_PROMPT[language]['TEMPLATE'].format( - input_sentence=description), 'yes') - ) - tasks.append(_process_single_quiz(description, - DESCRIPTION_REPHRASING_PROMPT[language]['ANTI_TEMPLATE'].format( - input_sentence=description), 'no')) - - for result in tqdm_async( - asyncio.as_completed(tasks), - total=len(tasks), - desc="Quizzing descriptions" - ): - new_result = await result - if new_result: - for key, value in new_result.items(): - results[key].extend(value) - - for key, value in results.items(): - results[key] = list(set(value)) - await rephrase_storage.upsert({key: results[key]}) - - - return rephrase_storage diff --git a/graphgen/operators/quiz/__init__.py b/graphgen/operators/quiz/__init__.py new file mode 100644 index 00000000..2a931f4b --- /dev/null +++ b/graphgen/operators/quiz/__init__.py @@ -0,0 +1 @@ +from .quiz_service import QuizService diff --git a/graphgen/operators/quiz/quiz_service.py b/graphgen/operators/quiz/quiz_service.py new file mode 100644 index 00000000..66dc1193 --- /dev/null +++ b/graphgen/operators/quiz/quiz_service.py @@ -0,0 +1,112 @@ +from collections.abc import Iterable + +import pandas as pd + +from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseLLMWrapper, BaseOperator +from graphgen.common import init_llm, init_storage +from graphgen.models import QuizGenerator +from graphgen.utils import compute_dict_hash, logger, run_concurrent + + +class QuizService(BaseOperator): + def __init__( + self, + working_dir: str = "cache", + quiz_samples: int = 1, + concurrency_limit: int = 200, + ): + super().__init__(working_dir=working_dir, op_name="quiz_service") + self.quiz_samples = quiz_samples + self.llm_client: BaseLLMWrapper = init_llm("synthesizer") + self.graph_storage: BaseGraphStorage = init_storage( + backend="networkx", working_dir=working_dir, namespace="graph" + ) + # { _quiz_id: { "description": str, "quizzes": List[Tuple[str, str]] } } + self.quiz_storage: BaseKVStorage = init_storage( + backend="rocksdb", working_dir=working_dir, namespace="quiz" + ) + self.generator = QuizGenerator(self.llm_client) + self.concurrency_limit = concurrency_limit + + def process(self, batch: pd.DataFrame) -> Iterable[pd.DataFrame]: + # this operator does not consume any batch data + # but for compatibility we keep the interface + _ = batch.to_dict(orient="records") + self.graph_storage.reload() + yield from self.quiz() + + async def 
_process_single_quiz(self, item: tuple) -> dict | None: + # if quiz in quiz_storage exists already, directly get it + index, desc = item + _quiz_id = compute_dict_hash({"index": index, "description": desc}) + if self.quiz_storage.get_by_id(_quiz_id): + return None + + tasks = [] + for i in range(self.quiz_samples): + if i > 0: + tasks.append((desc, "TEMPLATE", "yes")) + tasks.append((desc, "ANTI_TEMPLATE", "no")) + try: + quizzes = [] + for d, template_type, gt in tasks: + prompt = self.generator.build_prompt_for_description(d, template_type) + new_description = await self.llm_client.generate_answer( + prompt, temperature=1 + ) + rephrased_text = self.generator.parse_rephrased_text(new_description) + quizzes.append((rephrased_text, gt)) + return { + "_quiz_id": _quiz_id, + "description": desc, + "index": index, + "quizzes": quizzes, + } + except Exception as e: + logger.error("Error when quizzing description %s: %s", item, e) + return None + + def quiz(self) -> Iterable[pd.DataFrame]: + """ + Get all nodes and edges and quiz their descriptions using QuizGenerator. + """ + edges = self.graph_storage.get_all_edges() + nodes = self.graph_storage.get_all_nodes() + + items = [] + + for edge in edges: + edge_data = edge[2] + desc = edge_data["description"] + items.append(((edge[0], edge[1]), desc)) + + for node in nodes: + node_data = node[1] + desc = node_data["description"] + items.append((node[0], desc)) + + logger.info("Total descriptions to quiz: %d", len(items)) + + for i in range(0, len(items), self.concurrency_limit): + batch_items = items[i : i + self.concurrency_limit] + batch_results = run_concurrent( + self._process_single_quiz, + batch_items, + desc=f"Quizzing descriptions ({i} / {i + len(batch_items)})", + unit="description", + ) + + final_results = [] + for new_result in batch_results: + if new_result: + self.quiz_storage.upsert( + { + new_result["_quiz_id"]: { + "description": new_result["description"], + "quizzes": new_result["quizzes"], + } + } + ) + final_results.append(new_result) + self.quiz_storage.index_done_callback() + yield pd.DataFrame(final_results) diff --git a/graphgen/operators/read/__init__.py b/graphgen/operators/read/__init__.py new file mode 100644 index 00000000..cd22453e --- /dev/null +++ b/graphgen/operators/read/__init__.py @@ -0,0 +1 @@ +from .read import read, read_files diff --git a/graphgen/operators/read/parallel_file_scanner.py b/graphgen/operators/read/parallel_file_scanner.py new file mode 100644 index 00000000..84219139 --- /dev/null +++ b/graphgen/operators/read/parallel_file_scanner.py @@ -0,0 +1,216 @@ +import os +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import Any, Dict, List, Set, Union + +from graphgen.models import RocksDBCache + + +class ParallelFileScanner: + def __init__( + self, cache_dir: str, allowed_suffix, rescan: bool = False, max_workers: int = 4 + ): + self.cache = RocksDBCache(os.path.join(cache_dir, "input_paths.db")) + self.allowed_suffix = set(allowed_suffix) if allowed_suffix else None + self.rescan = rescan + self.max_workers = max_workers + + def scan( + self, paths: Union[str, List[str]], recursive: bool = True + ) -> Dict[str, Any]: + if isinstance(paths, str): + paths = [paths] + + results = {} + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + future_to_path = {} + for p in paths: + if os.path.exists(p): + future = executor.submit( + self._scan_files, Path(p).resolve(), recursive, set() + ) + future_to_path[future] = p + + for 
future in as_completed(future_to_path): + path = future_to_path[future] + try: + results[path] = future.result() + except Exception as e: + results[path] = { + "error": str(e), + "files": [], + "dirs": [], + "stats": {}, + } + return results + + def _scan_files( + self, path: Path, recursive: bool, visited: Set[str] + ) -> Dict[str, Any]: + path_str = str(path) + + # Avoid cycles due to symlinks + if path_str in visited: + return self._empty_result(path_str) + + # cache check + cache_key = f"scan::{path_str}::recursive::{recursive}" + cached = self.cache.get(cache_key) + if cached and not self.rescan: + return cached["data"] + + files, dirs = [], [] + stats = {"total_size": 0, "file_count": 0, "dir_count": 0, "errors": 0} + + try: + path_stat = path.stat() + if path.is_file(): + return self._scan_single_file(path, path_str, path_stat) + if path.is_dir(): + with os.scandir(path_str) as entries: + for entry in entries: + try: + entry_stat = entry.stat(follow_symlinks=False) + + if entry.is_dir(): + dirs.append( + { + "path": entry.path, + "name": entry.name, + "mtime": entry_stat.st_mtime, + } + ) + stats["dir_count"] += 1 + else: + # allowed suffix filter + if not self._is_allowed_file(Path(entry.path)): + continue + files.append( + { + "path": entry.path, + "name": entry.name, + "size": entry_stat.st_size, + "mtime": entry_stat.st_mtime, + } + ) + stats["total_size"] += entry_stat.st_size + stats["file_count"] += 1 + + except OSError: + stats["errors"] += 1 + + except (PermissionError, FileNotFoundError, OSError) as e: + return {"error": str(e), "files": [], "dirs": [], "stats": stats} + + if recursive: + sub_visited = visited | {path_str} + sub_results = self._scan_subdirs(dirs, sub_visited) + + for sub_data in sub_results.values(): + files.extend(sub_data.get("files", [])) + stats["total_size"] += sub_data["stats"].get("total_size", 0) + stats["file_count"] += sub_data["stats"].get("file_count", 0) + + result = {"path": path_str, "files": files, "dirs": dirs, "stats": stats} + self._cache_result(cache_key, result, path) + return result + + def _scan_single_file( + self, path: Path, path_str: str, stat: os.stat_result + ) -> Dict[str, Any]: + """Scan a single file and return its metadata""" + if not self._is_allowed_file(path): + return self._empty_result(path_str) + + return { + "path": path_str, + "files": [ + { + "path": path_str, + "name": path.name, + "size": stat.st_size, + "mtime": stat.st_mtime, + } + ], + "dirs": [], + "stats": { + "total_size": stat.st_size, + "file_count": 1, + "dir_count": 0, + "errors": 0, + }, + } + + def _scan_subdirs(self, dir_list: List[Dict], visited: Set[str]) -> Dict[str, Any]: + """ + Parallel scan subdirectories + :param dir_list + :param visited + :return: + """ + results = {} + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = { + executor.submit(self._scan_files, Path(d["path"]), True, visited): d[ + "path" + ] + for d in dir_list + } + + for future in as_completed(futures): + path = futures[future] + try: + results[path] = future.result() + except Exception as e: + results[path] = { + "error": str(e), + "files": [], + "dirs": [], + "stats": {}, + } + + return results + + def _cache_result(self, key: str, result: Dict, path: Path): + """Cache the scan result""" + self.cache.set( + key, + { + "data": result, + "dir_mtime": path.stat().st_mtime, + "cached_at": time.time(), + }, + ) + + def _is_allowed_file(self, path: Path) -> bool: + """Check if the file has an allowed suffix""" + if self.allowed_suffix is None: + 
return True + suffix = path.suffix.lower().lstrip(".") + return suffix in self.allowed_suffix + + def invalidate(self, path: str): + """Invalidate cache for a specific path""" + path = Path(path).resolve() + keys = [k for k in self.cache if k.startswith(f"scan::{path}")] + for k in keys: + self.cache.delete(k) + + def close(self): + self.cache.close() + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + @staticmethod + def _empty_result(path: str) -> Dict[str, Any]: + return { + "path": path, + "files": [], + "dirs": [], + "stats": {"total_size": 0, "file_count": 0, "dir_count": 0, "errors": 0}, + } diff --git a/graphgen/operators/read/read.py b/graphgen/operators/read/read.py new file mode 100644 index 00000000..0f23160f --- /dev/null +++ b/graphgen/operators/read/read.py @@ -0,0 +1,199 @@ +from pathlib import Path +from typing import Any, Dict, Iterator, List, Optional, Union + +import ray + +from graphgen.models import ( + CSVReader, + JSONReader, + ParquetReader, + PDFReader, + PickleReader, + RDFReader, + TXTReader, +) +from graphgen.utils import compute_mm_hash, logger + +from .parallel_file_scanner import ParallelFileScanner + +_MAPPING = { + "jsonl": JSONReader, + "json": JSONReader, + "txt": TXTReader, + "csv": CSVReader, + "md": TXTReader, + "pdf": PDFReader, + "parquet": ParquetReader, + "pickle": PickleReader, + "rdf": RDFReader, + "owl": RDFReader, + "ttl": RDFReader, +} + + +def _build_reader(suffix: str, cache_dir: str | None, **reader_kwargs): + """Factory function to build appropriate reader instance""" + suffix = suffix.lower() + reader_cls = _MAPPING.get(suffix) + if not reader_cls: + raise ValueError(f"Unsupported file suffix: {suffix}") + + # Special handling for PDFReader which needs output_dir + if suffix == "pdf": + if cache_dir is None: + raise ValueError("cache_dir must be provided for PDFReader") + return reader_cls(output_dir=cache_dir, **reader_kwargs) + + return reader_cls(**reader_kwargs) + + +def read( + input_path: Union[str, List[str]], + allowed_suffix: Optional[List[str]] = None, + cache_dir: Optional[str] = "cache", + parallelism: int = 4, + recursive: bool = True, + **reader_kwargs: Any, +) -> ray.data.Dataset: + """ + Unified entry point to read files of multiple types using Ray Data. + + :param input_path: File or directory path(s) to read from + :param allowed_suffix: List of allowed file suffixes (e.g., ['pdf', 'txt']) + :param cache_dir: Directory to cache intermediate files (PDF processing) + :param parallelism: Number of parallel workers + :param recursive: Whether to scan directories recursively + :param reader_kwargs: Additional kwargs passed to readers + :return: Ray Dataset containing all documents + """ + try: + # 1. Scan all paths to discover files + logger.info("[READ] Scanning paths: %s", input_path) + scanner = ParallelFileScanner( + cache_dir=cache_dir, + allowed_suffix=allowed_suffix, + rescan=False, + max_workers=parallelism if parallelism > 0 else 1, + ) + + all_files = [] + scan_results = scanner.scan(input_path, recursive=recursive) + + for result in scan_results.values(): + all_files.extend(result.get("files", [])) + + logger.info("[READ] Found %d files to process", len(all_files)) + + if not all_files: + raise ValueError("No files found to read.") + + # 2. 
Group files by suffix to use appropriate reader + files_by_suffix = {} + for file_info in all_files: + suffix = Path(file_info["path"]).suffix.lower().lstrip(".") + if allowed_suffix and suffix not in [ + s.lower().lstrip(".") for s in allowed_suffix + ]: + continue + files_by_suffix.setdefault(suffix, []).append(file_info["path"]) + + # 3. Create read tasks + read_tasks = [] + for suffix, file_paths in files_by_suffix.items(): + reader = _build_reader(suffix, cache_dir, **reader_kwargs) + ds = reader.read(file_paths) + read_tasks.append(ds) + + # 4. Combine all datasets + if not read_tasks: + raise ValueError("No datasets created from the provided files.") + + if len(read_tasks) == 1: + combined_ds = read_tasks[0] + else: + combined_ds = read_tasks[0].union(*read_tasks[1:]) + + combined_ds = combined_ds.map( + lambda record: { + **record, + "_doc_id": compute_mm_hash(record, prefix="doc-"), + } + ) + + logger.info("[READ] Successfully read files from %s", input_path) + return combined_ds + + except Exception as e: + logger.error("[READ] Failed to read files from %s: %s", input_path, e) + raise + + +def read_files( + input_file: str, + allowed_suffix: Optional[List[str]] = None, + cache_dir: Optional[str] = None, + max_workers: int = 4, + rescan: bool = False, +) -> Iterator[Dict[str, Any]]: + """ + Read files from a path using parallel scanning and appropriate readers. + Returns an iterator for streaming (backward compatibility with graphgen.py). + + Args: + input_file: Path to a file or directory + allowed_suffix: List of file suffixes to read. If None, uses all supported types + cache_dir: Directory for caching PDF extraction and scan results + max_workers: Number of workers for parallel scanning + rescan: Whether to force rescan even if cached results exist + + Returns: + Iterator of dictionaries containing the data (for streaming) + """ + path = Path(input_file).expanduser() + if not path.exists(): + raise FileNotFoundError(f"input_path not found: {input_file}") + + if allowed_suffix is None: + support_suffix = set(_MAPPING.keys()) + else: + support_suffix = {s.lower().lstrip(".") for s in allowed_suffix} + + with ParallelFileScanner( + cache_dir=cache_dir or "cache", + allowed_suffix=support_suffix, + rescan=rescan, + max_workers=max_workers, + ) as scanner: + scan_results = scanner.scan(str(path), recursive=True) + + # Extract files from scan results + files_to_read = [] + for path_result in scan_results.values(): + if "error" in path_result: + logger.warning("Error scanning %s: %s", path_result.path, path_result.error) + continue + files_to_read.extend(path_result.get("files", [])) + + logger.info( + "Found %d eligible file(s) under folder %s (allowed_suffix=%s)", + len(files_to_read), + input_file, + support_suffix, + ) + + for file_info in files_to_read: + try: + file_path = file_info["path"] + suffix = Path(file_path).suffix.lstrip(".").lower() + reader = _build_reader(suffix, cache_dir) + + # Prefer stream reading if available (for memory efficiency) + if hasattr(reader, "read_stream"): + yield from reader.read_stream(file_path) + else: + # Fallback to regular read() method - convert Ray Dataset to iterator + ds = reader.read([file_path]) + yield from ds.iter_rows() + + except Exception as e: # pylint: disable=broad-except + logger.exception("Error reading %s: %s", file_info.get("path"), e) diff --git a/graphgen/operators/search/__init__.py b/graphgen/operators/search/__init__.py new file mode 100644 index 00000000..47144c77 --- /dev/null +++ 
b/graphgen/operators/search/__init__.py @@ -0,0 +1 @@ +from .search_service import SearchService diff --git a/graphgen/operators/search/multi_omics_search.py b/graphgen/operators/search/multi_omics_search.py new file mode 100644 index 00000000..fbe10f06 --- /dev/null +++ b/graphgen/operators/search/multi_omics_search.py @@ -0,0 +1,29 @@ +import re +from typing import Dict, Optional + +from graphgen.models import UniProtSearch + + +def _fetch_uniprot(entry: str) -> Optional[Dict]: + entry = entry.strip() + client = UniProtSearch() + + # 1. first try accession search + if re.fullmatch( + r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}", entry + ): + return client.get_by_accession(entry) + + # 2. then try keyword search + return client.get_best_hit(entry) + + +def multi_omics_search(entry: str) -> Dict: + """ + Multi-omics search function that tries to fetch protein/gene information. + """ + # TODO: Extend this function to include more omics databases as needed. + result = _fetch_uniprot(entry) + if result: + return {"input": entry, "uniprot": result} + return {"input": entry, "uniprot": None} diff --git a/graphgen/operators/search/search_service.py b/graphgen/operators/search/search_service.py new file mode 100644 index 00000000..a948b612 --- /dev/null +++ b/graphgen/operators/search/search_service.py @@ -0,0 +1,412 @@ +""" +To use Google Web Search API, +follow the instructions [here](https://developers.google.com/custom-search/v1/overview) +to get your Google searcher api key. + +To use Bing Web Search API, +follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) +and obtain your Bing subscription key. +""" + +import pandas as pd + +from graphgen.bases import BaseOperator +from graphgen.utils import create_event_loop, run_concurrent + + +class SearchService(BaseOperator): + """ + Service class for performing searches across multiple data sources. + Provides search functionality for UniProt, NCBI, and RNAcentral databases. + """ + + def __init__( + self, + working_dir: str = "cache", + data_sources: list = None, + ncbi_params: dict = None, + uniprot_params: dict = None, + rnacentral_params: dict = None, + save_interval: int = 1000, + **kwargs, + ): + super().__init__(working_dir=working_dir, op_name="search_service") + self.working_dir = working_dir + + # Build search_config dictionary from parameters + self.search_config = { + "data_sources": data_sources or [], + } + + if ncbi_params: + self.search_config["ncbi_params"] = ncbi_params + if uniprot_params: + self.search_config["uniprot_params"] = uniprot_params + if rnacentral_params: + self.search_config["rnacentral_params"] = rnacentral_params + + self.save_interval = save_interval + self.search_storage = None # Optional: can be initialized if needed for saving intermediate results + + async def _perform_searches(self, seed_data: dict) -> dict: + """ + Internal method to perform searches across multiple search types and aggregate the results. + :param seed_data: A dictionary containing seed data with entity names. 
+ :return: A dictionary with search results + """ + results = {} + data_sources = self.search_config.get("data_sources", []) + + for data_source in data_sources: + data = list(seed_data.values()) + data = [d["content"] for d in data if "content" in d] + data = list(set(data)) # Remove duplicates + + # Prepare save callback for this data source + def make_save_callback(source_name): + def save_callback(intermediate_results, completed_count): + """Save intermediate search results.""" + if self.search_storage is None: + return + + # Convert results list to dict format + # Results are tuples of (query, result_dict) or just result_dict + batch_results = {} + for result in intermediate_results: + if result is None: + continue + # Check if result is a dict with _search_query key + if isinstance(result, dict) and "_search_query" in result: + query = result["_search_query"] + # Create a key for the result (using query as key) + key = f"{source_name}:{query}" + batch_results[key] = result + elif isinstance(result, dict): + # If no _search_query, use a generated key + key = f"{source_name}:{completed_count}" + batch_results[key] = result + + if batch_results: + # Filter out already existing keys + new_keys = self.search_storage.filter_keys(list(batch_results.keys())) + new_results = {k: v for k, v in batch_results.items() if k in new_keys} + if new_results: + self.search_storage.upsert(new_results) + self.search_storage.index_done_callback() + self.logger.debug("Saved %d intermediate results for %s", len(new_results), source_name) + + return save_callback + + if data_source == "uniprot": + from graphgen.models import UniProtSearch + + uniprot_params = self.search_config.get("uniprot_params", {}).copy() + # Get max_concurrent from config before passing params to constructor + max_concurrent = uniprot_params.pop("max_concurrent", None) + + uniprot_search_client = UniProtSearch( + working_dir=self.working_dir, + **uniprot_params + ) + + uniprot_results = await run_concurrent( + uniprot_search_client.search, + data, + desc="Searching UniProt database", + unit="keyword", + save_interval=self.save_interval if self.save_interval > 0 else 0, + save_callback=( + make_save_callback("uniprot") + if self.search_storage and self.save_interval > 0 + else None + ), + max_concurrent=max_concurrent, + ) + results[data_source] = uniprot_results + + elif data_source == "ncbi": + from graphgen.models import NCBISearch + + ncbi_params = self.search_config.get("ncbi_params", {}).copy() + # Get max_concurrent from config before passing params to constructor + max_concurrent = ncbi_params.pop("max_concurrent", None) + + ncbi_search_client = NCBISearch( + working_dir=self.working_dir, + **ncbi_params + ) + + ncbi_results = await run_concurrent( + ncbi_search_client.search, + data, + desc="Searching NCBI database", + unit="keyword", + save_interval=self.save_interval if self.save_interval > 0 else 0, + save_callback=( + make_save_callback("ncbi") + if self.search_storage and self.save_interval > 0 + else None + ), + max_concurrent=max_concurrent, + ) + results[data_source] = ncbi_results + + elif data_source == "rnacentral": + from graphgen.models import RNACentralSearch + + rnacentral_params = self.search_config.get("rnacentral_params", {}).copy() + # Get max_concurrent from config before passing params to constructor + max_concurrent = rnacentral_params.pop("max_concurrent", None) + + rnacentral_search_client = RNACentralSearch( + working_dir=self.working_dir, + **rnacentral_params + ) + + rnacentral_results = await 
run_concurrent( + rnacentral_search_client.search, + data, + desc="Searching RNAcentral database", + unit="keyword", + save_interval=self.save_interval if self.save_interval > 0 else 0, + save_callback=( + make_save_callback("rnacentral") + if self.search_storage and self.save_interval > 0 + else None + ), + max_concurrent=max_concurrent, + ) + results[data_source] = rnacentral_results + + else: + self.logger.error("Data source %s not supported.", data_source) + continue + + return results + + def _is_already_searched(self, doc: dict) -> bool: + """ + Check if a document already contains search results. + + :param doc: Document dictionary + :return: True if document appears to already contain search results + """ + # Check for data_source field (added by search_service) + if "data_source" in doc and doc["data_source"]: + return True + + # Check for database field (added by search operations) + if "database" in doc and doc["database"] in ["UniProt", "NCBI", "RNAcentral"]: + # Also check for molecule_type to confirm it's a search result + if "molecule_type" in doc and doc["molecule_type"] in ["DNA", "RNA", "protein"]: + return True + + # Check for search-specific fields that indicate search results + search_indicators = [ + "uniprot_id", "entry_name", # UniProt + "gene_id", "gene_name", "chromosome", # NCBI + "rnacentral_id", "rna_type", # RNAcentral + ] + if any(indicator in doc for indicator in search_indicators): + # Make sure it's not just metadata by checking for database or molecule_type + if "database" in doc or "molecule_type" in doc: + return True + + return False + + def _normalize_searched_data(self, doc: dict) -> dict: + """ + Normalize a document that already contains search results to the expected format. + + :param doc: Document dictionary with search results + :return: Normalized document dictionary + """ + # Ensure required fields exist + doc_id = doc.get("_doc_id") + if not doc_id: + # Generate doc_id from id or other fields + raw_doc_id = doc.get("id") or doc.get("_search_query") or f"doc-{hash(str(doc))}" + doc_id = str(raw_doc_id) + + # Ensure doc_id starts with "doc-" prefix + if not doc_id.startswith("doc-"): + doc_id = f"doc-{doc_id}" + + # Determine document type from molecule_type or existing type + doc_type = doc.get("type", "text") + if doc_type == "text" and "molecule_type" in doc: + molecule_type = doc.get("molecule_type", "").lower() + if molecule_type in ["dna", "rna", "protein"]: + doc_type = molecule_type + + # Ensure data_source field exists + data_source = doc.get("data_source") + if not data_source: + # Infer from database field + database = doc.get("database", "").lower() + if "uniprot" in database: + data_source = "uniprot" + elif "ncbi" in database: + data_source = "ncbi" + elif "rnacentral" in database or "rna" in database: + data_source = "rnacentral" + + # Build or preserve content field + content = doc.get("content") + if not content or content.strip() == "": + # Build content from available fields if missing + content_parts = [] + if doc.get("title"): + content_parts.append(f"Title: {doc['title']}") + if doc.get("description"): + content_parts.append(f"Description: {doc['description']}") + if doc.get("function"): + func = doc["function"] + if isinstance(func, list): + func = ", ".join(str(f) for f in func) + content_parts.append(f"Function: {func}") + if doc.get("sequence"): + content_parts.append(f"Sequence: {doc['sequence']}") + + if not content_parts: + # Fallback: create content from key fields + key_fields = ["protein_name", "gene_name", 
"gene_description", "organism"] + for field in key_fields: + if field in doc and doc[field]: + content_parts.append(f"{field}: {doc[field]}") + + content = "\n".join(content_parts) if content_parts else str(doc) + + # Create normalized row + normalized_doc = { + "_doc_id": doc_id, + "type": doc_type, + "content": content, + "data_source": data_source, + **doc, # Include all original fields for metadata + } + + return normalized_doc + + def process(self, batch: pd.DataFrame) -> pd.DataFrame: + """ + Process a batch of documents and perform searches. + This is the Ray Data operator interface. + + If input data already contains search results (detected by presence of + data_source, database, or search-specific fields), the search step is + skipped and the data is normalized and returned directly. + + :param batch: DataFrame containing documents with at least '_doc_id' and 'content' columns + :return: DataFrame containing search results + """ + # Convert DataFrame to dictionary format + docs = batch.to_dict(orient="records") + + # Check if data already contains search results + already_searched = all(self._is_already_searched(doc) for doc in docs if doc) + + if already_searched: + # Data already contains search results, normalize and return directly + self.logger.info( + "Input data already contains search results. " + "Skipping search step and normalizing data." + ) + result_rows = [] + for doc in docs: + if not doc: + continue + normalized_doc = self._normalize_searched_data(doc) + result_rows.append(normalized_doc) + + if not result_rows: + self.logger.warning("No documents found in batch") + return pd.DataFrame(columns=["_doc_id", "type", "content", "data_source"]) + + return pd.DataFrame(result_rows) + + # Data doesn't contain search results, perform search as usual + seed_data = {doc.get("_doc_id", f"doc-{i}"): doc for i, doc in enumerate(docs)} + + # Perform searches asynchronously + loop, created = create_event_loop() + try: + if loop.is_running(): + # If loop is already running, we can't use run_until_complete + # This shouldn't happen in normal usage, but handle it gracefully + raise RuntimeError( + "Cannot use process when event loop is already running. " + "This is likely a Ray worker configuration issue." 
+ ) + search_results = loop.run_until_complete( + self._perform_searches(seed_data) + ) + finally: + # Only close the loop if we created it + if created: + loop.close() + + # Convert search_results from {data_source: [results]} to DataFrame + # Each result becomes a document row compatible with chunk service + result_rows = [] + + for data_source, result_list in search_results.items(): + if not isinstance(result_list, list): + continue + + for result in result_list: + if result is None: + continue + + # Convert search result to document format expected by chunk service + # Build content from available fields + content_parts = [] + if result.get("title"): + content_parts.append(f"Title: {result['title']}") + if result.get("description"): + content_parts.append(f"Description: {result['description']}") + if result.get("function"): + content_parts.append(f"Function: {result['function']}") + if result.get("sequence"): + content_parts.append(f"Sequence: {result['sequence']}") + + # If no content parts, use a default or combine all fields + if not content_parts: + # Fallback: create content from all string fields + content_parts = [ + f"{k}: {v}" + for k, v in result.items() + if isinstance(v, (str, int, float)) and k != "_search_query" + ] + + content = "\n".join(content_parts) if content_parts else str(result) + + # Determine document type from molecule_type or default to "text" + doc_type = result.get("molecule_type", "text").lower() + if doc_type not in ["text", "dna", "rna", "protein"]: + doc_type = "text" + + # Convert to string to handle Ray Data ListElement and other types + raw_doc_id = result.get("id") or result.get("_search_query") or f"search-{len(result_rows)}" + doc_id = str(raw_doc_id) + + # Ensure doc_id starts with "doc-" prefix + if not doc_id.startswith("doc-"): + doc_id = f"doc-{doc_id}" + + # Create document row with all result fields plus required fields + row = { + "_doc_id": doc_id, + "type": doc_type, + "content": content, + "data_source": data_source, + **result, # Include all original result fields for metadata + } + result_rows.append(row) + + if not result_rows: + self.logger.warning("No search results generated for this batch") + # Return empty DataFrame with expected structure + return pd.DataFrame(columns=["_doc_id", "type", "content", "data_source"]) + + return pd.DataFrame(result_rows) diff --git a/graphgen/operators/search_wikipedia.py b/graphgen/operators/search_wikipedia.py deleted file mode 100644 index d3d7e283..00000000 --- a/graphgen/operators/search_wikipedia.py +++ /dev/null @@ -1,71 +0,0 @@ -import asyncio -from graphgen.models import WikiSearch, OpenAIModel -from graphgen.models.storage.base_storage import BaseGraphStorage -from graphgen.templates import SEARCH_JUDGEMENT_PROMPT -from graphgen.utils import logger - - -async def _process_single_entity(entity_name: str, - description: str, - llm_client: OpenAIModel, - wiki_search_client: WikiSearch) -> tuple[str, None] | tuple[str, str]: - """ - Process single entity - - """ - search_results = await wiki_search_client.search(entity_name) - if not search_results: - return entity_name, None - examples = "\n".join(SEARCH_JUDGEMENT_PROMPT["EXAMPLES"]) - search_results.append("None of the above") - - search_results_str = "\n".join([f"{i + 1}. 
{sr}" for i, sr in enumerate(search_results)]) - prompt = SEARCH_JUDGEMENT_PROMPT["TEMPLATE"].format( - examples=examples, - entity_name=entity_name, - description=description, - search_results=search_results_str, - ) - response = await llm_client.generate_answer(prompt) - - try: - response = response.strip() - response = int(response) - if response < 1 or response >= len(search_results): - response = None - else: - response = await wiki_search_client.summary(search_results[response - 1]) - except ValueError: - response = None - - logger.info("Entity %s search result: %s response: %s", entity_name, str(search_results), response) - - return entity_name, response - -async def search_wikipedia(llm_client: OpenAIModel, - wiki_search_client: WikiSearch, - knowledge_graph_instance: BaseGraphStorage,) -> dict: - """ - Search wikipedia for entities - - :param llm_client: LLM model - :param wiki_search_client: wiki search client - :param knowledge_graph_instance: knowledge graph instance - :return: nodes with search results - """ - - - nodes = await knowledge_graph_instance.get_all_nodes() - nodes = list(nodes) - wiki_data = {} - - tasks = [ - _process_single_entity(node[0].strip('"'), node[1]["description"], llm_client, wiki_search_client) - for node in nodes - ] - - for task in asyncio.as_completed(tasks): - result = await task - wiki_data[result[0]] = result[1] - - return wiki_data diff --git a/graphgen/operators/split_graph.py b/graphgen/operators/split_graph.py deleted file mode 100644 index e2e2b5ca..00000000 --- a/graphgen/operators/split_graph.py +++ /dev/null @@ -1,333 +0,0 @@ -import random -from collections import defaultdict -from tqdm.asyncio import tqdm as tqdm_async -from graphgen.utils import logger - -from graphgen.models import NetworkXStorage, TraverseStrategy - -async def _get_node_info( - node_id: str, - graph_storage: NetworkXStorage, -)-> dict: - """ - Get node info - - :param node_id: node id - :param graph_storage: graph storage instance - :return: node info - """ - node_data = await graph_storage.get_node(node_id) - return { - "node_id": node_id, - **node_data - } - - -def _get_level_n_edges_by_max_width( - edge_adj_list: dict, - node_dict: dict, - edges: list, - nodes, - src_edge: tuple, - max_depth: int, - bidirectional: bool, - max_extra_edges: int, - edge_sampling: str, - loss_strategy: str = "only_edge" -) -> list: - """ - Get level n edges for an edge. 
- n is decided by max_depth in traverse_strategy - - :param edge_adj_list - :param node_dict - :param edges - :param nodes - :param src_edge - :param max_depth - :param bidirectional - :param max_extra_edges - :param edge_sampling - :return: level n edges - """ - src_id, tgt_id, _ = src_edge - - level_n_edges = [] - - start_nodes = {tgt_id} if not bidirectional else {src_id, tgt_id} - - while max_depth > 0 and max_extra_edges > 0: - max_depth -= 1 - - candidate_edges = [ - edges[edge_id] - for node in start_nodes - for edge_id in edge_adj_list[node] - if not edges[edge_id][2].get("visited", False) - ] - - if not candidate_edges: - break - - if len(candidate_edges) >= max_extra_edges: - if loss_strategy == "both": - er_tuples = [([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) for edge in candidate_edges] - candidate_edges = _sort_tuples(er_tuples, edge_sampling)[:max_extra_edges] - elif loss_strategy == "only_edge": - candidate_edges = _sort_edges(candidate_edges, edge_sampling)[:max_extra_edges] - else: - raise ValueError(f"Invalid loss strategy: {loss_strategy}") - - for edge in candidate_edges: - level_n_edges.append(edge) - edge[2]["visited"] = True - break - - max_extra_edges -= len(candidate_edges) - new_start_nodes = set() - - for edge in candidate_edges: - level_n_edges.append(edge) - edge[2]["visited"] = True - - if not edge[0] in start_nodes: - new_start_nodes.add(edge[0]) - if not edge[1] in start_nodes: - new_start_nodes.add(edge[1]) - - start_nodes = new_start_nodes - - return level_n_edges - - -def _get_level_n_edges_by_max_tokens( - edge_adj_list: dict, - node_dict: dict, - edges: list, - nodes: list, - src_edge: tuple, - max_depth: int, - bidirectional: bool, - max_tokens: int, - edge_sampling: str, - loss_strategy: str = "only_edge" -) -> list: - """ - Get level n edges for an edge. - n is decided by max_depth in traverse_strategy. 
- - :param edge_adj_list - :param node_dict - :param edges - :param nodes - :param src_edge - :param max_depth - :param bidirectional - :param max_tokens - :param edge_sampling - :return: level n edges - """ - src_id, tgt_id, src_edge_data = src_edge - - max_tokens -= (src_edge_data["length"] + nodes[node_dict[src_id]][1]["length"] - + nodes[node_dict[tgt_id]][1]["length"]) - - level_n_edges = [] - - start_nodes = {tgt_id} if not bidirectional else {src_id, tgt_id} - temp_nodes = {src_id, tgt_id} - - while max_depth > 0 and max_tokens > 0: - max_depth -= 1 - - candidate_edges = [ - edges[edge_id] - for node in start_nodes - for edge_id in edge_adj_list[node] - if not edges[edge_id][2].get("visited", False) - ] - - if not candidate_edges: - break - - if loss_strategy == "both": - er_tuples = [([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) for edge in candidate_edges] - candidate_edges = _sort_tuples(er_tuples, edge_sampling) - elif loss_strategy == "only_edge": - candidate_edges = _sort_edges(candidate_edges, edge_sampling) - else: - raise ValueError(f"Invalid loss strategy: {loss_strategy}") - - for edge in candidate_edges: - max_tokens -= edge[2]["length"] - if not edge[0] in temp_nodes: - max_tokens -= nodes[node_dict[edge[0]]][1]["length"] - if not edge[1] in temp_nodes: - max_tokens -= nodes[node_dict[edge[1]]][1]["length"] - - if max_tokens < 0: - return level_n_edges - - level_n_edges.append(edge) - edge[2]["visited"] = True - temp_nodes.add(edge[0]) - temp_nodes.add(edge[1]) - - new_start_nodes = set() - for edge in candidate_edges: - if not edge[0] in start_nodes: - new_start_nodes.add(edge[0]) - if not edge[1] in start_nodes: - new_start_nodes.add(edge[1]) - - start_nodes = new_start_nodes - - return level_n_edges - - -def _sort_tuples(er_tuples: list, edge_sampling: str) -> list: - """ - Sort edges with edge sampling strategy - - :param er_tuples: [(nodes:list, edge:tuple)] - :param edge_sampling: edge sampling strategy (random, min_loss, max_loss) - :return: sorted edges - """ - if edge_sampling == "random": - er_tuples = random.sample(er_tuples, len(er_tuples)) - elif edge_sampling == "min_loss": - er_tuples = sorted(er_tuples, key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"]) - elif edge_sampling == "max_loss": - er_tuples = sorted(er_tuples, key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"], - reverse=True) - else: - raise ValueError(f"Invalid edge sampling: {edge_sampling}") - edges = [edge for _, edge in er_tuples] - return edges - -def _sort_edges(edges: list, edge_sampling: str) -> list: - """ - Sort edges with edge sampling strategy - - :param edges: total edges - :param edge_sampling: edge sampling strategy (random, min_loss, max_loss) - :return: sorted edges - """ - if edge_sampling == "random": - random.shuffle(edges) - elif edge_sampling == "min_loss": - edges = sorted(edges, key=lambda x: x[2]["loss"]) - elif edge_sampling == "max_loss": - edges = sorted(edges, key=lambda x: x[2]["loss"], reverse=True) - else: - raise ValueError(f"Invalid edge sampling: {edge_sampling}") - return edges - -async def get_batches_with_strategy( # pylint: disable=too-many-branches - nodes: list, - edges: list, - graph_storage: NetworkXStorage, - traverse_strategy: TraverseStrategy -): - expand_method = traverse_strategy.expand_method - if expand_method == "max_width": - logger.info("Using max width strategy") - elif expand_method == "max_tokens": - logger.info("Using max tokens strategy") - else: - raise ValueError(f"Invalid 
expand method: {expand_method}") - - max_depth = traverse_strategy.max_depth - edge_sampling = traverse_strategy.edge_sampling - - # 构建临接矩阵 - edge_adj_list = defaultdict(list) - node_dict = {} - processing_batches = [] - - node_cache = {} - - async def get_cached_node_info(node_id: str) -> dict: - if node_id not in node_cache: - node_cache[node_id] = await _get_node_info(node_id, graph_storage) - return node_cache[node_id] - - for i, (node_name, _) in enumerate(nodes): - node_dict[node_name] = i - - if traverse_strategy.loss_strategy == "both": - er_tuples = [([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) for edge in edges] - edges = _sort_tuples(er_tuples, edge_sampling) - elif traverse_strategy.loss_strategy == "only_edge": - edges = _sort_edges(edges, edge_sampling) - else: - raise ValueError(f"Invalid loss strategy: {traverse_strategy.loss_strategy}") - - for i, (src, tgt, _) in enumerate(edges): - edge_adj_list[src].append(i) - edge_adj_list[tgt].append(i) - - for edge in tqdm_async(edges, desc="Preparing batches"): - if "visited" in edge[2] and edge[2]["visited"]: - continue - - edge[2]["visited"] = True - - _process_nodes = [] - _process_edges = [] - - src_id = edge[0] - tgt_id = edge[1] - - _process_nodes.extend([await get_cached_node_info(src_id), - await get_cached_node_info(tgt_id)]) - _process_edges.append(edge) - - if expand_method == "max_width": - level_n_edges = _get_level_n_edges_by_max_width( - edge_adj_list, node_dict, edges, nodes, edge, max_depth, - traverse_strategy.bidirectional, traverse_strategy.max_extra_edges, - edge_sampling, traverse_strategy.loss_strategy - ) - else: - level_n_edges = _get_level_n_edges_by_max_tokens( - edge_adj_list, node_dict, edges, nodes, edge, max_depth, - traverse_strategy.bidirectional, traverse_strategy.max_tokens, - edge_sampling, traverse_strategy.loss_strategy - ) - - for _edge in level_n_edges: - _process_nodes.append(await get_cached_node_info(_edge[0])) - _process_nodes.append(await get_cached_node_info(_edge[1])) - _process_edges.append(_edge) - - # 去重 - _process_nodes = list({node['node_id']: node for node in _process_nodes}.values()) - _process_edges = list({(edge[0], edge[1]): edge for edge in _process_edges}.values()) - - processing_batches.append((_process_nodes, _process_edges)) - - logger.info("Processing batches: %d", len(processing_batches)) - - # isolate nodes - isolated_node_strategy = traverse_strategy.isolated_node_strategy - if isolated_node_strategy == "add": - processing_batches = await _add_isolated_nodes(nodes, processing_batches, graph_storage) - logger.info("Processing batches after adding isolated nodes: %d", len(processing_batches)) - - return processing_batches - -async def _add_isolated_nodes( - nodes: list, - processing_batches: list, - graph_storage: NetworkXStorage, -) -> list: - visited_nodes = set() - for _process_nodes, _process_edges in processing_batches: - for node in _process_nodes: - visited_nodes.add(node["node_id"]) - for node in nodes: - if node[0] not in visited_nodes: - _process_nodes = [await _get_node_info(node[0], graph_storage)] - processing_batches.append((_process_nodes, [])) - return processing_batches diff --git a/graphgen/operators/traverse_graph.py b/graphgen/operators/traverse_graph.py deleted file mode 100644 index 947033ed..00000000 --- a/graphgen/operators/traverse_graph.py +++ /dev/null @@ -1,485 +0,0 @@ -import asyncio -import gradio as gr - -from tqdm.asyncio import tqdm as tqdm_async - -from graphgen.models import OpenAIModel, NetworkXStorage, 
TraverseStrategy, Tokenizer, JsonKVStorage -from graphgen.templates import ANSWER_REPHRASING_PROMPT, QUESTION_GENERATION_PROMPT, MULTI_HOP_GENERATION_PROMPT -from graphgen.utils import detect_main_language, compute_content_hash, logger -from graphgen.operators.split_graph import get_batches_with_strategy - - -async def _pre_tokenize(graph_storage: NetworkXStorage, - tokenizer: Tokenizer, - edges: list, - nodes: list) -> tuple: - - sem = asyncio.Semaphore(1000) - async def handle_edge(edge: tuple) -> tuple: - async with sem: - if 'length' not in edge[2]: - edge[2]['length'] = len( - await asyncio.get_event_loop().run_in_executor(None, - tokenizer.encode_string, - edge[2]['description'])) - return edge - - async def handle_node(node: dict) -> dict: - async with sem: - if 'length' not in node[1]: - node[1]['length'] = len( - await asyncio.get_event_loop().run_in_executor(None, - tokenizer.encode_string, - node[1]['description'])) - return node - - new_edges = [] - new_nodes = [] - - for result in tqdm_async(asyncio.as_completed([handle_edge(edge) for edge in edges]), - total=len(edges), desc="Pre-tokenizing edges"): - new_edge = await result - await graph_storage.update_edge(new_edge[0], new_edge[1], new_edge[2]) - new_edges.append(new_edge) - - for result in tqdm_async(asyncio.as_completed([handle_node(node) for node in nodes]), - total=len(nodes), desc="Pre-tokenizing nodes"): - new_node = await result - await graph_storage.update_node(new_node[0], new_node[1]) - new_nodes.append(new_node) - - await graph_storage.index_done_callback() - return new_edges, new_nodes - -async def _construct_rephrasing_prompt(_process_nodes: list, - _process_edges: list, - text_chunks_storage: JsonKVStorage, - add_context: bool = False - ) -> str: - entities = [ - f"{_process_node['node_id']}: {_process_node['description']}" for _process_node in _process_nodes - ] - relations = [ - f"{_process_edge[0]} -- {_process_edge[1]}: {_process_edge[2]['description']}" - for _process_edge in _process_edges - ] - - entities_str = "\n".join([f"{index + 1}. {entity}" for index, entity in enumerate(entities)]) - relations_str = "\n".join([f"{index + 1}. {relation}" for index, relation in enumerate(relations)]) - language = "Chinese" if detect_main_language(entities_str + relations_str) == "zh" else "English" - - if add_context: - original_ids = ([node['source_id'].split('')[0] for node in _process_nodes] + - [edge[2]['source_id'].split('')[0] for edge in _process_edges]) - - original_ids = list(set(original_ids)) - original_text = await text_chunks_storage.get_by_ids(original_ids) - original_text = "\n".join([f"{index + 1}. 
{text['content']}" for index, text in enumerate(original_text)]) - - prompt = ANSWER_REPHRASING_PROMPT[language]['CONTEXT_TEMPLATE'].format( - language=language, - original_text=original_text, - entities=entities_str, - relationships=relations_str - ) - return prompt - - prompt = ANSWER_REPHRASING_PROMPT[language]['TEMPLATE'].format( - language=language, - entities=entities_str, - relationships=relations_str - ) - return prompt - -def get_loss_tercile(losses: list) -> (float, float): - losses = sorted(losses) - q1_index = int(len(losses) * (1 / 3)) - q2_index = int(len(losses) * (2 / 3)) - - return losses[q1_index], losses[q2_index] - -def get_average_loss(batch: tuple, loss_strategy: str) -> float: - if loss_strategy == "only_edge": - return sum(edge[2]['loss'] for edge in batch[1]) / len(batch[1]) - if loss_strategy == "both": - return sum(edge[2]['loss'] for edge in batch[1]) + sum(node['loss'] for node in batch[0]) / \ - (len(batch[0]) + len(batch[1])) - raise ValueError("Invalid loss strategy") - -def _post_process_synthetic_data(data): - block = data.split("\n\n") - qas = [] - for line in block: - if "Question:" in line and "Answer:" in line: - question = line.split("Question:")[1].split("Answer:")[0].strip() - answer = line.split("Answer:")[1].strip() - qas.append({ - "question": question, - "answer": answer - }) - elif "问题:" in line and "答案:" in line: - question = line.split("问题:")[1].split("答案:")[0].strip() - answer = line.split("答案:")[1].strip() - qas.append({ - "question": question, - "answer": answer - }) - elif "问题:" in line and "回答:" in line: - question = line.split("问题:")[1].split("回答:")[0].strip() - answer = line.split("回答:")[1].strip() - qas.append({ - "question": question, - "answer": answer - }) - return qas - -async def traverse_graph_by_edge( - llm_client: OpenAIModel, - tokenizer: Tokenizer, - graph_storage: NetworkXStorage, - traverse_strategy: TraverseStrategy, - text_chunks_storage: JsonKVStorage, - progress_bar: gr.Progress = None, - max_concurrent: int = 1000 -) -> dict: - """ - Traverse the graph - - :param llm_client - :param tokenizer - :param graph_storage - :param traverse_strategy - :param text_chunks_storage - :param progress_bar - :param max_concurrent - :return: question and answer - """ - - semaphore = asyncio.Semaphore(max_concurrent) - - async def _process_nodes_and_edges( - _process_nodes: list, - _process_edges: list, - ) -> str: - prompt = await _construct_rephrasing_prompt( - _process_nodes, - _process_edges, - text_chunks_storage, - add_context = False - ) - context = await llm_client.generate_answer(prompt) - - # post-process the context - if context.startswith("Rephrased Text:"): - context = context[len("Rephrased Text:"):].strip() - elif context.startswith("重述文本:"): - context = context[len("重述文本:"):].strip() - - return context - - async def _process_single_batch( - _process_batch: tuple, - question_type: str = "single" - ) -> dict: - async with semaphore: - context = await _process_nodes_and_edges( - _process_batch[0], - _process_batch[1], - ) - - language = "Chinese" if detect_main_language(context) == "zh" else "English" - pre_length = sum(node['length'] for node in _process_batch[0]) \ - + sum(edge[2]['length'] for edge in _process_batch[1]) - - if question_type == "single": - question = await llm_client.generate_answer( - QUESTION_GENERATION_PROMPT[language]['SINGLE_TEMPLATE'].format( - answer=context - ) - ) - if question.startswith("Question:"): - question = question[len("Question:"):].strip() - elif question.startswith("问题:"): - 
question = question[len("问题:"):].strip() - - logger.info("%d nodes and %d edges processed", len(_process_batch[0]), len(_process_batch[1])) - logger.info("Pre-length: %s", pre_length) - logger.info("Question: %s", question) - logger.info("Answer: %s", context) - - return { - compute_content_hash(context): { - "question": question, - "answer": context, - "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy) - } - } - - content = await llm_client.generate_answer( - QUESTION_GENERATION_PROMPT[language]['MULTI_TEMPLATE'].format( - doc=context - ) - ) - qas = _post_process_synthetic_data(content) - - if len(qas) == 0: - print(content) - logger.error("Error occurred while processing batch, question or answer is None") - return {} - - final_results = {} - logger.info("%d nodes and %d edges processed", len(_process_batch[0]), len(_process_batch[1])) - logger.info("Pre-length: %s", pre_length) - for qa in qas: - logger.info("Question: %s", qa['question']) - logger.info("Answer: %s", qa['answer']) - final_results[compute_content_hash(qa['question'])] = { - "question": qa['question'], - "answer": qa['answer'], - "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy) - } - return final_results - - results = {} - edges = list(await graph_storage.get_all_edges()) - nodes = list(await graph_storage.get_all_nodes()) - - edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes) - - processing_batches = await get_batches_with_strategy( - nodes, - edges, - graph_storage, - traverse_strategy - ) - - for result in tqdm_async(asyncio.as_completed( - [_process_single_batch(batch) for batch in processing_batches] - ), total=len(processing_batches), desc="[4/4]Generating QAs"): - try: - if progress_bar is not None: - progress_bar(len(results) / len(processing_batches), desc="[4/4]Generating QAs") - results.update(await result) - if progress_bar is not None and len(results) == len(processing_batches): - progress_bar(1, desc="[4/4]Generating QAs") - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while generating QA: %s", e) - - return results - - -async def traverse_graph_atomically( - llm_client: OpenAIModel, - tokenizer: Tokenizer, - graph_storage: NetworkXStorage, - traverse_strategy: TraverseStrategy, - text_chunks_storage: JsonKVStorage, - progress_bar: gr.Progress = None, - max_concurrent: int = 1000 -) -> dict: - """ - Traverse the graph atomicly - - :param llm_client - :param tokenizer - :param graph_storage - :param traverse_strategy - :param text_chunks_storage - :param progress_bar - :param max_concurrent - :return: question and answer - """ - assert traverse_strategy.qa_form == "atomic" - - semaphore = asyncio.Semaphore(max_concurrent) - async def _generate_question( - node_or_edge: tuple - ): - if len(node_or_edge) == 2: - des = node_or_edge[0] + ": " + node_or_edge[1]['description'] - loss = node_or_edge[1]['loss'] - else: - des = node_or_edge[2]['description'] - loss = node_or_edge[2]['loss'] - - async with semaphore: - try: - language = "Chinese" if detect_main_language(des) == "zh" else "English" - - qa = await llm_client.generate_answer( - QUESTION_GENERATION_PROMPT[language]['SINGLE_QA_TEMPLATE'].format( - doc=des - ) - ) - - if "Question:" in qa and "Answer:" in qa: - question = qa.split("Question:")[1].split("Answer:")[0].strip() - answer = qa.split("Answer:")[1].strip() - elif "问题:" in qa and "答案:" in qa: - question = qa.split("问题:")[1].split("答案:")[0].strip() - answer = qa.split("答案:")[1].strip() - 
else: - return {} - - question = question.strip("\"") - answer = answer.strip("\"") - - logger.info("Question: %s", question) - logger.info("Answer: %s", answer) - return { - compute_content_hash(question): { - "question": question, - "answer": answer, - "loss": loss - } - } - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while generating question: %s", e) - return {} - - results = {} - edges = list(await graph_storage.get_all_edges()) - nodes = list(await graph_storage.get_all_nodes()) - - edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes) - - tasks = [] - for node in nodes: - if "" in node[1]['description']: - description_list = node[1]['description'].split("") - for item in description_list: - tasks.append((node[0], {"description": item, 'loss': node[1]['loss']})) - else: - tasks.append((node[0], node[1])) - for edge in edges: - if "" in edge[2]['description']: - description_list = edge[2]['description'].split("") - for item in description_list: - tasks.append((edge[0], edge[1], {"description": item, 'loss': edge[2]['loss']})) - else: - tasks.append((edge[0], edge[1], edge[2])) - - for result in tqdm_async( - asyncio.as_completed([_generate_question(task) for task in tasks]), - total=len(tasks), - desc="[4/4]Generating QAs" - ): - try: - if progress_bar is not None: - progress_bar(len(results) / len(tasks), desc="[4/4]Generating QAs") - results.update(await result) - if progress_bar is not None and len(results) == len(tasks): - progress_bar(1, desc="[4/4]Generating QAs") - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while generating QA: %s", e) - return results - -async def traverse_graph_for_multi_hop( - llm_client: OpenAIModel, - tokenizer: Tokenizer, - graph_storage: NetworkXStorage, - traverse_strategy: TraverseStrategy, - text_chunks_storage: JsonKVStorage, - progress_bar: gr.Progress = None, - max_concurrent: int = 1000 -) -> dict: - """ - Traverse the graph for multi-hop - - :param llm_client - :param tokenizer - :param graph_storage - :param traverse_strategy - :param text_chunks_storage - :param progress_bar - :param max_concurrent - :return: question and answer - """ - assert traverse_strategy.qa_form == "multi_hop" - - semaphore = asyncio.Semaphore(max_concurrent) - - results = {} - edges = list(await graph_storage.get_all_edges()) - nodes = list(await graph_storage.get_all_nodes()) - - edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes) - - processing_batches = await get_batches_with_strategy( - nodes, - edges, - graph_storage, - traverse_strategy - ) - - async def _process_single_batch( - _process_batch: tuple - ) -> dict: - async with semaphore: - try: - language = "Chinese" if detect_main_language(_process_batch[0][0]['description']) == "zh" else "English" - - _process_nodes = _process_batch[0] - _process_edges = _process_batch[1] - - entities = [ - f"{_process_node['node_id']}: {_process_node['description']}" for _process_node in _process_nodes - ] - - relations = [ - f"{_process_edge[0]} -- {_process_edge[1]}: {_process_edge[2]['description']}" - for _process_edge in _process_edges - ] - - entities_str = "\n".join([f"{index + 1}. {entity}" for index, entity in enumerate(entities)]) - relations_str = "\n".join([f"{index + 1}. 
{relation}" for index, relation in enumerate(relations)]) - - prompt = MULTI_HOP_GENERATION_PROMPT[language].format( - entities=entities_str, - relationships=relations_str - ) - - context = await llm_client.generate_answer(prompt) - - # post-process the context - if "Question:" in context and "Answer:" in context: - question = context.split("Question:")[1].split("Answer:")[0].strip() - answer = context.split("Answer:")[1].strip() - elif "问题:" in context and "答案:" in context: - question = context.split("问题:")[1].split("答案:")[0].strip() - answer = context.split("答案:")[1].strip() - else: - return {} - - question = question.strip("\"") - answer = answer.strip("\"") - - logger.info("Question: %s", question) - logger.info("Answer: %s", answer) - - return { - compute_content_hash(question): { - "question": question, - "answer": answer, - "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy), - } - } - - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while processing batch: %s", e) - return {} - - async for result in tqdm_async( - asyncio.as_completed([_process_single_batch(batch) for batch in processing_batches]), - total=len(processing_batches), - desc="[4/4]Generating QAs" - ): - try: - if progress_bar is not None: - progress_bar(len(results) / len(processing_batches), desc="[4/4]Generating QAs") - results.update(await result) - if progress_bar is not None and len(results) == len(processing_batches): - progress_bar(1, desc="[4/4]Generating QAs") - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while generating QA: %s", e) - return results diff --git a/graphgen/run.py b/graphgen/run.py new file mode 100644 index 00000000..b56f79c6 --- /dev/null +++ b/graphgen/run.py @@ -0,0 +1,151 @@ +import argparse +import logging +import os +import sys +import time +from importlib import resources +from typing import Any, Dict + +import ray +import yaml +from dotenv import load_dotenv +from ray.data.block import Block +from ray.data.datasource.filename_provider import FilenameProvider + +from graphgen.engine import Engine +from graphgen.operators import operators +from graphgen.utils import CURRENT_LOGGER_VAR, logger, set_logger + +sys_path = os.path.abspath(os.path.dirname(__file__)) + +load_dotenv() + +# Suppress non-error output temporarily +# Save original streams for restoration +_original_stdout = sys.stdout +_original_stderr = sys.stderr +_devnull = None + + +def set_working_dir(folder): + os.makedirs(folder, exist_ok=True) + + +def save_config(config_path, global_config): + if not os.path.exists(os.path.dirname(config_path)): + os.makedirs(os.path.dirname(config_path)) + with open(config_path, "w", encoding="utf-8") as config_file: + yaml.dump( + global_config, config_file, default_flow_style=False, allow_unicode=True + ) + + +class NodeFilenameProvider(FilenameProvider): + def __init__(self, node_id: str): + self.node_id = node_id + + def get_filename_for_block( + self, block: Block, write_uuid: str, task_index: int, block_index: int + ) -> str: + # format: {node_id}_{write_uuid}_{task_index:06}_{block_index:06}.json + return f"{self.node_id}_{write_uuid}_{task_index:06d}_{block_index:06d}.jsonl" + + def get_filename_for_row( + self, + row: Dict[str, Any], + write_uuid: str, + task_index: int, + block_index: int, + row_index: int, + ) -> str: + raise NotImplementedError( + f"Row-based filenames are not supported by write_json. 
" + f"Node: {self.node_id}, write_uuid: {write_uuid}" + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config_file", + help="Config parameters for GraphGen.", + default=resources.files("graphgen") + .joinpath("configs") + .joinpath("aggregated_config.yaml"), + type=str, + ) + + args = parser.parse_args() + + with open(args.config_file, "r", encoding="utf-8") as f: + config = yaml.load(f, Loader=yaml.FullLoader) + + working_dir = config.get("global_params", {}).get("working_dir", "cache") + unique_id = int(time.time()) + output_path = os.path.join(working_dir, "output", f"{unique_id}") + set_working_dir(output_path) + log_path = os.path.join(working_dir, "logs", "Driver.log") + driver_logger = set_logger( + log_path, + name="GraphGen", + console_level=logging.ERROR, + if_stream=True, + ) + CURRENT_LOGGER_VAR.set(driver_logger) + logger.info( + "GraphGen with unique ID %s logging to %s", + unique_id, + log_path, + ) + + # Temporarily suppress non-error output (print statements, third-party libraries, Ray Data progress) + # Only redirect stdout to preserve stderr for logger error output + global _devnull + _devnull = open(os.devnull, 'w', encoding='utf-8') + sys.stdout = _devnull + + try: + engine = Engine(config, operators) + ds = ray.data.from_items([]) + results = engine.execute(ds) + + for node_id, dataset in results.items(): + node_output_path = os.path.join(output_path, f"{node_id}") + os.makedirs(node_output_path, exist_ok=True) + dataset.write_json( + node_output_path, + filename_provider=NodeFilenameProvider(node_id), + pandas_json_args_fn=lambda: { + "force_ascii": False, + "orient": "records", + "lines": True, + }, + ) + logger.info("Node %s results saved to %s", node_id, node_output_path) + + save_config(os.path.join(output_path, "config.yaml"), config) + logger.info("GraphGen completed successfully. 
Data saved to %s", output_path) + finally: + # Restore original stdout before printing results + sys.stdout = _original_stdout + if _devnull: + _devnull.close() + _devnull = None + + # Print save information to console + if 'results' in locals() and results: + print("\n" + "="*60) + print("GraphGen execution completed successfully!") + print("="*60) + for node_id, dataset in results.items(): + node_output_path = os.path.join(output_path, f"{node_id}") + print(f"✓ Node '{node_id}' results saved to: {node_output_path}") + print(f"✓ Config saved to: {os.path.join(output_path, 'config.yaml')}") + print(f"✓ Logs saved to: {log_path}") + print("="*60 + "\n") + else: + print("\n⚠️ Warning: No results were generated.\n") + + +if __name__ == "__main__": + main() diff --git a/graphgen/templates/__init__.py b/graphgen/templates/__init__.py index 6e362d08..99e297ee 100644 --- a/graphgen/templates/__init__.py +++ b/graphgen/templates/__init__.py @@ -1,9 +1,20 @@ -from .kg_extraction import KG_EXTRACTION_PROMPT -from .kg_summarization import KG_SUMMARIZATION_PROMPT -from .search_judgement import SEARCH_JUDGEMENT_PROMPT +from .coreference_resolution import COREFERENCE_RESOLUTION_PROMPT from .description_rephrasing import DESCRIPTION_REPHRASING_PROMPT -from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT -from .answer_rephrasing import ANSWER_REPHRASING_PROMPT +from .extraction import SCHEMA_GUIDED_EXTRACTION_PROMPT +from .generation import ( + AGGREGATED_GENERATION_PROMPT, + ATOMIC_GENERATION_PROMPT, + COT_GENERATION_PROMPT, + MULTI_HOP_GENERATION_PROMPT, + OMICS_QA_GENERATION_PROMPT, + VQA_GENERATION_PROMPT, +) +from .kg import ( + KG_EXTRACTION_PROMPT, + KG_SUMMARIZATION_PROMPT, + MMKG_EXTRACTION_PROMPT, + OMICS_KG_EXTRACTION_PROMPT, +) from .question_generation import QUESTION_GENERATION_PROMPT -from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT -from .coreference_resolution import COREFERENCE_RESOLUTION_TEMPLATE +from .search_judgement import SEARCH_JUDGEMENT_PROMPT +from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT diff --git a/graphgen/templates/coreference_resolution.py b/graphgen/templates/coreference_resolution.py index b29394ad..bc03e671 100644 --- a/graphgen/templates/coreference_resolution.py +++ b/graphgen/templates/coreference_resolution.py @@ -1,4 +1,3 @@ -# pylint: disable=C0301 TEMPLATE_ZH: str = """请根据参考文本识别并消解文本中的指代词,明确每个代词所指代的具体实体,并直接输出消解后的文本。 -示例- @@ -16,7 +15,8 @@ 输出: """ -TEMPLATE_EN: str = """Please identify and resolve the pronouns in the reference text, specify the specific entities referred to by each pronoun, and directly output the resolved text. +TEMPLATE_EN: str = """Please identify and resolve the pronouns in the reference text, \ +specify the specific entities referred to by each pronoun, and directly output the resolved text. -Example- Input: @@ -33,7 +33,4 @@ Output: """ -COREFERENCE_RESOLUTION_TEMPLATE = { - "en": TEMPLATE_EN, - "zh": TEMPLATE_ZH -} +COREFERENCE_RESOLUTION_PROMPT = {"en": TEMPLATE_EN, "zh": TEMPLATE_ZH} diff --git a/graphgen/templates/description_rephrasing.py b/graphgen/templates/description_rephrasing.py index a0e38012..5c4dc861 100644 --- a/graphgen/templates/description_rephrasing.py +++ b/graphgen/templates/description_rephrasing.py @@ -19,7 +19,7 @@ ################ -Real Data- ################ -Input: +Input: {input_sentence} ################ Please directly output the rewritten sentence without any additional information. 
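The prompt dictionaries in this template package are re-keyed from "English"/"Chinese" to the language codes "en"/"zh" (see the `COREFERENCE_RESOLUTION_PROMPT` change above and the `DESCRIPTION_REPHRASING_PROMPT` hunk below), and the rewritten `graphgen/templates/__init__.py` re-exports them from the new `generation`, `kg`, and `extraction` subpackages. A minimal caller sketch under those assumptions — `build_atomic_prompt` is a hypothetical helper, and `detect_main_language` returning "en"/"zh" is inferred from the removed traversal code rather than confirmed by this patch:

```python
# Illustrative sketch only -- not part of this patch.
from graphgen.templates import ATOMIC_GENERATION_PROMPT
from graphgen.utils import detect_main_language


def build_atomic_prompt(chunk_text: str) -> str:
    # Assumed to return the same "en"/"zh" codes the prompt dicts are now keyed by.
    lang = detect_main_language(chunk_text)
    # The language code indexes the dict directly, replacing the old
    # 'Chinese' if lang == 'zh' else 'English' mapping step.
    return ATOMIC_GENERATION_PROMPT[lang].format(context=chunk_text)
```

Keying the dictionaries by the detector's own codes drops the extra English/Chinese mapping step that the removed `traverse_graph.py` operators carried around.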
@@ -110,11 +110,11 @@ DESCRIPTION_REPHRASING_PROMPT= { - "English": { + "en": { "ANTI_TEMPLATE": ANTI_TEMPLATE_EN, "TEMPLATE": TEMPLATE_EN }, - "Chinese": { + "zh": { "ANTI_TEMPLATE": ANTI_TEMPLATE_ZH, "TEMPLATE": TEMPLATE_ZH } diff --git a/graphgen/templates/extraction/__init__.py b/graphgen/templates/extraction/__init__.py new file mode 100644 index 00000000..70f1369a --- /dev/null +++ b/graphgen/templates/extraction/__init__.py @@ -0,0 +1 @@ +from .schema_guided_extraction import SCHEMA_GUIDED_EXTRACTION_PROMPT diff --git a/graphgen/templates/extraction/schema_guided_extraction.py b/graphgen/templates/extraction/schema_guided_extraction.py new file mode 100644 index 00000000..e96c3886 --- /dev/null +++ b/graphgen/templates/extraction/schema_guided_extraction.py @@ -0,0 +1,70 @@ +TEMPLATE_EN = """You are an expert at extracting information from text based on a given schema. +Extract relevant information about {field} from a given contract document according to the provided schema. + +Instructions: +1. Carefully read the entire document provided at the end of this prompt. +2. Extract the relevant information. +3. Present your findings in JSON format as specified below. + +Important Notes: +- Extract only relevant information. +- Consider the context of the entire document when determining relevance. +- Do not be verbose, only respond with the correct format and information. +- Some docs may have multiple relevant excerpts -- include all that apply. +- Some questions may have no relevant excerpts -- just return "". +- Do not include additional JSON keys beyond the ones listed here. +- Do not include the same key multiple times in the JSON. +- Use English for your response. + +Expected JSON keys and explanation of what they are: +{schema_explanation} + +Expected format: +{{ + "key1": "value1", + "key2": "value2", + ... +}} + +{examples} + +Document to extract from: +{text} +""" + +TEMPLATE_ZH = """你是一个擅长根据给定的模式从文本中提取信息的专家。 +根据提供的模式,从合同文件中提取与{field}相关的信息。 +操作说明: +1. 仔细阅读本提示末尾提供的整份文件。 +2. 提取相关信息。 +3. 按照下面指定的JSON格式呈现你的发现。 + +重要注意事项: +- 仅提取相关信息。 +- 在确定相关性时,考虑整份文件的上下文。 +- 不要冗长,只需以正确的格式和信息进行回应。 +- 有些文件可能有多个相关摘录——请包含所有适用的内容。 +- 有些问题可能没有相关摘录——只需返回""。 +- 不要在JSON中包含除列出的键之外的其他键。 +- 不要多次包含同一个键。 +- 使用中文回答。 + +预期的JSON键及其说明: +{schema_explanation} + +预期格式: +{{ + "key1": "value1", + "key2": "value2", + ... +}} + +{examples} +要提取的文件: +{text} +""" + +SCHEMA_GUIDED_EXTRACTION_PROMPT = { + "en": TEMPLATE_EN, + "zh": TEMPLATE_ZH, +} diff --git a/graphgen/templates/extraction/schemas/legal_contract.json b/graphgen/templates/extraction/schemas/legal_contract.json new file mode 100644 index 00000000..00b8baca --- /dev/null +++ b/graphgen/templates/extraction/schemas/legal_contract.json @@ -0,0 +1,48 @@ +{ + "type": "object", + "name": "legal contract", + "description": "A legal contract for leasing property.", + "properties": { + "leased_space": { + "type": "string", + "description": "Description of the space that is being leased." + }, + "lessee": { + "type": "string", + "description": "The lessee's name (and possibly address)." + }, + "lessor": { + "type": "string", + "description": "The lessor's name (and possibly address)." + }, + "signing_date": { + "type": "string", + "description": "The date the contract was signed." + }, + "start_date": { + "type": "string", + "description": "The start date of the lease." + }, + "end_date": { + "type": "string", + "description": "The end date of the lease." + }, + "term_of_payment": { + "type": "string", + "description": "Description of the payment terms." 
+ }, + "designated_use": { + "type": "string", + "description": "Designated use of the property being leased." + }, + "extension_period": { + "type": "string", + "description": "Description of the extension options for the lease." + }, + "expiration_date_of_lease": { + "type": "string", + "description": "The expiration date of the lease." + } + }, + "required": ["lessee", "lessor", "start_date", "end_date"] +} diff --git a/graphgen/templates/generation/__init__.py b/graphgen/templates/generation/__init__.py new file mode 100644 index 00000000..22c3797a --- /dev/null +++ b/graphgen/templates/generation/__init__.py @@ -0,0 +1,6 @@ +from .aggregated_generation import AGGREGATED_GENERATION_PROMPT +from .atomic_generation import ATOMIC_GENERATION_PROMPT +from .cot_generation import COT_GENERATION_PROMPT +from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT +from .omics_qa_generation import OMICS_QA_GENERATION_PROMPT +from .vqa_generation import VQA_GENERATION_PROMPT diff --git a/graphgen/templates/answer_rephrasing.py b/graphgen/templates/generation/aggregated_generation.py similarity index 83% rename from graphgen/templates/answer_rephrasing.py rename to graphgen/templates/generation/aggregated_generation.py index a33e9d9e..305064e7 100644 --- a/graphgen/templates/answer_rephrasing.py +++ b/graphgen/templates/generation/aggregated_generation.py @@ -1,7 +1,7 @@ -TEMPLATE_CONTEXT_EN: str = """---Role--- - +# pylint: disable=C0301 +ANSWER_REPHRASING_CONTEXT_EN: str = """---Role--- You are an NLP expert responsible for generating a logically structured and coherent rephrased version of the TEXT based on ENTITIES and RELATIONSHIPS provided below. You may refer to the original text to assist in generating the rephrased version, but ensure that the final output text meets the requirements. -Use {language} as output language. +Use English as output language. ---Goal--- To generate a version of the text that is rephrased and conveys the same meaning as the original entity and relationship descriptions, while: @@ -50,13 +50,11 @@ """ -TEMPLATE_CONTEXT_ZH: str = """---角色--- - +ANSWER_REPHRASING_CONTEXT_ZH: str = """---角色--- 你是一位NLP专家,负责根据下面提供的实体和关系生成逻辑结构清晰且连贯的文本重述版本。你可以参考原始文本辅助生成,但需要确保最终输出的文本符合要求。 -使用{language}作为输出语言。 +使用中文作为输出语言。 ---目标--- - 生成文本的重述版本,使其传达与原始实体和关系描述相同的含义,同时: 1. 遵循清晰的逻辑流和结构 2. 建立适当的因果关系 @@ -100,10 +98,9 @@ """ -TEMPLATE_EN: str = """---Role--- - +ANSWER_REPHRASING_EN: str = """---Role--- You are an NLP expert responsible for generating a logically structured and coherent rephrased version of the TEXT based on ENTITIES and RELATIONSHIPS provided below. -Use {language} as output language. +Use English as output language. ---Goal--- To generate a version of the text that is rephrased and conveys the same meaning as the original entity and relationship descriptions, while: @@ -147,13 +144,11 @@ """ -TEMPLATE_ZH: str = """---角色--- - +ANSWER_REPHRASING_ZH: str = """---角色--- 你是一位NLP专家,负责根据下面提供的实体和关系生成逻辑结构清晰且连贯的文本重述版本。 -使用{language}作为输出语言。 +使用中文作为输出语言。 ---目标--- - 生成文本的重述版本,使其传达与原始实体和关系描述相同的含义,同时: 1. 遵循清晰的逻辑流和结构 2. 建立适当的因果关系 @@ -206,14 +201,33 @@ Rephrased Text: """ +QUESTION_GENERATION_EN: str = """The answer to a question is provided. Please generate a question that corresponds to the answer. 
+ +################ +Answer: +{answer} +################ +Question: +""" + +QUESTION_GENERATION_ZH: str = """下面提供了一个问题的答案,请生成一个与答案对应的问题。 + +################ +答案: +{answer} +################ +问题: +""" -ANSWER_REPHRASING_PROMPT= { - "English": { - "TEMPLATE": TEMPLATE_EN + REQUIREMENT_EN, - "CONTEXT_TEMPLATE": TEMPLATE_CONTEXT_EN + REQUIREMENT_EN +AGGREGATED_GENERATION_PROMPT = { + "en": { + "ANSWER_REPHRASING": ANSWER_REPHRASING_EN + REQUIREMENT_EN, + "ANSWER_REPHRASING_CONTEXT": ANSWER_REPHRASING_CONTEXT_EN + REQUIREMENT_EN, + "QUESTION_GENERATION": QUESTION_GENERATION_EN, + }, + "zh": { + "ANSWER_REPHRASING": ANSWER_REPHRASING_ZH + REQUIREMENT_ZH, + "ANSWER_REPHRASING_CONTEXT": ANSWER_REPHRASING_CONTEXT_ZH + REQUIREMENT_ZH, + "QUESTION_GENERATION": QUESTION_GENERATION_ZH, }, - "Chinese": { - "TEMPLATE": TEMPLATE_ZH + REQUIREMENT_ZH, - "CONTEXT_TEMPLATE": TEMPLATE_CONTEXT_ZH + REQUIREMENT_ZH - } } diff --git a/graphgen/templates/generation/atomic_generation.py b/graphgen/templates/generation/atomic_generation.py new file mode 100644 index 00000000..141c40e4 --- /dev/null +++ b/graphgen/templates/generation/atomic_generation.py @@ -0,0 +1,32 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are given a text passage. Your task is to generate a question and answer (QA) pair based on the content of that text. +The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text. +For example: +Question: What is the effect of overexpressing the BG1 gene on grain size and development? +Answer: Overexpression of the BG1 gene leads to significantly increased grain size, demonstrating its role in grain development. + +Question: What role does TAC4 play in the gravitropism of rice shoots? +Answer: TAC4 is a key regulator of gravitropism in rice shoots, promoting the bending of shoots towards the gravity vector. + +Here is the text passage you need to generate a QA pair for: +{context} +""" + +TEMPLATE_ZH: str = """给定一个文本段落。你的任务是根据该文本的内容生成一个问答(QA)对。 +答案应准确且直接从文本中得出。确保QA对与给定文本的主题或重要细节相关。 +例如: +问题:过表达BG1基因对谷粒大小和发育有什么影响? +答案:BG1基因的过表达显著增加了谷粒大小,表明其在谷物发育中的作用。 + +问题:TAC4在水稻茎的重力性状中扮演什么角色? +答案:TAC4是水稻茎重力性状的关键调节因子,促进茎向重力矢量弯曲。 + +以下是你需要为其生成QA对的文本段落: +{context} +""" + + +ATOMIC_GENERATION_PROMPT = { + "en": TEMPLATE_EN, + "zh": TEMPLATE_ZH, +} diff --git a/graphgen/templates/generation/cot_generation.py b/graphgen/templates/generation/cot_generation.py new file mode 100644 index 00000000..e179a704 --- /dev/null +++ b/graphgen/templates/generation/cot_generation.py @@ -0,0 +1,196 @@ +COT_GENERATION_ZH = """根据给定的知识图谱原始信息及已生成的推理路径,产出一条符合模板要求、可直接用于下游训练或推理的 CoT 数据。\ +CoT(Chain-of-Thought,思维链)指在回答复杂问题时,把中间推理步骤一步一步显式写出来,使推理过程透明、可追溯,而不是直接给出最终答案。 + +-输入格式- +[Entities:] +(实体名:实体描述) +... + +[Relationships:] +(来源实体)-[关系描述]->(目标实体) +... + +[Question and Reasoning Path:] +(问题) +(推理路径) + +-输出要求- +1. 每一步只完成一个不可分割的子任务,并用自然语言衔接,但是要避免生硬的连接词。 +2. 使用中文。 +3. 不要使用有序列表或编号。 +4. 请直接给出答案,不要生成无关信息。 + +-真实数据- +输入: +[Entities:]: +{entities} + +[Relationships:]: +{relationships} + +[Question:]: +{question} + +[Reasoning_Template:]: +{reasoning_template} + +输出: + +""" + +COT_GENERATION_EN = """Given the raw knowledge graph information and the provided reasoning-path, \ +produce one Chain-of-Thought (CoT) sample that strictly follows the template \ +and can be directly used for downstream training or inference. 
+CoT (Chain-of-Thought) means that when answering a complex question, the intermediate reasoning steps are \ +explicitly written out one by one, making the reasoning process transparent and traceable instead of giving \ +only the final answer. + +-Input Format- +[Entities:]: +(ENTITY_NAME: ENTITY_DESCRIPTION) +... + +[Relationships:]: +(ENTITY_SOURCE)-[RELATIONSHIP_DESCRIPTION]->(ENTITY_TARGET) +... + +[Question and Reasoning Path:]: +(QUESTION) +(REASONING_PATH) + +-Output Requirements- +1. Each step completes a single, indivisible sub-task and is naturally connected, avoiding abrupt transition words. +2. Use English. +3. Do not use ordered lists or numbering. +4. Do not generate extraneous information, just provide the answer. + +-Real Data- +Input: +[Entities:]: +{entities} + +[Relationships:]: +{relationships} + +[Question:]: +{question} + +[Reasoning_Template:]: +{reasoning_template} + +Output: +""" + +COT_TEMPLATE_DESIGN_ZH = """你是一位“元推理架构师”。你的任务不是回答问题,\ +而是根据给定的知识图谱中的实体和关系的名称以及描述信息,设计一条可复用、可泛化的 CoT 推理路径模板。\ + +-步骤- +1. 实体识别 +- 准确地识别[Entities:]章节中的实体信息,包括实体名、实体描述信息。 +- 实体信息的一般格式为: +(实体名:实体描述) + +2. 关系识别 +- 准确地识别[Relationships:]章节中的关系信息,包括来源实体名、目标实体名、关系描述信息。 +- 关系信息的一般格式为: +(来源实体名)-[关系描述]->(目标实体名) + +3. 图结构理解 +- 正确地将关系信息中的来源实体名与实体信息关联。 +- 根据提供的关系信息还原出图结构。 + +4. 问题设计 +- 围绕知识图谱所表达的“核心主题”设计一个问题。 +- 问题必须能在图谱内部通过实体、关系或属性直接验证;避免主观判断。 +- 问题应该能够模型足够的思考,充分利用图谱中的实体和关系,避免过于简单或无关的问题。 + +5. 推理路径生成 +- 根据问题设计一个**可被后续模型直接执行的推理蓝图**。 +- 保持步骤最小化:每一步只解决一个“不可分割”的子问题。 + +-约束条件- +1. 不要在回答中描述你的思考过程,直接给出回复,只给出问题和推理路径设计,不要生成无关信息。 +2. 如果提供的描述信息相互矛盾,请解决矛盾并提供一个单一、连贯的逻辑。 +3. 避免使用停用词和过于常见的词汇。 +4. 不要出现具体数值或结论,不要出现“识别实体”、“识别关系”这类无意义的操作描述。 +5. 使用中文作为输出语言。 +6. 输出格式为: +问题: +推理路径设计: + +-真实数据- +输入: +[Entities:]: +{entities} + +[Relationships:]: +{relationships} + +输出: +""" + + +COT_TEMPLATE_DESIGN_EN = """You are a “meta-reasoning architect”. \ +Your task is NOT to answer the question, but to design a reusable, generalizable CoT reasoning-path \ +template based solely on the names and descriptions of entities and \ +relationships in the provided knowledge graph. + +- Steps - +1. Entity Recognition +- Accurately recognize entity information in the [Entities:] section, including entity names and descriptions. +- The general formats for entity information are: +(ENTITY_NAME: ENTITY_DESCRIPTION) + +2. Relationship Recognition +- Accurately recognize relationship information in the [Relationships:] section, including source_entity_name, target_entity_name, and relationship descriptions. +- The general formats for relationship information are: +(SOURCE_ENTITY_NAME)-[RELATIONSHIP_DESCRIPTION]->(TARGET_ENTITY_NAME) + +3. Graph Structure Understanding +- Correctly associate the source entity name in the relationship information with the entity information. +- Reconstruct the graph structure based on the provided relationship information. + +4. Question Design +- Design a question around the "core theme" expressed by the knowledge graph. +- The question must be verifiable directly within the graph through entities, relationships, or attributes; avoid subjective judgments. +- The question should allow the model to think sufficiently, fully utilizing the entities and relationships in the graph, avoiding overly simple or irrelevant questions. + +5. Reasoning-Path Design +- Output a **blueprint that any later model can directly execute**. +- Keep steps minimal: each step solves one indivisible sub-problem. + + +- Constraints - +1. Do NOT describe your thinking; output only the reasoning-path design. +2. 
If the provided descriptions are contradictory, resolve conflicts and provide a single coherent logic. +3. Avoid using stop words and overly common words. +4. Do not include specific numerical values or conclusions, \ +and DO NOT describing meaningless operations like "Identify the entity" or "Identify the relationship". +5. Use English as the output language. +6. The output format is: +Question: +Reasoning-Path Design: + +Please summarize the information expressed by the knowledge graph based on the following [Entities:] and [Relationships:] provided. + +- Real Data - +Input: +[Entities:]: +{entities} + +[Relationships:]: +{relationships} + +Output: +""" + +COT_GENERATION_PROMPT = { + "en": { + "COT_GENERATION": COT_GENERATION_EN, + "COT_TEMPLATE_DESIGN": COT_TEMPLATE_DESIGN_EN, + }, + "zh": { + "COT_GENERATION": COT_GENERATION_ZH, + "COT_TEMPLATE_DESIGN": COT_TEMPLATE_DESIGN_ZH, + }, +} diff --git a/graphgen/templates/multi_hop_generation.py b/graphgen/templates/generation/multi_hop_generation.py similarity index 95% rename from graphgen/templates/multi_hop_generation.py rename to graphgen/templates/generation/multi_hop_generation.py index dad2ee36..73857ebb 100644 --- a/graphgen/templates/multi_hop_generation.py +++ b/graphgen/templates/generation/multi_hop_generation.py @@ -1,5 +1,4 @@ # pylint: disable=C0301 - TEMPLATE_ZH: str = """请基于以下知识子图生成多跳推理问题和答案。你将获得一个知识子图,其中包含一系列实体、关系和事实。你的任务是提出一个问题,该问题需要经过多次推理才能回答。问题的答案应该是从给定的知识子图中推断出来的。确保问题的难度适中,需要多步推理才能回答。 例如: @@ -54,7 +53,4 @@ Output the generated question and answer directly, please do not copy the example question and answer directly, and do not provide irrelevant information. """ -MULTI_HOP_GENERATION_PROMPT = { - "English": TEMPLATE_EN, - "Chinese": TEMPLATE_ZH -} +MULTI_HOP_GENERATION_PROMPT = {"en": TEMPLATE_EN, "zh": TEMPLATE_ZH} diff --git a/graphgen/templates/generation/omics_qa_generation.py b/graphgen/templates/generation/omics_qa_generation.py new file mode 100644 index 00000000..1d356e62 --- /dev/null +++ b/graphgen/templates/generation/omics_qa_generation.py @@ -0,0 +1,99 @@ +# pylint: disable=C0301 +OMICS_QA_TEMPLATE_EN: str = """You are a senior computational biologist specializing in multi-omics data analysis (genomics, transcriptomics, proteomics). Your task is to generate logically coherent, verifiable and non-hallucinated question-answer pairs for the given biological sample described by the provided ENTITIES and RELATIONSHIPS. +Use English as the output language. + +---Objectives--- +Create multiple sets of omics-centric QA pairs that satisfy the following: +1. Only ask about objectively existing facts in the provided data (e.g., gene names, sequence information, functional annotations, regulatory elements, structural features, experimental metadata, etc.). Avoid subjective or speculative questions. +2. Ensure that each question has a single, clear and verifiable answer that can be directly confirmed from the given entities/relationships. +3. Questions should cover diverse aspects: sequence, structure, function, interactions, regulation, experimental annotations, etc. +4. Avoid repetitive questions; each question must be unique and meaningful. +5. Use concise, unambiguous language; do not invent information beyond the provided data. + +---Instructions--- +1. Carefully analyse the supplied ENTITIES and RELATIONSHIPS to identify: + - Biological entities (genes, proteins, RNA molecules, regulatory elements, pathways, etc.) 
+ - Sequence information (DNA sequences, RNA sequences, protein sequences) + - Functional annotations (gene function, protein function, RNA function, biological processes) + - Structural features (chromosomal location, genomic coordinates, domain structures, etc.) + - Regulatory relationships (transcription, translation, regulation, interaction) + - Experimental metadata (database IDs, organism, experimental methods, etc.) +2. Organise information logically: + - Start with sequence/primary structure information + - Proceed to functional annotations and biological roles + - Include regulatory relationships and interactions + - End with experimental context and metadata +3. Maintain scientific accuracy and consistent nomenclature (standard gene names, sequence identifiers, etc.). +4. Review each QA pair to guarantee logical consistency and absence of hallucination. + +################ +-ENTITIES- +################ +{entities} + +################ +-RELATIONSHIPS- +################ +{relationships} +################ +Directly output the generated QA pairs below. Do NOT copy any example questions, and do NOT include extraneous text. + +Question: +Answer: + +Question: +Answer: + +""" + +OMICS_QA_TEMPLATE_ZH: str = """你是一位资深的多组学数据计算生物学家(基因组学、转录组学、蛋白质组学)。你的任务是根据下述提供的实体与关系,为给定的生物样本生成逻辑连贯、可验证、无幻觉的中英双语问答对(这里仅输出中文)。 +使用中文作为输出语言。 + +---目标--- +创建多组以组学数据为中心的问答对,满足: +1. 仅询问数据中客观存在的事实(如基因名称、序列信息、功能注释、调控元件、结构特征、实验元数据等),避免主观或推测性问题。 +2. 每个问题必须有单一、明确且可直接验证的答案,答案必须能从给定实体/关系中直接确认。 +3. 问题需覆盖:序列、结构、功能、相互作用、调控、实验注释等多个维度,确保多样性与全面性。 +4. 避免重复提问,每个问题都独特且有意义。 +5. 语言简洁、无歧义,严禁编造超出给定数据的信息。 + +---说明--- +1. 仔细分析提供的实体与关系,识别: + - 生物实体(基因、蛋白质、RNA分子、调控元件、通路等) + - 序列信息(DNA序列、RNA序列、蛋白质序列) + - 功能注释(基因功能、蛋白质功能、RNA功能、生物学过程) + - 结构特征(染色体位置、基因组坐标、结构域等) + - 调控关系(转录、翻译、调控、相互作用) + - 实验元数据(数据库ID、生物体、实验方法等) +2. 按逻辑顺序组织信息: + - 从序列/一级结构信息入手 + - 再到功能注释和生物学作用 + - 包括调控关系和相互作用 + - 最后到实验背景和元数据 +3. 保持科学准确性,使用统一命名规范(标准基因名、序列标识符等)。 +4. 检查每对问答,确保逻辑一致且无幻觉。 + +################ +-实体- +################ +{entities} + +################ +-关系- +################ +{relationships} +################ +请直接在下方输出生成的问答对,不要复制任何示例,不要输出无关内容。 + +问题: <问题1> +答案: <答案1> + +问题: <问题2> +答案: <答案2> + +""" + +OMICS_QA_GENERATION_PROMPT = { + "en": OMICS_QA_TEMPLATE_EN, + "zh": OMICS_QA_TEMPLATE_ZH, +} diff --git a/graphgen/templates/generation/vqa_generation.py b/graphgen/templates/generation/vqa_generation.py new file mode 100644 index 00000000..4826be0e --- /dev/null +++ b/graphgen/templates/generation/vqa_generation.py @@ -0,0 +1,104 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are a senior VQA data engineer. Your task is to generate logically coherent, verifiable and non-hallucinated question-answer pairs for the given multi-modal samples. +Use English as the output language. + +---Objectives--- +Create multiple sets of VQA question-answer pairs that satisfy the following: +1. Only ask about objectively existing facts in the given data, avoiding subjective or ambiguous questions. +2. Ensure that each question has a clear and verifiable answer, avoiding questions with no answer or uncertainty. +3. Questions should cover various aspects of both image and text content, ensuring diversity and comprehensiveness. +4. Avoid repetitive questions, ensuring that each question is unique and meaningful. +5. Use clear and concise language, avoiding complex or ambiguous wording. + +---Instructions--- +1. 
Carefully analyze the provided entities and relationships to identify: + - Key concepts and their hierarchical relationships + - Temporal sequences and time order + - Cause-and-effect relationships + - Dependencies between different elements +2. Organize the information into a logical sequence by: + - Starting with foundational concepts + - Gradually building up to more complex relationships + - Grouping related ideas together + - Creating clear transitions between sections +3. Maintain the following when generating question-answer pairs: + - Logical flow + - Clear connections between concepts + - Appropriate context and background + - Coherent narrative structure +4. Review and refine the question-answer pairs to ensure: + - Overall logical consistency + - Clear cause-and-effect relationships + +################ +-Entities- +################ +{entities} +################ +-Relationships- +################ +{relationships} +################ +Directly output the generated questions and answers, please do not directly copy the example questions and answers, and do not provide irrelevant information. +Here is the response format you should follow: +Question: +Answer: + +Question: +Answer: + +""" + +TEMPLATE_ZH: str = """---角色--- +你是一位资深 VQA 数据工程师。你需要为给定的多模态样本生成逻辑连贯、可验证、无幻觉的问答对。 +使用中文作为输出语言。 + +---目标--- +创建多组 VQA 问答对,满足: +1. 仅询问给定数据中客观存在的事实,避免主观或模糊的问题。 +2. 确保每个问题都有明确且可验证的答案,避免无答案或不确定的问题。 +3. 问题应涵盖图像和文本内容的各个方面,确保多样性和全面性。 +4. 避免重复问题,确保每个问题都是独特且有意义的。 +5. 使用清晰简洁的语言,避免复杂或含糊的措辞。 + +---说明--- +1. 仔细分析提供的实体和关系,以识别: + - 关键概念及其层级关系 + - 时间序列和时间顺序 + - 因果关系 + - 不同元素之间的依赖关系 +2. 通过以下方式将信息组织成逻辑顺序: + - 从基础概念开始 + - 逐步建立更复杂的关系 + - 将相关的想法分组在一起 + - 在各部分之间创建清晰的过渡 +3. 生成问答对时保持: + - 逻辑流畅 + - 概念之间的清晰联系 + - 适当的上下文和背景 + - 连贯的叙述结构 +4. 检查和完善问答对以确保: + - 整体逻辑一致性 + - 清晰的因果关系 + +################ +-实体- +################ +{entities} + +################ +-关系- +################ +{relationships} +################ +直接输出生成的问题和答案,请不要直接复制示例问题和答案,不要输出无关内容。 +以下是你应该遵循的响应格式: +问题: <问题1> +答案: <答案1> + +问题: <问题2> +答案: <答案2> + +""" + +VQA_GENERATION_PROMPT = {"en": TEMPLATE_EN, "zh": TEMPLATE_ZH} diff --git a/graphgen/templates/kg/__init__.py b/graphgen/templates/kg/__init__.py new file mode 100644 index 00000000..e39c1408 --- /dev/null +++ b/graphgen/templates/kg/__init__.py @@ -0,0 +1,5 @@ +from .kg_extraction import KG_EXTRACTION_PROMPT +from .kg_summarization import KG_SUMMARIZATION_PROMPT +from .mm_kg_extraction import MMKG_EXTRACTION_PROMPT +from .omics_kg_extraction import OMICS_KG_EXTRACTION_PROMPT +from .protein_kg_extraction import PROTEIN_KG_EXTRACTION_PROMPT diff --git a/graphgen/templates/kg_extraction.py b/graphgen/templates/kg/kg_extraction.py similarity index 98% rename from graphgen/templates/kg_extraction.py rename to graphgen/templates/kg/kg_extraction.py index 8d98bb95..930cbb99 100644 --- a/graphgen/templates/kg_extraction.py +++ b/graphgen/templates/kg/kg_extraction.py @@ -1,10 +1,9 @@ # pylint: disable=C0301 - TEMPLATE_EN: str = """You are an NLP expert, skilled at analyzing text to extract named entities and their relationships. -Goal- Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities. -Use {language} as output language. +Use English as output language. -Steps- 1. Identify all entities. For each identified entity, extract the following information: @@ -23,7 +22,7 @@ 3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. 
These should capture the overarching ideas present in the document. Format the content-level key words as ("content_keywords"{tuple_delimiter}) -4. Return output in {language} as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. +4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. 5. When finished, output {completion_delimiter} @@ -85,7 +84,7 @@ -目标- 给定一个实体类型列表和可能与列表相关的文本,从文本中识别所有这些类型的实体,以及这些实体之间所有的关系。 -使用{language}作为输出语言。 +使用中文作为输出语言。 -步骤- 1. 识别所有实体。对于每个识别的实体,提取以下信息: @@ -93,7 +92,7 @@ - entity_type:以下类型之一:[{entity_types}] - entity_summary:实体的属性与活动的全面总结 将每个实体格式化为("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) - + 2. 从步骤1中识别的实体中,识别所有(源实体,目标实体)对,这些实体彼此之间*明显相关*。 对于每对相关的实体,提取以下信息: - source_entity:步骤1中识别的源实体名称 @@ -189,12 +188,12 @@ IF_LOOP_ZH: str = """看起来可能仍然遗漏了一些实体和关系。如果仍有实体和关系需要添加,请回答YES | NO。""" KG_EXTRACTION_PROMPT: dict = { - "English": { + "en": { "TEMPLATE": TEMPLATE_EN, "CONTINUE": CONTINUE_EN, "IF_LOOP": IF_LOOP_EN, }, - "Chinese": { + "zh": { "TEMPLATE": TEMPLATE_ZH, "CONTINUE": CONTINUE_ZH, "IF_LOOP": IF_LOOP_ZH, @@ -205,6 +204,5 @@ "completion_delimiter": "<|COMPLETE|>", "entity_types": "concept, date, location, keyword, organization, person, event, work, nature, artificial, \ science, technology, mission, gene", - "language": "English", }, } diff --git a/graphgen/templates/kg_summarization.py b/graphgen/templates/kg/kg_summarization.py similarity index 86% rename from graphgen/templates/kg_summarization.py rename to graphgen/templates/kg/kg_summarization.py index 7cf75180..3e7cb6cb 100644 --- a/graphgen/templates/kg_summarization.py +++ b/graphgen/templates/kg/kg_summarization.py @@ -3,7 +3,7 @@ Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. Make sure it is written in third person, and include the entity names so we the have full context. -Use {language} as output language. +Use English as output language. ####### -Data- @@ -18,7 +18,7 @@ 请将所有这些描述整合成一个综合描述。确保包含所有描述中收集的信息。 如果提供的描述是矛盾的,请解决这些矛盾并提供一个连贯的总结。 确保以第三人称写作,并包含实体名称,以便我们有完整的上下文。 -使用{language}作为输出语言。 +使用中文作为输出语言。 ####### -数据- @@ -30,14 +30,9 @@ KG_SUMMARIZATION_PROMPT = { - "Chinese": { - "TEMPLATE": TEMPLATE_ZH - }, - "English": { - "TEMPLATE": TEMPLATE_EN - }, + "zh": {"TEMPLATE": TEMPLATE_ZH}, + "en": {"TEMPLATE": TEMPLATE_EN}, "FORMAT": { - "language": "English", "tuple_delimiter": "<|>", "record_delimiter": "##", "completion_delimiter": "<|COMPLETE|>", diff --git a/graphgen/templates/kg/mm_kg_extraction.py b/graphgen/templates/kg/mm_kg_extraction.py new file mode 100644 index 00000000..28327175 --- /dev/null +++ b/graphgen/templates/kg/mm_kg_extraction.py @@ -0,0 +1,131 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are an expert in multi-modal data analysis and knowledge graph construction. Your task is to extract named entities and relationships from a given multi-modal data chunk and its accompanying text. + +-Objective- +Given a multi-modal data chunk (e.g., image, table, formula, etc. + accompanying text), construct a knowledge graph centered around the "central multi-modal entity": +- The central entity must be the image/table/formula itself (e.g., image-c71ef797e99af81047fbc7509609c765). 
+- Related entities and relationships must be extracted from the accompanying text. +- Only retain edges directly connected to the central entity, forming a star-shaped graph. +Use English as the output language. + +-Steps- +1. Identify the unique central multi-modal entity and recognize all text entities directly related to the central entity from the accompanying text. + For the central entity, extract the following information: + - entity_name: Use the unique identifier of the data chunk (e.g., image-c71ef797e99af81047fbc7509609c765). + - entity_type: Label according to the type of data chunk (image, table, formula, etc.). + - entity_summary: A brief description of the content of the data chunk and its role in the accompanying text. + For each entity recognized from the accompanying text, extract the following information: + - entity_name: The name of the entity, capitalized + - entity_type: One of the following types: [{entity_types}] + - entity_summary: A comprehensive summary of the entity's attributes and activities + Format each entity as ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. From the entities identified in Step 1, recognize all (source_entity, target_entity) pairs that are *obviously related* to each other. + For each pair of related entities, extract the following information: + - source_entity: The name of the source entity identified in Step 1 + - target_entity: The name of the target entity identified in Step 1 + - relationship_summary: Explain why you think the source entity and target entity are related to each other + Format each relationship as ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. Return the output list of all entities and relationships identified in Steps 1 and 2 in English. Use **{record_delimiter}** as the list separator. + +4. Upon completion, output {completion_delimiter} + +################ +-Example- +################ +Multi-modal data chunk type: image +Multi-modal data chunk unique identifier: image-c71ef797e99af81047fbc7509609c765 +Accompanying text: The Eiffel Tower is an iconic structure in Paris, France, designed by Gustave Eiffel and completed in 1889. It stands 324 meters tall and is one of the tallest structures in the world. The Eiffel Tower is located on the banks of the Seine River and attracts millions of visitors each year. It is not only an engineering marvel but also an important symbol of French culture. 
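A note on the star-graph constraint above: because only edges touching the central multi-modal entity are kept, downstream code may want a defensive filter before writing edges into the graph. A minimal sketch, assuming relationships arrive as (source, target, summary) triples; the helper name is illustrative and not part of this PR:

```python
def keep_star_edges(
    central_id: str, relationships: list[tuple[str, str, str]]
) -> list[tuple[str, str, str]]:
    """Keep only relationships that touch the central multi-modal entity.

    Illustrative sketch, e.g. central_id = "image-c71ef797e99af81047fbc7509609c765".
    """
    return [rel for rel in relationships if central_id in (rel[0], rel[1])]
```

In the expected output below, every relationship already uses the image identifier as its source, so such a filter would pass them all through unchanged.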
+################ +Output: +("entity"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"image"{tuple_delimiter}"This is an image showcasing the iconic structure in Paris, France, the Eiffel Tower, highlighting its full height of 324 meters along with the riverside scenery, symbolizing both engineering and cultural significance"){record_delimiter} +("entity"{tuple_delimiter}"Eiffel Tower"{tuple_delimiter}"landmark"{tuple_delimiter}"The Eiffel Tower is an iconic structure in Paris, France, designed by Gustave Eiffel and completed in 1889, standing 324 meters tall, located on the banks of the Seine River, attracting millions of visitors each year"){record_delimiter} +("entity"{tuple_delimiter}"Paris, France"{tuple_delimiter}"location"{tuple_delimiter}"Paris, France is the capital of France, known for its rich historical and cultural heritage and as the location of the Eiffel Tower"){record_delimiter} +("entity"{tuple_delimiter}"Gustave Eiffel"{tuple_delimiter}"person"{tuple_delimiter}"Gustave Eiffel is a renowned French engineer who designed and built the Eiffel Tower"){record_delimiter} +("entity"{tuple_delimiter}"Seine River"{tuple_delimiter}"location"{tuple_delimiter}"The Seine River is a major river flowing through Paris, France, with the Eiffel Tower located on its banks"){completion_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Eiffel Tower"{tuple_delimiter}"The image showcases the iconic structure, the Eiffel Tower"){record_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Paris, France"{tuple_delimiter}"The image's background is Paris, France, highlighting the geographical location of the Eiffel Tower"){record_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Gustave Eiffel"{tuple_delimiter}"The Eiffel Tower in the image was designed by Gustave Eiffel"){record_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Seine River"{tuple_delimiter}"The image showcases the scenery of the Eiffel Tower located on the banks of the Seine River"){completion_delimiter} +################ + +-Real Data- +Multi-modal data chunk type: {chunk_type} +Multi-modal data chunk unique identifier: {chunk_id} +Accompanying text: {chunk_text} +################ +Output: +""" + +TEMPLATE_ZH: str = """你是一个多模态数据分析和知识图谱构建专家。你的任务是从给定的多模态数据块及其伴随文本中抽取命名实体与关系。 + +-目标- +给定一个多模态数据块(例如图像、表格、公式等 + 伴随文本),构建以「中心多模态实体」为核心的知识图: +- 中心实体必须是图像/表格/公式本身(如 image-c71ef797e99af81047fbc7509609c765)。 +- 相关实体和关系必须从伴随文本中抽取。 +- 只保留与中心实体直接相连的边,形成星型图。 +使用中文作为输出语言。 + +-步骤- +1. 确定唯一的中心多模态实体,从伴随文本中识别所有与中心实体直接相关的文本实体。 + 对于中心实体,提取以下信息: + - entity_name:使用数据块的唯一标识符(如 image-c71ef797e99af81047fbc7509609c765)。 + - entity_type:根据数据块类型(图像、表格、公式等)进行标注。 + - entity_summary:简要描述数据块的内容和其在伴随文本中的作用。 + 对于从伴随文本中识别的每个实体,提取以下信息: + - entity_name:实体的名称,首字母大写 + - entity_type:以下类型之一:[{entity_types}] + - entity_summary:实体的属性与活动的全面总结 + 将每个实体格式化为("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. 从步骤1中识别的实体中,识别所有(源实体,目标实体)对,这些实体彼此之间*明显相关*。 + 对于每对相关的实体,提取以下信息: + - source_entity:步骤1中识别的源实体名称 + - target_entity:步骤1中识别的目标实体名称 + - relationship_summary:解释为什么你认为源实体和目标实体彼此相关 + 将每个关系格式化为("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. 以中文返回步骤1和2中识别出的所有实体和关系的输出列表。使用**{record_delimiter}**作为列表分隔符。 + +4. 
完成后,输出{completion_delimiter} + +################ +-示例- +################ +多模态数据块类型:image +多模态数据块唯一标识符:image-c71ef797e99af81047fbc7509609c765 +伴随文本:埃菲尔铁塔是法国巴黎的标志性结构,由古斯塔夫·埃菲尔设计并于1889年建成。它高324米,是世界上最高的建筑之一。埃菲尔铁塔位于塞纳河畔,吸引了数百万游客前来参观。它不仅是工程学的奇迹,也是法国文化的重要象征。 +################ +输出: +("entity"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"image"{tuple_delimiter}"这是一张展示法国巴黎标志性建筑的图像,主体为埃菲尔铁塔,呈现其324米高度的全貌与河畔景观,具有工程与文化双重象征意义"){record_delimiter} +("entity"{tuple_delimiter}"埃菲尔铁塔"{tuple_delimiter}"landmark"{tuple_delimiter}"埃菲尔铁塔是法国巴黎的标志性结构,由古斯塔夫·埃菲尔设计并于1889年建成,高324米,是世界上最高的建筑之一,位于塞纳河畔,吸引了数百万游客前来参观"){record_delimiter} +("entity"{tuple_delimiter}"法国巴黎"{tuple_delimiter}"location"{tuple_delimiter}"法国巴黎是法国的首都,以其丰富的历史文化遗产和作为埃菲尔铁塔所在地而闻名"){record_delimiter} +("entity"{tuple_delimiter}"古斯塔夫·埃菲尔"{tuple_delimiter}"person"{tuple_delimiter}"古斯塔夫·埃菲尔是法国著名的工程师,设计并建造了埃菲尔铁塔"){record_delimiter} +("entity"{tuple_delimiter}"塞纳河"{tuple_delimiter}"location"{tuple_delimiter}"塞纳河是流经法国巴黎的重要河流,埃菲尔铁塔位于其畔"){completion_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"埃菲尔铁塔"{tuple_delimiter}"图像展示了埃菲尔铁塔这一标志性建筑"){record_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"法国巴黎"{tuple_delimiter}"图像背景为法国巴黎,突显了埃菲尔铁塔的地理位置"){record_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"古斯塔夫·埃菲尔"{tuple_delimiter}"图像中的埃菲尔铁塔是由古斯塔夫·埃菲尔设计的"){record_delimiter} +("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"塞纳河"{tuple_delimiter}"图像展示了埃菲尔铁塔位于塞纳河畔的景观"){completion_delimiter} +################ + +-真实数据- +多模态数据块类型: {chunk_type} +多模态数据块唯一标识符: {chunk_id} +伴随文本: {chunk_text} +################ +输出: +""" + + +MMKG_EXTRACTION_PROMPT: dict = { + "en": TEMPLATE_EN, + "zh": TEMPLATE_ZH, + "FORMAT": { + "tuple_delimiter": "<|>", + "record_delimiter": "##", + "completion_delimiter": "<|COMPLETE|>", + "entity_types": "concept, date, location, keyword, organization, person, event, work, nature, artificial, \ +science, technology, mission, gene", + }, +} diff --git a/graphgen/templates/kg/omics_kg_extraction.py b/graphgen/templates/kg/omics_kg_extraction.py new file mode 100644 index 00000000..d105dd38 --- /dev/null +++ b/graphgen/templates/kg/omics_kg_extraction.py @@ -0,0 +1,209 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are a bioinformatics expert, skilled at analyzing biological sequences (DNA, RNA, protein) and their metadata to extract biological entities and their relationships. + +-Goal- +Given a biological sequence chunk (DNA, RNA, or protein) along with its metadata, identify all relevant biological entities and their relationships. +Use English as output language. + +-Steps- +1. Identify all biological entities. For each identified entity, extract the following information: +- entity_name: Name of the entity (gene name, protein name, RNA name, domain name, etc.), capitalized +- entity_type: One of the following types: [{entity_types}] +- entity_summary: Comprehensive summary of the entity's biological function, structure, or properties +Format each entity as ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *biologically related* to each other. 
+For each pair of related entities, extract the following information: +- source_entity: name of the source entity, as identified in step 1 +- target_entity: name of the target entity, as identified in step 1 +- relationship_summary: explanation of the biological relationship (e.g., encodes, transcribes, translates, interacts, regulates, homologous_to, located_in, etc.) +Format each relationship as ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. Identify high-level key words that summarize the main biological concepts, functions, or themes. +Format the content-level key words as ("content_keywords"{tuple_delimiter}) + +4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. + +5. When finished, output {completion_delimiter} + +################ +-Examples- +################ +-Example 1- +Sequence Metadata: +################ +molecule_type: DNA +database: NCBI +id: NG_033923 +gene_name: BRCA1 +gene_description: BRCA1 DNA repair associated +organism: Homo sapiens +gene_type: protein-coding +chromosome: 17 +genomic_location: 43044295-43125483 +function: BRCA1 is a tumor suppressor gene involved in DNA repair +sequence_chunk: ATGCGATCGATCGATCG... (first 500bp of BRCA1 gene) +################ +Output: +("entity"{tuple_delimiter}"BRCA1"{tuple_delimiter}"gene"{tuple_delimiter}"BRCA1 is a protein-coding tumor suppressor gene located on chromosome 17 in humans, involved in DNA repair mechanisms."){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"Human species, the organism in which BRCA1 gene is found."){record_delimiter} +("entity"{tuple_delimiter}"chromosome 17"{tuple_delimiter}"location"{tuple_delimiter}"Chromosome 17 is the chromosomal location of the BRCA1 gene in humans."){record_delimiter} +("entity"{tuple_delimiter}"DNA repair"{tuple_delimiter}"biological_process"{tuple_delimiter}"DNA repair is a biological process in which BRCA1 is involved as a tumor suppressor."){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"BRCA1 is a gene found in Homo sapiens."){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"chromosome 17"{tuple_delimiter}"BRCA1 is located on chromosome 17 in the human genome."){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"DNA repair"{tuple_delimiter}"BRCA1 is involved in DNA repair processes as a tumor suppressor gene."){record_delimiter} +("content_keywords"{tuple_delimiter}"tumor suppressor, DNA repair, genetic disease, cancer genetics"){completion_delimiter} + +-Example 2- +Sequence Metadata: +################ +molecule_type: RNA +database: RNAcentral +id: URS0000000001 +rna_type: miRNA +description: hsa-let-7a-1 microRNA +organism: Homo sapiens +related_genes: ["LIN28", "HMGA2"] +sequence_chunk: CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG +################ +Output: +("entity"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"rna"{tuple_delimiter}"hsa-let-7a-1 is a microRNA (miRNA) found in Homo sapiens, involved in gene regulation."){record_delimiter} +("entity"{tuple_delimiter}"LIN28"{tuple_delimiter}"gene"{tuple_delimiter}"LIN28 is a gene 
related to hsa-let-7a-1 microRNA, involved in RNA processing and development."){record_delimiter} +("entity"{tuple_delimiter}"HMGA2"{tuple_delimiter}"gene"{tuple_delimiter}"HMGA2 is a gene related to hsa-let-7a-1 microRNA, involved in chromatin structure and gene expression."){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"Human species, the organism in which hsa-let-7a-1 is found."){record_delimiter} +("entity"{tuple_delimiter}"microRNA"{tuple_delimiter}"rna_type"{tuple_delimiter}"MicroRNA is a type of small non-coding RNA involved in post-transcriptional gene regulation."){record_delimiter} +("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"hsa-let-7a-1 is a microRNA found in Homo sapiens."){record_delimiter} +("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"LIN28"{tuple_delimiter}"hsa-let-7a-1 microRNA is related to LIN28 gene, potentially regulating its expression."){record_delimiter} +("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"HMGA2"{tuple_delimiter}"hsa-let-7a-1 microRNA is related to HMGA2 gene, potentially regulating its expression."){record_delimiter} +("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"microRNA"{tuple_delimiter}"hsa-let-7a-1 belongs to the microRNA class of RNA molecules."){record_delimiter} +("content_keywords"{tuple_delimiter}"microRNA, gene regulation, post-transcriptional control, RNA processing"){completion_delimiter} + +-Example 3- +Sequence Metadata: +################ +molecule_type: protein +database: UniProt +id: P01308 +protein_name: Insulin +organism: Homo sapiens +function: ["Regulates glucose metabolism", "Hormone signaling"] +sequence_chunk: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN +################ +Output: +("entity"{tuple_delimiter}"Insulin"{tuple_delimiter}"protein"{tuple_delimiter}"Insulin is a protein hormone in Homo sapiens that regulates glucose metabolism and hormone signaling."){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"Human species, the organism in which Insulin is produced."){record_delimiter} +("entity"{tuple_delimiter}"glucose metabolism"{tuple_delimiter}"biological_process"{tuple_delimiter}"Glucose metabolism is a biological process regulated by Insulin."){record_delimiter} +("entity"{tuple_delimiter}"hormone signaling"{tuple_delimiter}"biological_process"{tuple_delimiter}"Hormone signaling is a biological process in which Insulin participates as a signaling molecule."){record_delimiter} +("relationship"{tuple_delimiter}"Insulin"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"Insulin is a protein produced in Homo sapiens."){record_delimiter} +("relationship"{tuple_delimiter}"Insulin"{tuple_delimiter}"glucose metabolism"{tuple_delimiter}"Insulin regulates glucose metabolism in the body."){record_delimiter} +("relationship"{tuple_delimiter}"Insulin"{tuple_delimiter}"hormone signaling"{tuple_delimiter}"Insulin participates in hormone signaling pathways."){record_delimiter} +("content_keywords"{tuple_delimiter}"hormone, metabolism, glucose regulation, signaling pathway"){completion_delimiter} + +################ +-Real Data- +################ +Entity_types: {entity_types} +Sequence Metadata: {metadata_text} +Sequence Chunk: {sequence_chunk} +################ +Output: +""" + + +TEMPLATE_ZH: str = """你是一个生物信息学专家,擅长分析生物序列(DNA、RNA、蛋白质)及其元数据,提取生物实体及其关系。 + +-目标- 
+给定一个生物序列片段(DNA、RNA或蛋白质)及其元数据,识别所有相关的生物实体及其关系。 +使用中文作为输出语言。 + +-步骤- +1. 识别所有生物实体。对于每个识别的实体,提取以下信息: + - entity_name:实体的名称(基因名、蛋白质名、RNA名、功能域名等),首字母大写 + - entity_type:以下类型之一:[{entity_types}] + - entity_summary:实体生物学功能、结构或属性的全面总结 + 将每个实体格式化为("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. 从步骤1中识别的实体中,识别所有(源实体,目标实体)对,这些实体彼此之间*在生物学上相关*。 + 对于每对相关的实体,提取以下信息: + - source_entity:步骤1中识别的源实体名称 + - target_entity:步骤1中识别的目标实体名称 + - relationship_summary:生物学关系的解释(例如:编码、转录、翻译、相互作用、调控、同源、位于等) + 将每个关系格式化为("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. 识别总结主要生物学概念、功能或主题的高级关键词。 + 将内容级关键词格式化为("content_keywords"{tuple_delimiter}) + +4. 以中文返回步骤1和2中识别出的所有实体和关系的输出列表。使用**{record_delimiter}**作为列表分隔符。 + +5. 完成后,输出{completion_delimiter} + +################ +-示例- +################ +-示例 1- +序列元数据: +################ +molecule_type: DNA +database: NCBI +id: NG_033923 +gene_name: BRCA1 +gene_description: BRCA1 DNA repair associated +organism: Homo sapiens +gene_type: protein-coding +chromosome: 17 +genomic_location: 43044295-43125483 +function: BRCA1 is a tumor suppressor gene involved in DNA repair +sequence_chunk: ATGCGATCGATCGATCG... (BRCA1基因的前500bp) +################ +输出: +("entity"{tuple_delimiter}"BRCA1"{tuple_delimiter}"gene"{tuple_delimiter}"BRCA1是位于人类17号染色体上的蛋白质编码肿瘤抑制基因,参与DNA修复机制。"){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"人类,BRCA1基因所在的生物体。"){record_delimiter} +("entity"{tuple_delimiter}"17号染色体"{tuple_delimiter}"location"{tuple_delimiter}"17号染色体是BRCA1基因在人类基因组中的位置。"){record_delimiter} +("entity"{tuple_delimiter}"DNA修复"{tuple_delimiter}"biological_process"{tuple_delimiter}"DNA修复是BRCA1作为肿瘤抑制基因参与的生物学过程。"){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"BRCA1是在人类中发现的基因。"){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"17号染色体"{tuple_delimiter}"BRCA1位于人类基因组的17号染色体上。"){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"DNA修复"{tuple_delimiter}"BRCA1作为肿瘤抑制基因参与DNA修复过程。"){record_delimiter} +("content_keywords"{tuple_delimiter}"肿瘤抑制, DNA修复, 遗传疾病, 癌症遗传学"){completion_delimiter} + +################ +-真实数据- +################ +实体类型:{entity_types} +序列元数据:{metadata_text} +序列片段:{sequence_chunk} +################ +输出: +""" + + +CONTINUE_EN: str = """MANY entities and relationships were missed in the last extraction. \ +Add them below using the same format: +""" + +CONTINUE_ZH: str = """很多实体和关系在上一次的提取中可能被遗漏了。请在下面使用相同的格式添加它们:""" + +IF_LOOP_EN: str = """It appears some entities and relationships may have still been missed. \ +Answer YES | NO if there are still entities and relationships that need to be added. 
+""" + +IF_LOOP_ZH: str = """看起来可能仍然遗漏了一些实体和关系。如果仍有实体和关系需要添加,请回答YES | NO。""" + +OMICS_KG_EXTRACTION_PROMPT: dict = { + "en": { + "TEMPLATE": TEMPLATE_EN, + "CONTINUE": CONTINUE_EN, + "IF_LOOP": IF_LOOP_EN, + }, + "zh": { + "TEMPLATE": TEMPLATE_ZH, + "CONTINUE": CONTINUE_ZH, + "IF_LOOP": IF_LOOP_ZH, + }, + "FORMAT": { + "tuple_delimiter": "<|>", + "record_delimiter": "##", + "completion_delimiter": "<|COMPLETE|>", + "entity_types": "gene, rna, protein, organism, location, biological_process, rna_type, protein_domain, \ +mutation, pathway, disease, function, structure", + }, +} diff --git a/graphgen/templates/kg/protein_kg_extraction.py b/graphgen/templates/kg/protein_kg_extraction.py new file mode 100644 index 00000000..16e9c868 --- /dev/null +++ b/graphgen/templates/kg/protein_kg_extraction.py @@ -0,0 +1,144 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are an expert in protein science and knowledge-graph construction. +Your task is to extract a star-shaped knowledge graph centered on **a single protein** mentioned in the given text. + +-Goal- +Given free-text that discusses one or more proteins, identify: +1. The **central protein** (the first-mentioned protein or the protein explicitly indicated by the user). +2. All entities that are **directly related** to this central protein. +3. All relationships that **directly link** those entities to the central protein (star edges). + +Use English as the output language. Please provide only the result without any extra explanations. + +-Steps- +1. Identify the **central protein entity** and all **directly-related entities** from the text. + For the **central protein**, extract: + - entity_name: use the full name or UniProt ID if given; capitalized. + - entity_type: always `protein`. + - entity_summary: concise description of its main biological role, location, or significance in the text. + + For each **directly-related entity**, extract: + - entity_name: capitalized. + - entity_type: one of [{entity_types}]. + - entity_summary: comprehensive summary of its attributes/activities **as stated in the text**. + + Format each entity as + ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. From the entities found in Step 1, list every **(central protein → related entity)** pair that is **clearly related**. + For each pair extract: + - source_entity: the **central protein** name. + - target_entity: the related entity name. + - relationship_summary: short explanation of how the central protein is connected to this entity **according to the text**. + + Format each relationship as + ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. Output a single list of all entities and relationships from Steps 1–2, using **{record_delimiter}** as the delimiter. + +4. Finish by printing {completion_delimiter} + +################ +-Example- +################ +Text: +################ +The tumor-suppressor protein p53 is a transcription factor that responds to DNA damage. +Phosphorylation of p53 by ATM kinase at serine-15 enhances its stability. +MDM2, an E3 ubiquitin ligase, negatively regulates p53 via ubiquitination. 
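The expected output below uses the shared delimiter settings (tuple `<|>`, record `##`, completion `<|COMPLETE|>`) defined in the FORMAT dicts of these extraction prompts. As a rough sketch of how such output can be split back into attribute lists: `split_string_by_multi_markers` is the helper this PR re-exports from `graphgen.utils`, while `parse_records` itself is illustrative and not part of the diff.

```python
from graphgen.utils import split_string_by_multi_markers

TUPLE_DELIM, RECORD_DELIM, COMPLETION_DELIM = "<|>", "##", "<|COMPLETE|>"


def parse_records(raw_output: str) -> list[list[str]]:
    """Turn delimiter-formatted LLM output into attribute lists, e.g.
    ['relationship', 'p53', 'MDM2', 'MDM2 ubiquitinates p53, ...']."""
    raw_output = raw_output.replace(COMPLETION_DELIM, "")
    records = []
    for record in split_string_by_multi_markers(raw_output, [RECORD_DELIM]):
        record = record.strip().strip("()")  # drop the surrounding parentheses
        attrs = split_string_by_multi_markers(record, [TUPLE_DELIM])
        records.append([attr.strip().strip('"') for attr in attrs])
    return records
```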
+################ +Output: +("entity"{tuple_delimiter}"p53"{tuple_delimiter}"protein"{tuple_delimiter}"Tumor-suppressor transcription factor that responds to DNA damage and is regulated by post-translational modifications."){record_delimiter} +("entity"{tuple_delimiter}"ATM kinase"{tuple_delimiter}"protein"{tuple_delimiter}"Protein kinase that phosphorylates p53 at serine-15, thereby enhancing p53 stability."){record_delimiter} +("entity"{tuple_delimiter}"serine-15"{tuple_delimiter}"site"{tuple_delimiter}"Phosphorylation site on p53 that is targeted by ATM kinase."){record_delimiter} +("entity"{tuple_delimiter}"MDM2"{tuple_delimiter}"protein"{tuple_delimiter}"E3 ubiquitin ligase that negatively regulates p53 through ubiquitination."){record_delimiter} +("entity"{tuple_delimiter}"DNA damage"{tuple_delimiter}"concept"{tuple_delimiter}"Cellular stress signal that activates p53-mediated transcriptional response."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"ATM kinase"{tuple_delimiter}"ATM kinase phosphorylates p53, enhancing its stability."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"serine-15"{tuple_delimiter}"p53 is phosphorylated at serine-15 by ATM kinase."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"MDM2"{tuple_delimiter}"MDM2 ubiquitinates p53, negatively regulating its activity."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"DNA damage"{tuple_delimiter}"p53 acts as a sensor-transcription factor in response to DNA damage."){completion_delimiter} + +################ +-Real Data- +Entity_types: {entity_types} +Text: {input_text} +################ +Output: +""" + + +TEMPLATE_ZH: str = """您是蛋白质科学与知识图谱构建专家。 +任务:从给定文本中抽取以**一个中心蛋白质**为核心的星型知识图谱。 + +-目标- +文本可能提及一个或多个蛋白质,请: +1. 确定**中心蛋白质**(文本首个提及或用户指定的蛋白)。 +2. 识别所有与中心蛋白**直接相关**的实体。 +3. 仅保留**中心蛋白→相关实体**的直接关系(星型边)。 + +使用中文输出。请直接给出结果,不要增加额外说明。 + +-步骤- +1. 确定**中心蛋白质实体**及所有**直接相关实体**。 + 对于**中心蛋白质**: + - entity_name:全名或UniProt ID,首字母大写。 + - entity_type:固定为`protein`。 + - entity_summary:简述其在文中的生物学功能、定位或意义。 + + 对于每个**直接相关实体**: + - entity_name:首字母大写。 + - entity_type:可选类型[{entity_types}]。 + - entity_summary:全面总结其在文中与中心蛋白相关的属性/活动。 + + 格式:("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. 在步骤1的实体中,列出所有**(中心蛋白→相关实体)**的明显关系对。 + 每对提取: + - source_entity:中心蛋白名称。 + - target_entity:相关实体名称。 + - relationship_summary:简要说明文中二者如何直接关联。 + + 格式:("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. 将步骤1–2的所有实体与关系合并为单列表,用**{record_delimiter}**分隔。 + +4. 
输出结束标记{completion_delimiter} + +################ +-示例- +################ +文本: +################ +肿瘤抑制蛋白p53是一种转录因子,可响应DNA损伤。ATM激酶在第15位丝氨酸磷酸化p53,增强其稳定性。E3泛素连接酶MDM2通过泛素化负调控p53。 +################ +输出: +("entity"{tuple_delimiter}"p53"{tuple_delimiter}"protein"{tuple_delimiter}"肿瘤抑制转录因子,能感知DNA损伤并通过翻译后修饰被调控。"){record_delimiter} +("entity"{tuple_delimiter}"ATM激酶"{tuple_delimiter}"protein"{tuple_delimiter}"蛋白激酶,在丝氨酸-15位点磷酸化p53,从而提高其稳定性。"){record_delimiter} +("entity"{tuple_delimiter}"丝氨酸-15"{tuple_delimiter}"site"{tuple_delimiter}"p53上被ATM激酶靶向的磷酸化位点。"){record_delimiter} +("entity"{tuple_delimiter}"MDM2"{tuple_delimiter}"protein"{tuple_delimiter}"E3泛素连接酶,通过泛素化负调控p53。"){record_delimiter} +("entity"{tuple_delimiter}"DNA损伤"{tuple_delimiter}"concept"{tuple_delimiter}"细胞内应激信号,可激活p53介导的转录应答。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"ATM激酶"{tuple_delimiter}"ATM激酶磷酸化p53,增强其稳定性。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"丝氨酸-15"{tuple_delimiter}"p53在该位点被ATM激酶磷酸化。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"MDM2"{tuple_delimiter}"MDM2对p53进行泛素化,负向调控其活性。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"DNA损伤"{tuple_delimiter}"p53作为感受器-转录因子响应DNA损伤。"){completion_delimiter} + +################ +-真实数据- +实体类型:{entity_types} +文本:{input_text} +################ +输出: +""" + + +PROTEIN_KG_EXTRACTION_PROMPT: dict = { + "en": TEMPLATE_EN, + "zh": TEMPLATE_ZH, + "FORMAT": { + "tuple_delimiter": "<|>", + "record_delimiter": "##", + "completion_delimiter": "<|COMPLETE|>", + "entity_types": "protein, gene, site, modification, pathway, disease, drug, organism, tissue, cell_line, " + "experiment, technology, concept, location, organization, person, mission, science", + }, +} diff --git a/graphgen/templates/question_generation.py b/graphgen/templates/question_generation.py index d9ca9128..e75bf169 100644 --- a/graphgen/templates/question_generation.py +++ b/graphgen/templates/question_generation.py @@ -1,47 +1,5 @@ # pylint: disable=C0301 -TEMPLATE_SINGLE_EN: str = """The answer to a question is provided. Please generate a question that corresponds to the answer. -################ -Answer: -{answer} -################ -Question: -""" - -TEMPLATE_SINGLE_ZH: str = """下面提供了一个问题的答案,请生成一个与答案对应的问题。 - -################ -答案: -{answer} -################ -问题: -""" - -TEMPLATE_SINGLE_QA_EN: str = """You are given a text passage. Your task is to generate a question and answer (QA) pair based on the content of that text. -The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text. -For example: -Question: What is the effect of overexpressing the BG1 gene on grain size and development? -Answer: Overexpression of the BG1 gene leads to significantly increased grain size, demonstrating its role in grain development. - -Question: What role does TAC4 play in the gravitropism of rice shoots? -Answer: TAC4 is a key regulator of gravitropism in rice shoots, promoting the bending of shoots towards the gravity vector. - -Here is the text passage you need to generate a QA pair for: -{doc} -""" - -TEMPLATE_SINGLE_QA_ZH: str = """给定一个文本段落。你的任务是根据该文本的内容生成一个问答(QA)对。 -答案应准确且直接从文本中得出。确保QA对与给定文本的主题或重要细节相关。 -例如: -问题:过表达BG1基因对谷粒大小和发育有什么影响? -答案:BG1基因的过表达显著增加了谷粒大小,表明其在谷物发育中的作用。 - -问题:TAC4在水稻茎的重力性状中扮演什么角色? 
-答案:TAC4是水稻茎重力性状的关键调节因子,促进茎向重力矢量弯曲。 - -以下是你需要为其生成QA对的文本段落: -{doc} -""" # TODO: 修改这里的prompt TEMPLATE_MULTI_EN = """You are an assistant to help read a article and then rephrase it in a question answering format. The user will provide you with an article with its content. You need to generate a paraphrase of the same article in question and answer format with one tag of "Question: ..." followed by "Answer: ...". Remember to keep the meaning and every content of the article intact. @@ -66,13 +24,9 @@ QUESTION_GENERATION_PROMPT = { "English": { - "SINGLE_TEMPLATE": TEMPLATE_SINGLE_EN, - "SINGLE_QA_TEMPLATE": TEMPLATE_SINGLE_QA_EN, - "MULTI_TEMPLATE": TEMPLATE_MULTI_EN + "MULTI_TEMPLATE": TEMPLATE_MULTI_EN, }, "Chinese": { - "SINGLE_TEMPLATE": TEMPLATE_SINGLE_ZH, - "SINGLE_QA_TEMPLATE": TEMPLATE_SINGLE_QA_ZH, - "MULTI_TEMPLATE": TEMPLATE_MULTI_ZH - } + "MULTI_TEMPLATE": TEMPLATE_MULTI_ZH, + }, } diff --git a/graphgen/templates/search_judgement.py b/graphgen/templates/search_judgement.py index ca9e7e12..06837ac8 100644 --- a/graphgen/templates/search_judgement.py +++ b/graphgen/templates/search_judgement.py @@ -1,23 +1,23 @@ # pylint: disable=C0301 TEMPLATE: str = """-Goal- -Please select the most relevant search result for the given entity. -The name and description of the entity are provided. The search results are provided as a list. -Please select the most relevant search result from the list. If none of the search results are relevant, please select 'None of the above'. +Please select the most relevant searcher result for the given entity. +The name and description of the entity are provided. The searcher results are provided as a list. +Please select the most relevant searcher result from the list. If none of the searcher results are relevant, please select 'None of the above'. Steps: 1. Read the name and description of the entity. -2. Read the search results. For each search result, compare it with the entity name and description to determine if it is relevant. +2. Read the searcher results. For each searcher result, compare it with the entity name and description to determine if it is relevant. -3. Select the most relevant search result from the list. If none of the search results are relevant, select 'None of the above'. +3. Select the most relevant searcher result from the list. If none of the searcher results are relevant, select 'None of the above'. 4. Output your selection directly, please do not provide any additional information. 
################ -Examples- ################ -{examples} +{input_examples} ################ -Real Data- diff --git a/graphgen/utils/__init__.py b/graphgen/utils/__init__.py index 932f8df1..ec118816 100644 --- a/graphgen/utils/__init__.py +++ b/graphgen/utils/__init__.py @@ -1,9 +1,22 @@ -from .log import logger, set_logger, parse_log -from .loop import create_event_loop -from .format import (pack_history_conversations, split_string_by_multi_markers, - handle_single_entity_extraction, handle_single_relationship_extraction, - load_json, write_json) -from .hash import compute_content_hash, compute_args_hash -from .detect_lang import detect_main_language, detect_if_chinese from .calculate_confidence import yes_no_loss_entropy +from .detect_lang import detect_if_chinese, detect_main_language +from .device import pick_device +from .format import ( + handle_single_entity_extraction, + handle_single_relationship_extraction, + load_json, + pack_history_conversations, + split_string_by_multi_markers, + write_json, +) +from .hash import ( + compute_args_hash, + compute_content_hash, + compute_dict_hash, + compute_mm_hash, +) from .help_nltk import NLTKHelper +from .log import CURRENT_LOGGER_VAR, logger, set_logger +from .loop import create_event_loop +from .run_concurrent import run_concurrent +from .wrap import async_to_sync_method diff --git a/graphgen/utils/calculate_confidence.py b/graphgen/utils/calculate_confidence.py index 1b596d91..9a055cc2 100644 --- a/graphgen/utils/calculate_confidence.py +++ b/graphgen/utils/calculate_confidence.py @@ -1,34 +1,41 @@ import math -from typing import List -from graphgen.models.llm.topk_token_model import Token +from typing import Dict, List + +from graphgen.bases.datatypes import Token + def preprocess_tokens(tokens: List[Token]) -> List[Token]: """Preprocess tokens for calculating confidence.""" tokens = [x for x in tokens if x.prob > 0] return tokens + def joint_probability(tokens: List[Token]) -> float: """Calculate joint probability of a list of tokens.""" tokens = preprocess_tokens(tokens) logprob_sum = sum(x.logprob for x in tokens) return math.exp(logprob_sum / len(tokens)) + def min_prob(tokens: List[Token]) -> float: """Calculate the minimum probability of a list of tokens.""" tokens = preprocess_tokens(tokens) return min(x.prob for x in tokens) + def average_prob(tokens: List[Token]) -> float: """Calculate the average probability of a list of tokens.""" tokens = preprocess_tokens(tokens) return sum(x.prob for x in tokens) / len(tokens) + def average_confidence(tokens: List[Token]) -> float: """Calculate the average confidence of a list of tokens.""" tokens = preprocess_tokens(tokens) confidence = [x.prob / sum(y.prob for y in x.top_candidates[:5]) for x in tokens] return sum(confidence) / len(tokens) + def yes_no_loss(tokens_list: List[List[Token]], ground_truth: List[str]) -> float: """Calculate the loss for yes/no question.""" losses = [] @@ -41,14 +48,112 @@ def yes_no_loss(tokens_list: List[List[Token]], ground_truth: List[str]) -> floa losses.append(token.prob) return sum(losses) / len(losses) -def yes_no_loss_entropy(tokens_list: List[List[Token]], ground_truth: List[str]) -> float: + +def _normalize_yes_no(tokens: List[Token]) -> Dict[str, float]: + """ + Mapping yes/no synonyms to their probabilities and normalizing. 
+ For example, given tokens with probabilities: + - "yes" (0.6) + - "yeah" (0.2) + - "no" (0.1) + - "nope" (0.1) + The function will return: + {"yes": 0.8, "no": 0.2} + Among them, "yes" and "yeah" are synonyms for "yes", + while "no" and "nope" are synonyms for "no". + If no "yes" or "no" synonyms are present, it will be judged as uncertain. + An uncertain result will also be considered as opposite to the ground truth. + """ + yes_syno = { + # English yes synonyms + "yes", + "yeah", + "yea", + "yep", + "yup", + "yay", + "ya", + "yah", + "sure", + "certainly", + "absolutely", + "definitely", + "exactly", + "indeed", + "right", + "correct", + "true", + "t", + "1", + # Chinese yes synonyms + "是", + "对", + "好的", + "行", + "可以", + "没错", + "当然", + "确实", + "正确", + "真", + "对的", + } + no_syno = { + # English no synonyms + "no", + "nope", + "nop", + "nah", + "naw", + "na", + "negative", + "never", + "not", + "false", + "f", + "0", + # Chinese no synonyms + "不", + "不是", + "没有", + "错", + "不对", + "不行", + "不能", + "否", + "假的", + } + + yes_prob = 0.0 + no_prob = 0.0 + uncertain_prob = 0.0 + for tok in tokens: + t = tok.text.lower().strip() + if t in yes_syno: + yes_prob += tok.prob + elif t in no_syno: + no_prob += tok.prob + else: + uncertain_prob += tok.prob + + total = yes_prob + no_prob + uncertain_prob + + return { + "yes": yes_prob / total, + "no": no_prob / total, + "uncertain": uncertain_prob / total, + } + + +def yes_no_loss_entropy( + tokens_list: List[List[Token]], ground_truth: List[str] +) -> float: """Calculate the loss for yes/no question using entropy.""" losses = [] - for i, tokens in enumerate(tokens_list): - token = tokens[0] - assert token.text.lower() in ["yes", "no"] - if token.text == ground_truth[i]: - losses.append(-math.log(token.prob)) - else: - losses.append(-math.log(1 - token.prob)) + for toks, gt in zip(tokens_list, ground_truth): + dist = _normalize_yes_no(toks) + gt = gt.lower() + assert gt in {"yes", "no"} + prob_correct = dist[gt] + losses.append(-math.log(prob_correct)) return sum(losses) / len(losses) diff --git a/graphgen/utils/detect_lang.py b/graphgen/utils/detect_lang.py index c34ddac4..944ccb2a 100644 --- a/graphgen/utils/detect_lang.py +++ b/graphgen/utils/detect_lang.py @@ -1,40 +1,41 @@ def detect_main_language(text): """ - 识别文本的主要语言 + Detect the main language of the text, 'zh' for Chinese, 'en' for English :param text: :return: """ assert isinstance(text, str) + def is_chinese_char(char): - return '\u4e00' <= char <= '\u9fff' + return "\u4e00" <= char <= "\u9fff" def is_english_char(char): return char.isascii() and char.isalpha() - # 去除空格和标点符号 - text = ''.join(char for char in text if char.strip()) + text = "".join(char for char in text if char.strip()) chinese_count = sum(1 for char in text if is_chinese_char(char)) english_count = sum(1 for char in text if is_english_char(char)) total = chinese_count + english_count if total == 0: - return 'en' + return "en" chinese_ratio = chinese_count / total if chinese_ratio >= 0.5: - return 'zh' - return 'en' + return "zh" + return "en" + def detect_if_chinese(text): """ - 判断文本是否包含有中文 + Detect if the text contains any Chinese characters :param text: :return: """ assert isinstance(text, str) - return any('\u4e00' <= char <= '\u9fff' for char in text) + return any("\u4e00" <= char <= "\u9fff" for char in text) diff --git a/graphgen/utils/device.py b/graphgen/utils/device.py new file mode 100644 index 00000000..1e5d8642 --- /dev/null +++ b/graphgen/utils/device.py @@ -0,0 +1,44 @@ +import shutil +import subprocess +import 
sys + + +def pick_device() -> str: + """Return the best available device string for MinerU.""" + # 1. NVIDIA GPU + if shutil.which("nvidia-smi") is not None: + try: + # check if there's any free GPU memory + out = subprocess.check_output( + [ + "nvidia-smi", + "--query-gpu=memory.free", + "--format=csv,noheader,nounits", + ], + text=True, + ) + if any(int(line) > 0 for line in out.strip().splitlines()): + return "cuda:0" + except Exception: # pylint: disable=broad-except + pass + + # 2. Apple Silicon + if sys.platform == "darwin" and shutil.which("sysctl"): + try: + brand = subprocess.check_output( + ["sysctl", "-n", "machdep.cpu.brand_string"], text=True + ) + if "Apple" in brand: + return "mps" + except Exception: # pylint: disable=broad-except + pass + + # 3. Ascend NPU + if shutil.which("npu-smi") is not None: + try: + subprocess.check_call(["npu-smi", "info"], stdout=subprocess.DEVNULL) + return "npu" + except Exception: # pylint: disable=broad-except + pass + + return "cpu" diff --git a/graphgen/utils/format.py b/graphgen/utils/format.py index 0a0c101d..1f0675f1 100644 --- a/graphgen/utils/format.py +++ b/graphgen/utils/format.py @@ -1,16 +1,17 @@ -import re -import os -import json import html - +import json +import os +import re from typing import Any + def pack_history_conversations(*args: str): roles = ["user", "assistant"] return [ {"role": roles[i % 2], "content": content} for i, content in enumerate(args) ] + def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]: """Split a string by multiple markers""" if not markers: @@ -18,6 +19,7 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str] results = re.split("|".join(re.escape(marker) for marker in markers), content) return [r.strip() for r in results if r.strip()] + # Refer the utils functions of the official GraphRAG implementation: # https://github.com/microsoft/graphrag def clean_str(input: Any) -> str: @@ -30,6 +32,7 @@ def clean_str(input: Any) -> str: # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) + async def handle_single_entity_extraction( record_attributes: list[str], chunk_key: str, @@ -50,9 +53,11 @@ async def handle_single_entity_extraction( "source_id": entity_source_id, } + def is_float_regex(value): return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value)) + async def handle_single_relationship_extraction( record_attributes: list[str], chunk_key: str, @@ -72,12 +77,14 @@ async def handle_single_relationship_extraction( "source_id": edge_source_id, } + def load_json(file_name): if not os.path.exists(file_name): return None with open(file_name, encoding="utf-8") as f: return json.load(f) + def write_json(json_obj, file_name): if not os.path.exists(os.path.dirname(file_name)): os.makedirs(os.path.dirname(file_name), exist_ok=True) diff --git a/graphgen/utils/hash.py b/graphgen/utils/hash.py index bf93ec5f..04ba96e7 100644 --- a/graphgen/utils/hash.py +++ b/graphgen/utils/hash.py @@ -1,7 +1,28 @@ from hashlib import md5 + def compute_args_hash(*args): return md5(str(args).encode()).hexdigest() + def compute_content_hash(content, prefix: str = ""): return prefix + md5(content.encode()).hexdigest() + + +def compute_mm_hash(item, prefix: str = ""): + if item.get("type") == "text" and item.get("text"): + content = item["text"].strip() + elif item.get("type") == "image" and item.get("img_path"): + content = f"image:{item['img_path']}" + elif item.get("type") == "table" 
and item.get("table_body"): + content = f"table:{item['table_body']}" + elif item.get("type") == "equation" and item.get("text"): + content = f"equation:{item['text']}" + else: + content = str(item) + return prefix + md5(content.encode()).hexdigest() + + +def compute_dict_hash(d: dict, prefix: str = ""): + items = tuple(sorted(d.items())) + return prefix + md5(str(items).encode()).hexdigest() diff --git a/graphgen/utils/log.py b/graphgen/utils/log.py index 32b9bac6..e29e994e 100644 --- a/graphgen/utils/log.py +++ b/graphgen/utils/log.py @@ -1,32 +1,101 @@ +import contextvars import logging +import os +from logging.handlers import RotatingFileHandler +from typing import Any -logger = logging.getLogger("graphgen") +from rich.logging import RichHandler -def set_logger(log_file: str, log_level: int = logging.INFO, if_stream: bool = True): - logger.setLevel(log_level) - formatter = logging.Formatter( - "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - ) +def set_logger( + log_file: str, + name: str, + file_level: int = logging.DEBUG, + console_level: int = logging.INFO, + *, + if_stream: bool = True, + max_bytes: int = 50 * 1024 * 1024, # 50 MB + backup_count: int = 5, + force: bool = False, +): + + current_logger = logging.getLogger(name) + if current_logger.hasHandlers() and not force: + return current_logger + + if force: + current_logger.handlers.clear() - file_handler = logging.FileHandler(log_file, mode='w') - file_handler.setLevel(log_level) - file_handler.setFormatter(formatter) + current_logger.setLevel( + min(file_level, console_level) + ) # Set to the lowest level to capture all logs + current_logger.propagate = False - stream_handler = None + if log_file: + os.makedirs(os.path.dirname(log_file), exist_ok=True) if if_stream: - stream_handler = logging.StreamHandler() - stream_handler.setLevel(log_level) - stream_handler.setFormatter(formatter) + console = RichHandler( + level=console_level, show_path=False, rich_tracebacks=True + ) + console.setFormatter(logging.Formatter("%(message)s")) + current_logger.addHandler(console) + + file_handler = RotatingFileHandler( + log_file, + maxBytes=max_bytes, + backupCount=backup_count, + encoding="utf-8", + ) + file_handler.setLevel(file_level) + file_handler.setFormatter( + logging.Formatter( + "[%(asctime)s] %(levelname)s [%(name)s:%(filename)s:%(lineno)d] %(message)s", + datefmt="%y-%m-%d %H:%M:%S", + ) + ) + current_logger.addHandler(file_handler) + return current_logger + + +CURRENT_LOGGER_VAR = contextvars.ContextVar("current_logger") + + +def get_current_logger() -> logging.Logger: + current_logger = CURRENT_LOGGER_VAR.get() + if not current_logger: + raise RuntimeError("No logger is set in the current context.") + return current_logger + + +class ContextAwareLogger: + @staticmethod + def _get_logger() -> logging.Logger: + return get_current_logger() + + def debug(self, msg: object, *args: Any, **kwargs: Any) -> None: + self._get_logger().debug(msg, *args, **kwargs) + + def info(self, msg: object, *args: Any, **kwargs: Any) -> None: + self._get_logger().info(msg, *args, **kwargs) + + def warning(self, msg: object, *args: Any, **kwargs: Any) -> None: + self._get_logger().warning(msg, *args, **kwargs) + + def error(self, msg: object, *args: Any, **kwargs: Any) -> None: + self._get_logger().error(msg, *args, **kwargs) + + def exception(self, msg: object, *args: Any, **kwargs: Any) -> None: + self._get_logger().exception(msg, *args, **kwargs) + + def critical(self, msg: object, *args: Any, **kwargs: Any) -> None: + 
self._get_logger().critical(msg, *args, **kwargs) + + def log(self, level: int, msg: object, *args: Any, **kwargs: Any) -> None: + self._get_logger().log(level, msg, *args, **kwargs) - if not logger.handlers: - logger.addHandler(file_handler) - if if_stream and stream_handler: - logger.addHandler(stream_handler) + def __getattr__(self, name: str) -> Any: + return getattr(self._get_logger(), name) -def parse_log(log_file: str): - with open(log_file, "r", encoding='utf-8') as f: - lines = f.readlines() - return lines +logger = ContextAwareLogger() diff --git a/graphgen/utils/loop.py b/graphgen/utils/loop.py index 5f12fa5b..f0ab7dfd 100644 --- a/graphgen/utils/loop.py +++ b/graphgen/utils/loop.py @@ -1,9 +1,10 @@ import asyncio +from typing import Tuple from .log import logger -def create_event_loop() -> asyncio.AbstractEventLoop: +def create_event_loop() -> Tuple[asyncio.AbstractEventLoop, bool]: """ Ensure that there is always an event loop available. @@ -11,18 +12,25 @@ def create_event_loop() -> asyncio.AbstractEventLoop: it creates a new event loop and sets it as the current event loop. Returns: - asyncio.AbstractEventLoop: The current or newly created event loop. + Tuple[asyncio.AbstractEventLoop, bool]: The event loop and a flag + indicating if we created it (True) or it was already running (False). """ try: - # Try to get the current event loop - current_loop = asyncio.get_event_loop() - if current_loop.is_closed(): - raise RuntimeError("Event loop is closed.") - return current_loop - + # Try to get the running event loop (Python 3.7+) + running_loop = asyncio.get_running_loop() + # If we get here, there's already a running loop + return running_loop, False except RuntimeError: - # If no event loop exists or it is closed, create a new one - logger.info("Creating a new event loop in main thread.") - new_loop = asyncio.new_event_loop() - asyncio.set_event_loop(new_loop) - return new_loop + # No running loop, try to get the current event loop + try: + current_loop = asyncio.get_event_loop() + if current_loop.is_closed(): + raise RuntimeError("Event loop is closed.") from None + # Loop exists but not running, we can use it + return current_loop, False + except RuntimeError: + # No event loop exists, create a new one + logger.info("Creating a new event loop in main thread.") + new_loop = asyncio.new_event_loop() + asyncio.set_event_loop(new_loop) + return new_loop, True diff --git a/graphgen/utils/run_concurrent.py b/graphgen/utils/run_concurrent.py new file mode 100644 index 00000000..8315b953 --- /dev/null +++ b/graphgen/utils/run_concurrent.py @@ -0,0 +1,122 @@ +import asyncio +from typing import Awaitable, Callable, List, Optional, TypeVar, Union + +import gradio as gr +from tqdm.asyncio import tqdm as tqdm_async + +from graphgen.utils.log import logger + +from .loop import create_event_loop + +T = TypeVar("T") +R = TypeVar("R") + + +def run_concurrent( + coro_fn: Callable[[T], Awaitable[R]], + items: List[T], + *, + desc: str = "processing", + unit: str = "item", + progress_bar: Optional[gr.Progress] = None, + save_interval: int = 0, + save_callback: Optional[Callable[[List[R], int], None]] = None, + max_concurrent: Optional[int] = None, +) -> Union[List[R], Awaitable[List[R]]]: + """ + Run coroutines concurrently with optional periodic saving. 
+ This function can be used in both sync and async contexts: + - In sync context: returns List[R] directly + - In async context: returns Awaitable[List[R]] (use with 'await') + :return: List of results (in sync context) or coroutine (in async context) + """ + async def _run_all(): + if not items: + return [] + # Use semaphore to limit concurrent tasks if max_concurrent is specified + semaphore = asyncio.Semaphore(max_concurrent) if max_concurrent is not None and max_concurrent > 0 else None + + async def run_with_semaphore(item: T) -> R: + """Wrapper to apply semaphore if needed.""" + if semaphore: + async with semaphore: + return await coro_fn(item) + else: + return await coro_fn(item) + + # Create tasks with concurrency limit + if max_concurrent is not None and max_concurrent > 0: + # Use semaphore-controlled wrapper + tasks = [asyncio.create_task(run_with_semaphore(it)) for it in items] + else: + # Original behavior: create all tasks at once + tasks = [asyncio.create_task(coro_fn(it)) for it in items] + + completed_count = 0 + results = [] + pending_save_results = [] + pbar = tqdm_async(total=len(items), desc=desc, unit=unit) + + for future in asyncio.as_completed(tasks): + try: + result = await future + results.append(result) + if save_interval > 0 and save_callback is not None: + pending_save_results.append(result) + except Exception as e: # pylint: disable=broad-except + logger.exception("Task failed: %s", e) + # even if failed, record it to keep results consistent with tasks + results.append(e) + + completed_count += 1 + pbar.update(1) + + if progress_bar is not None: + progress = completed_count / len(items) + progress_bar(progress, desc=f"{desc} ({completed_count}/{len(items)})") + + # Periodic save + if save_interval > 0 and save_callback is not None and completed_count % save_interval == 0: + try: + # Filter out exceptions before saving + valid_results = [res for res in pending_save_results if not isinstance(res, Exception)] + save_callback(valid_results, completed_count) + pending_save_results = [] # Clear after saving + logger.info("Saved intermediate results: %d/%d completed", completed_count, len(items)) + except Exception as e: + logger.warning("Failed to save intermediate results: %s", e) + + pbar.close() + + if progress_bar is not None: + progress_bar(1.0, desc=f"{desc} (completed)") + + # Save remaining results if any + if save_interval > 0 and save_callback is not None and pending_save_results: + try: + valid_results = [res for res in pending_save_results if not isinstance(res, Exception)] + save_callback(valid_results, completed_count) + logger.info("Saved final intermediate results: %d completed", completed_count) + except Exception as e: + logger.warning("Failed to save final intermediate results: %s", e) + + # filter out exceptions + return [res for res in results if not isinstance(res, Exception)] + + # Check if we're in an async context (event loop is running) + try: + _ = asyncio.get_running_loop() + # If we're in an async context, return the coroutine directly + # The caller should use 'await run_concurrent(...)' + return _run_all() + except RuntimeError: + # No running loop, we can create one and run until complete + if not items: + return [] + loop, created = create_event_loop() + try: + return loop.run_until_complete(_run_all()) + finally: + # Only close the loop if we created it + if created: + loop.close() diff --git a/graphgen/utils/wrap.py b/graphgen/utils/wrap.py new file mode 100644 index 00000000..9689cea6 --- /dev/null +++ b/graphgen/utils/wrap.py @@ 
-0,0 +1,21 @@ +from functools import wraps +from typing import Any, Callable + +from .loop import create_event_loop + + +def async_to_sync_method(func: Callable) -> Callable: + @wraps(func) + def wrapper(self, *args, **kwargs) -> Any: + loop, created = create_event_loop() + try: + if loop.is_running(): + raise RuntimeError( + "Cannot use async_to_sync_method when event loop is already running." + ) + return loop.run_until_complete(func(self, *args, **kwargs)) + finally: + if created: + loop.close() + + return wrapper diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..2c6b870e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,12 @@ +[tool.black] +line-length = 88 # black 默认 88 +include = '\.pyi?$' + +[tool.isort] +profile = "black" # 一键适配 black +line_length = 88 # 与 black 保持一致 +multi_line_output = 3 # black 偏好的括号换行风格 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 00000000..55b033e9 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1 @@ +pytest \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ab329cb5..44079ab5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,8 +12,30 @@ nltk jieba plotly pandas -gradio>=5.25.0 -gradio-i18n==0.3.0 +gradio==5.44.1 kaleido pyyaml langcodes +requests +fastapi +trafilatura +aiohttp +socksio +pydantic +ray==2.52.1 + +leidenalg +igraph +python-louvain + +# storage +rocksdict + +# KG +rdflib + +# Bioinformatics +biopython + +# For visualization +matplotlib diff --git a/resources/examples/chunked_demo.json b/resources/examples/chunked_demo.json deleted file mode 100644 index ad7219a3..00000000 --- a/resources/examples/chunked_demo.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - [ - {"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} - ], - [ - {"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} - ], - [ - {"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. 
Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} - ], - [ - {"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} - ] -] diff --git a/resources/examples/raw_demo.jsonl b/resources/examples/raw_demo.jsonl deleted file mode 100644 index 024559a6..00000000 --- a/resources/examples/raw_demo.jsonl +++ /dev/null @@ -1,4 +0,0 @@ -{"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} -{"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} -{"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} -{"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). 
Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} diff --git a/resources/images/flow.png b/resources/images/flow.png deleted file mode 100644 index 9800a7b0..00000000 Binary files a/resources/images/flow.png and /dev/null differ diff --git a/resources/images/logo.png b/resources/images/logo.png deleted file mode 100644 index 1acc963d..00000000 Binary files a/resources/images/logo.png and /dev/null differ diff --git a/resources/nltk_data/corpora/stopwords/chinese b/resources/nltk_data/corpora/stopwords/chinese deleted file mode 100644 index 0873a903..00000000 --- a/resources/nltk_data/corpora/stopwords/chinese +++ /dev/null @@ -1,841 +0,0 @@ -一 -一下 -一些 -一切 -一则 -一天 -一定 -一方面 -一旦 -一时 -一来 -一样 -一次 -一片 -一直 -一致 -一般 -一起 -一边 -一面 -万一 -上下 -上升 -上去 -上来 -上述 -上面 -下列 -下去 -下来 -下面 -不一 -不久 -不仅 -不会 -不但 -不光 -不单 -不变 -不只 -不可 -不同 -不够 -不如 -不得 -不怕 -不惟 -不成 -不拘 -不敢 -不断 -不是 -不比 -不然 -不特 -不独 -不管 -不能 -不要 -不论 -不足 -不过 -不问 -与 -与其 -与否 -与此同时 -专门 -且 -两者 -严格 -严重 -个 -个人 -个别 -中小 -中间 -丰富 -临 -为 -为主 -为了 -为什么 -为什麽 -为何 -为着 -主张 -主要 -举行 -乃 -乃至 -么 -之 -之一 -之前 -之后 -之後 -之所以 -之类 -乌乎 -乎 -乘 -也 -也好 -也是 -也罢 -了 -了解 -争取 -于 -于是 -于是乎 -云云 -互相 -产生 -人们 -人家 -什么 -什么样 -什麽 -今后 -今天 -今年 -今後 -仍然 -从 -从事 -从而 -他 -他人 -他们 -他的 -代替 -以 -以上 -以下 -以为 -以便 -以免 -以前 -以及 -以后 -以外 -以後 -以来 -以至 -以至于 -以致 -们 -任 -任何 -任凭 -任务 -企图 -伟大 -似乎 -似的 -但 -但是 -何 -何况 -何处 -何时 -作为 -你 -你们 -你的 -使得 -使用 -例如 -依 -依照 -依靠 -促进 -保持 -俺 -俺们 -倘 -倘使 -倘或 -倘然 -倘若 -假使 -假如 -假若 -做到 -像 -允许 -充分 -先后 -先後 -先生 -全部 -全面 -兮 -共同 -关于 -其 -其一 -其中 -其二 -其他 -其余 -其它 -其实 -其次 -具体 -具体地说 -具体说来 -具有 -再者 -再说 -冒 -冲 -决定 -况且 -准备 -几 -几乎 -几时 -凭 -凭借 -出去 -出来 -出现 -分别 -则 -别 -别的 -别说 -到 -前后 -前者 -前进 -前面 -加之 -加以 -加入 -加强 -十分 -即 -即令 -即使 -即便 -即或 -即若 -却不 -原来 -又 -及 -及其 -及时 -及至 -双方 -反之 -反应 -反映 -反过来 -反过来说 -取得 -受到 -变成 -另 -另一方面 -另外 -只是 -只有 -只要 -只限 -叫 -叫做 -召开 -叮咚 -可 -可以 -可是 -可能 -可见 -各 -各个 -各人 -各位 -各地 -各种 -各级 -各自 -合理 -同 -同一 -同时 -同样 -后来 -后面 -向 -向着 -吓 -吗 -否则 -吧 -吧哒 -吱 -呀 -呃 -呕 -呗 -呜 -呜呼 -呢 -周围 -呵 -呸 -呼哧 -咋 -和 -咚 -咦 -咱 -咱们 -咳 -哇 -哈 -哈哈 -哉 -哎 -哎呀 -哎哟 -哗 -哟 -哦 -哩 -哪 -哪个 -哪些 -哪儿 -哪天 -哪年 -哪怕 -哪样 -哪边 -哪里 -哼 -哼唷 -唉 -啊 -啐 -啥 -啦 -啪达 -喂 -喏 -喔唷 -嗡嗡 -嗬 -嗯 -嗳 -嘎 -嘎登 -嘘 -嘛 -嘻 -嘿 -因 -因为 -因此 -因而 -固然 -在 -在下 -地 -坚决 -坚持 -基本 -处理 -复杂 -多 -多少 -多数 -多次 -大力 -大多数 -大大 -大家 -大批 -大约 -大量 -失去 -她 -她们 -她的 -好的 -好象 -如 -如上所述 -如下 -如何 -如其 -如果 -如此 -如若 -存在 -宁 -宁可 -宁愿 -宁肯 -它 -它们 -它们的 -它的 -安全 -完全 -完成 -实现 -实际 -宣布 -容易 -密切 -对 -对于 -对应 -将 -少数 -尔后 -尚且 -尤其 -就 -就是 -就是说 -尽 -尽管 -属于 -岂但 -左右 -巨大 -巩固 -己 -已经 -帮助 -常常 -并 -并不 -并不是 -并且 -并没有 -广大 -广泛 -应当 -应用 -应该 -开外 -开始 -开展 -引起 -强烈 -强调 -归 -当 -当前 -当时 -当然 -当着 -形成 -彻底 -彼 -彼此 -往 -往往 -待 -後来 -後面 -得 -得出 -得到 -心里 -必然 -必要 -必须 -怎 -怎么 -怎么办 -怎么样 -怎样 -怎麽 -总之 -总是 -总的来看 -总的来说 -总的说来 -总结 -总而言之 -恰恰相反 -您 -意思 -愿意 -慢说 -成为 -我 -我们 -我的 -或 -或是 -或者 -战斗 -所 -所以 -所有 -所谓 -打 -扩大 -把 -抑或 -拿 -按 -按照 -换句话说 -换言之 -据 -掌握 -接着 -接著 -故 -故此 -整个 -方便 -方面 -旁人 -无宁 -无法 -无论 -既 -既是 -既然 -时候 -明显 -明确 -是 -是否 -是的 -显然 -显著 -普通 -普遍 -更加 -曾经 -替 -最后 -最大 -最好 -最後 -最近 -最高 -有 -有些 -有关 -有利 -有力 -有所 -有效 -有时 -有点 -有的 -有着 -有著 -望 -朝 -朝着 -本 -本着 -来 -来着 -极了 -构成 -果然 -果真 -某 -某个 -某些 -根据 -根本 -欢迎 -正在 -正如 -正常 -此 -此外 -此时 -此间 -毋宁 
-每 -每个 -每天 -每年 -每当 -比 -比如 -比方 -比较 -毫不 -没有 -沿 -沿着 -注意 -深入 -清楚 -满足 -漫说 -焉 -然则 -然后 -然後 -然而 -照 -照着 -特别是 -特殊 -特点 -现代 -现在 -甚么 -甚而 -甚至 -用 -由 -由于 -由此可见 -的 -的话 -目前 -直到 -直接 -相似 -相信 -相反 -相同 -相对 -相对而言 -相应 -相当 -相等 -省得 -看出 -看到 -看来 -看看 -看见 -真是 -真正 -着 -着呢 -矣 -知道 -确定 -离 -积极 -移动 -突出 -突然 -立即 -第 -等 -等等 -管 -紧接着 -纵 -纵令 -纵使 -纵然 -练习 -组成 -经 -经常 -经过 -结合 -结果 -给 -绝对 -继续 -继而 -维持 -综上所述 -罢了 -考虑 -者 -而 -而且 -而况 -而外 -而已 -而是 -而言 -联系 -能 -能否 -能够 -腾 -自 -自个儿 -自从 -自各儿 -自家 -自己 -自身 -至 -至于 -良好 -若 -若是 -若非 -范围 -莫若 -获得 -虽 -虽则 -虽然 -虽说 -行为 -行动 -表明 -表示 -被 -要 -要不 -要不是 -要不然 -要么 -要是 -要求 -规定 -觉得 -认为 -认真 -认识 -让 -许多 -论 -设使 -设若 -该 -说明 -诸位 -谁 -谁知 -赶 -起 -起来 -起见 -趁 -趁着 -越是 -跟 -转动 -转变 -转贴 -较 -较之 -边 -达到 -迅速 -过 -过去 -过来 -运用 -还是 -还有 -这 -这个 -这么 -这么些 -这么样 -这么点儿 -这些 -这会儿 -这儿 -这就是说 -这时 -这样 -这点 -这种 -这边 -这里 -这麽 -进入 -进步 -进而 -进行 -连 -连同 -适应 -适当 -适用 -逐步 -逐渐 -通常 -通过 -造成 -遇到 -遭到 -避免 -那 -那个 -那么 -那么些 -那么样 -那些 -那会儿 -那儿 -那时 -那样 -那边 -那里 -那麽 -部分 -鄙人 -采取 -里面 -重大 -重新 -重要 -鉴于 -问题 -防止 -阿 -附近 -限制 -除 -除了 -除此之外 -除非 -随 -随着 -随著 -集中 -需要 -非但 -非常 -非徒 -靠 -顺 -顺着 -首先 -高兴 -是不是 diff --git a/resources/nltk_data/corpora/stopwords/english b/resources/nltk_data/corpora/stopwords/english deleted file mode 100644 index 1280aa3b..00000000 --- a/resources/nltk_data/corpora/stopwords/english +++ /dev/null @@ -1,179 +0,0 @@ -i -me -my -myself -we -our -ours -ourselves -you -you're -you've -you'll -you'd -your -yours -yourself -yourselves -he -him -his -himself -she -she's -her -hers -herself -it -it's -its -itself -they -them -their -theirs -themselves -what -which -who -whom -this -that -that'll -these -those -am -is -are -was -were -be -been -being -have -has -had -having -do -does -did -doing -a -an -the -and -but -if -or -because -as -until -while -of -at -by -for -with -about -against -between -into -through -during -before -after -above -below -to -from -up -down -in -out -on -off -over -under -again -further -then -once -here -there -when -where -why -how -all -any -both -each -few -more -most -other -some -such -no -nor -not -only -own -same -so -than -too -very -s -t -can -will -just -don -don't -should -should've -now -d -ll -m -o -re -ve -y -ain -aren -aren't -couldn -couldn't -didn -didn't -doesn -doesn't -hadn -hadn't -hasn -hasn't -haven -haven't -isn -isn't -ma -mightn -mightn't -mustn -mustn't -needn -needn't -shan -shan't -shouldn -shouldn't -wasn -wasn't -weren -weren't -won -won't -wouldn -wouldn't diff --git a/resources/nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt b/resources/nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt deleted file mode 100644 index b38862b8..00000000 --- a/resources/nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt +++ /dev/null @@ -1,156 +0,0 @@ -ct -m.j -t -a.c -n.h -ms -p.a.m -dr -pa -p.m -u.k -st -dec -u.s.a -lt -g.k -adm -p -h.m -ga -tenn -yr -sen -n.c -j.j -d.h -s.g -inc -vs -s.p.a -a.t -n -feb -sr -jan -s.a.y -n.y -col -g.f -c.o.m.b -d -ft -va -r.k -e.f -chg -r.i -a.g -minn -a.h -k -n.j -m -l.f -f.j -gen -i.m.s -s.a -aug -j.p -okla -m.d.c -ltd -oct -s -vt -r.a -j.c -ariz -w.w -b.v -ore -h -w.r -e.h -mrs -cie -corp -w -n.v -a.d -r.j -ok -. . 
-e.m -w.c -ill -nov -u.s -prof -conn -u.s.s.r -mg -f.g -ph.d -g -calif -messrs -h.f -wash -tues -sw -bros -u.n -l -wis -mr -sep -d.c -ave -e.l -co -s.s -reps -c -r.t -h.c -r -wed -a.s -v -fla -jr -r.h -c.v -m.b.a -rep -a.a -e -c.i.t -l.a -b.f -j.b -d.w -j.k -ala -f -w.va -sept -mich -n.m -j.r -l.p -s.c -colo -fri -a.m -g.d -kan -maj -ky -a.m.e -n.d -t.j -cos -nev \ No newline at end of file diff --git a/resources/nltk_data/tokenizers/punkt_tab/english/collocations.tab b/resources/nltk_data/tokenizers/punkt_tab/english/collocations.tab deleted file mode 100644 index c7032b7f..00000000 --- a/resources/nltk_data/tokenizers/punkt_tab/english/collocations.tab +++ /dev/null @@ -1,37 +0,0 @@ -##number## international -##number## rj -##number## commodities -##number## cooper -b stewart -##number## genentech -##number## wedgestone -i toussie -##number## pepper -j fialka -o ludcke -##number## insider -##number## aes -i magnin -##number## credit -##number## corrections -##number## financing -##number## henley -##number## business -##number## pay-fone -b wigton -b edelman -b levine -##number## leisure -b smith -j walter -##number## pegasus -##number## dividend -j aron -##number## review -##number## abreast -##number## who -##number## letters -##number## colgate -##number## cbot -##number## notable -##number## zimmer \ No newline at end of file diff --git a/resources/nltk_data/tokenizers/punkt_tab/english/ortho_context.tab b/resources/nltk_data/tokenizers/punkt_tab/english/ortho_context.tab deleted file mode 100644 index a33a3648..00000000 --- a/resources/nltk_data/tokenizers/punkt_tab/english/ortho_context.tab +++ /dev/null @@ -1,20366 +0,0 @@ -coverage 36 -springsteen 4 -sandoz 4 -board-room 32 -eastman 6 -enter 32 -procedure 36 -blowing 32 -competition 46 -sinai 4 -t-word 4 -entangled 32 -clines 12 -seasonally 34 -firing 32 -legal-trade 32 -disruptive 32 -wagner 12 -soften 32 -ex-grenfell 4 -cementing 32 -threats 32 -sleepers 32 -bulgarian 4 -him 96 -citing 42 -phelps-stokes 4 -oglesby 12 -dieter 4 -showman 32 -half-hour 32 -deliveries 32 -monetary 110 -counters 32 -hill 12 -exceed 32 -miscreants 32 -poisoning 32 -neutral 32 -allocated 32 -angry 32 -registers 32 -chose 32 -cardinal 4 -also 110 -magnin 12 -vernacular 32 -rebound 32 -penicillin 32 -behave 36 -restrained 32 -diver 36 -journalists 34 -pops 32 -x-ray 4 -mansion 32 -overhauling 32 -merely 32 -julian 2 -veer 32 -fueled 36 -affordable 32 -amos-lee 4 -longstanding 32 -two-part 32 -warrants 36 -text 32 -aetna 10 -recollections 32 -entirely 32 -washing 32 -mathematical 32 -improvement 32 -reexamined 32 -therefore 34 -month 38 -mover 32 -agreement 36 -shoppers 32 -arabian 4 -cyclical 32 -creepers 32 -solicitation 32 -sculptures 32 -troupe 36 -break-even 32 -expressively 32 -indications 34 -kremlin 4 -czech 4 -darts 4 -worksheets 32 -secondary 40 -diplomat 32 -prescriptions 32 -cream 32 -devote 32 -repertoire 32 -berton 4 -waiting 32 -sculptured 32 -energy-related 32 -amplify 32 -apologetic 32 -indelible 32 -piggyback 32 -dwarfism 32 -respectively 32 -ahp 12 -bis 4 -wkr 8 -bit 32 -blew 32 -pgulf 8 -property-casualty 32 -banded 32 -interviewed 32 -expatriate 32 -branches 32 -superintendent 32 -stinginess 32 -bid 36 -big 46 -enigmatic 32 -attacked 32 -corporate 110 -southeastern 4 -one-fourth 32 -jacobs 4 -entertaining 32 -sbo 12 -sharpest 32 -sbc 4 -pest-resistant 32 -beating 32 -berkowitz 4 -serious 36 -skepticism 32 -shelf 32 -revisions 32 -kravis 4 -ellsworth 4 -intimidate 32 -pearson 6 -swarming 32 -bureaucracy 32 -pmi 8 -final 34 
-speculative 32 -violence 36 -chocolates 44 -purists 2 -shapiro 12 -distillates 32 -saatchi 6 -temptation 32 -immigration 36 -drooping 32 -walk 32 -publishers 6 -waxing 32 -assembles 32 -reimbursed 32 -peculiar 32 -butter 32 -anywhere 32 -hoots 32 -cavalier 4 -banxquote 4 -turkish 4 -buried 34 -contributed 32 -intermittent 32 -omniscient 32 -appalled 32 -moved 32 -electromagnetism 32 -fancy 32 -sorts 36 -uncertainty 32 -front 36 -mcneill 4 -plainly 32 -firmly 32 -relations 38 -plasma 32 -enduring 32 -sayer 4 -stumbling 32 -exit 36 -non-refundable 32 -regan 12 -refrigerators 32 -pieter 4 -recipes 32 -doubtful 32 -sue 36 -undercut 32 -patent 44 -liberalization 32 -alvin 4 -progress 38 -brothers 36 -leaves 36 -revert 32 -lover 34 -handmade 32 -deceiving 32 -kessler 12 -ropak 6 -baytown 4 -attacks 32 -juncture 4 -martha 4 -questioning 32 -loved 32 -wiedemann 8 -acquire 36 -falsified 32 -corrigan 4 -romanticized 32 -stealing 32 -intel 12 -recruited 32 -calculating 32 -berger 4 -quantify 32 -gifts 32 -exuding 32 -bigger 32 -evaluate 32 -distributor 32 -restraining 34 -highlights 40 -madness 32 -budget/tax 32 -popularizing 32 -leningrad 6 -bread-and-butter 32 -groups 36 -leaders 34 -amadou-mahtar 2 -non-profit 32 -ivan 4 -wicklow 4 -logo 32 -devoting 32 -teach 32 -luis 4 -bigger-than-expected 32 -credentials 32 -conglomerate 32 -talmudic 4 -banning 2 -steelmaker 40 -ingratiate 32 -anguish 32 -handelman 4 -clymer 8 -acid-rain 96 -your 42 -concluded 32 -atco 8 -calmly 32 -pleads 32 -restart 32 -swimming 32 -boyish 32 -batchelder 12 -japan-style 4 -westport 4 -nominate 32 -perverse 32 -petrochemicals 32 -sunday 4 -trafficking 32 -epidemic 32 -curtailed 32 -rosenbaum 4 -exist 32 -submarine 32 -stepping 32 -lighthearted 32 -then-standard 32 -backwater 32 -religion 32 -plow 32 -colman 4 -nuclei 32 -plot 32 -full-hearted 32 -visit 38 -oriented 32 -abilities 32 -defeated 32 -ddb 6 -horse 32 -halts 32 -mistakes 32 -descended 32 -requesting 32 -bain 14 -reserved 32 -carolina 4 -allocates 32 -speeches 32 -horst 4 -ollmann 4 -russia 4 -basket 32 -wordperfect 4 -seating 32 -unrealized 32 -mistaken 32 -thoroughly 32 -mdest 12 -oldest 32 -mich. 4 -telephone-utility 32 -depot 4 -methodically 32 -talcs 32 -d.c. 
4 -jason 2 -implicate 32 -models 36 -sorely 32 -barbecue 38 -triton 14 -bias 32 -subatomic 32 -modell 12 -window 32 -self-indulgent 32 -adelman 4 -complicate 32 -favorite 32 -can-sealing 32 -harvest 32 -boil 32 -warner 12 -miti 14 -tax-law 32 -welton 4 -commodity-chip 32 -student-loan 32 -dearly 32 -fiance 32 -restricting 34 -pick 32 -album 32 -topping 32 -hussmann 4 -fabulously 32 -razzmatazz 32 -skeptic 32 -withstand 32 -interfered 32 -plaza 4 -assertion 32 -e.f.ihutton 4 -diversify 32 -patents 32 -tack 32 -poor 38 -devising 32 -baldness 32 -commented 32 -cooperated 32 -personal-computer 32 -necessity 32 -vintage 10 -entitles 32 -hardymon 8 -elusiveness 32 -outgoing 34 -complacent 32 -keflex 4 -adding 34 -punitive 36 -hoechst 4 -percentage-point 32 -safety 36 -flannigan 4 -brakes 32 -collective 32 -mexican 6 -cours-la-reine 4 -frustrated 34 -anatomy 8 -distributes 32 -daring 32 -overpaid 32 -monty 4 -phaseout 32 -nationally 34 -moonlight 4 -primer 4 -surfeit 32 -distractions 32 -billet 4 -back-to-basics 32 -writ 32 -complying 32 -telephoned 32 -fence 32 -hearing 40 -billed 32 -drabness 32 -balding 32 -glamorous 32 -telephones 32 -adapted 32 -campaigner 32 -rubbed 32 -mandate 32 -economy 100 -breast 32 -spectra 4 -sealed 32 -achievements 32 -odds 32 -keyboard 32 -ambassador 32 -keener 32 -lyonnais 4 -registering 32 -ghetto 4 -apply 32 -bronx 4 -sogevalor 4 -hung 36 -sign 44 -plate 32 -sigh 32 -niles 8 -molten 32 -moves 36 -free-fall 32 -landmark 32 -raton 4 -accrediting 36 -investigations 32 -single-state 32 -disarray 32 -mccann-erickson 4 -pensions 32 -consumer-products 32 -horns 32 -bondholders 32 -plugged 32 -earner 32 -investment 102 -uni.a 12 -smallest 32 -ordinary 32 -ffod 4 -connecting 32 -snack 32 -iron-bending 32 -norman 14 -telco 14 -hmos 4 -repurchases 32 -kilowatt-hour 32 -advertise 32 -outfits 32 -chopin 4 -choreographers 32 -oliver 14 -emirates 4 -bestowed 32 -brink 32 -prize 36 -assuredly 32 -modification 32 -campanies 32 -beckoning 32 -princeton 12 -managing 32 -rebutted 32 -soaring 32 -veteran 32 -loudly 32 -tarrytown 4 -green 44 -flagging 32 -greek 6 -intricate 32 -georgia 6 -unprotected 32 -attain 32 -impromptu 32 -naive 32 -francs 32 -greene 4 -rigid 32 -requiring 32 -circulatory 32 -context 32 -million-dollar-a-year 32 -france 14 -greens 36 -browns 4 -accent 32 -pictures 36 -greer 4 -totals 40 -cosby 12 -peabody 12 -sapiens 32 -wafer 32 -program 34 -swelling 32 -cohesive 32 -prison 36 -recycling 32 -size 32 -quitting 32 -hot-strip 32 -troika 32 -strengthened 32 -hanff 4 -tavris 4 -unissued 32 -mental 32 -oscher 4 -fearsome 32 -arguello 4 -pre-game 32 -would 102 -packing 32 -schoenberg 4 -butler 12 -considering 34 -threshold 32 -persuasion 4 -lessons 32 -cordis 14 -unwind 32 -evades 32 -mascot 32 -joked 32 -telecom 14 -french 14 -thorns 4 -pre-selected 32 -constance 12 -jackman 4 -logical 32 -pressing 32 -competent 32 -bluhm 12 -scenario 32 -sold 100 -sprees 32 -incredible 32 -charities 32 -rigorous 32 -auto-receivables 64 -regains 32 -gradual 32 -dark-skinned 32 -consultations 32 -endurance 32 -s.a.y. 
8 -drug-delivery 32 -japanese-style 4 -platinum 34 -insignificant 36 -moderate-income 32 -whatsoever 32 -sweaty 32 -interviewing 32 -raged 4 -myth 32 -household 38 -drawbacks 32 -bluhdorn 4 -twisted 32 -nicotine-stained 32 -conventions 32 -shoots 32 -place 36 -data 46 -pre-boesky 32 -lenses 32 -pre-reagan 32 -unnamed 32 -weaken 32 -experience 32 -dqu 8 -weaker 32 -kleinwort 6 -interest-rate 32 -horizon 32 -sole 32 -tweedledum 4 -commuted 32 -automatic-focus 32 -tension 32 -plows 32 -climbed 36 -poaching 32 -disregard 32 -ginger-haired 32 -flimsy 34 -moorman 8 -acknowledging 40 -aids-related 4 -kremlins 32 -invent 32 -sleeping 36 -koryagin 12 -low-profile 32 -niche 36 -climber 32 -moralists 32 -mcqueen 12 -efforts 102 -ribs 32 -overdraft 32 -swenson 8 -persistently 32 -appointed 32 -plasminogen 32 -cato 4 -harvey 12 -imported 32 -ammonium 32 -constructive 32 -coaxing 32 -progresses 32 -neurosurgical 32 -birds 32 -nazis 4 -offspring 32 -television 102 -donoghue 4 -weldon 8 -cliche 32 -documentation 32 -points 32 -dettmar 4 -half-dozen 32 -bor 8 -prefers 32 -underscores 32 -tranquilizer 32 -boy 32 -proving 34 -diplomacy 32 -rollbacks 32 -tomasson 12 -bob 38 -adult 32 -bon 4 -fuel 36 -microchips 40 -bragg 12 -traditions 32 -nigger 32 -supplant 32 -hope 36 -hutton 14 -probation 32 -adaptation 32 -sdi 4 -co-produced 32 -overreaction 32 -gillis 4 -friedman 12 -archaeologists 32 -quadrupled 40 -gamble 36 -intentional 32 -enemy 36 -where 38 -evenly 32 -emotionally 32 -paltry 32 -experimenting 32 -notification 32 -recapture 32 -underscored 32 -milling 36 -wanb 8 -finch 4 -tuesday 4 -wang 14 -efac 4 -anti-consumer 32 -ryan 4 -forbes 12 -subpeona 32 -banana-republic 32 -want 36 -lacroix 12 -skeptical 32 -weather 32 -single-a-1 32 -probable 32 -single-a-3 32 -generously 32 -exclusive 32 -striving 32 -variously 32 -practically 32 -hunters 36 -tire-worker 32 -aerospace 108 -passengers 32 -impression 32 -spate 32 -learns 32 -industrialized 32 -viruses 32 -juvenile 36 -tien 4 -four-year 32 -holyoke 4 -mexicans 6 -frydenlund 12 -co-founder 32 -payment 36 -weapon 32 -fairer 32 -boomlet 32 -top-grade 32 -currency-stabilization 32 -amendment 36 -number-crunching 32 -direct-sales 32 -sketch 32 -newspaper 32 -pricked 32 -culbro 4 -hands 32 -been 36 -volatile 32 -solvents 32 -refute 32 -incensed 32 -handy 40 -giving 40 -rate 108 -remove 32 -simple 96 -valid 32 -compensation 32 -simply 32 -usual 36 -funeral 36 -backer 32 -carving 32 -ambient 4 -costumes 32 -war-vintage 4 -write-in 32 -redemptions 4 -fared 32 -yugoslavia 4 -geographic 32 -chance 32 -gamse 4 -tabor 12 -self-evaluation 32 -double-a 32 -webb 12 -microchip 32 -spinach 32 -tdy 12 -displays 32 -reelection 32 -items 32 -resurgence 32 -shielding 32 -charity-ball 32 -rubric 32 -meanwhile 46 -installment 32 -rhi 4 -dls 8 -nasty 34 -mx 4 -singing 36 -prostitution 32 -clarity 32 -loan 100 -ml 8 -mecca 32 -close-knit 32 -pecora 4 -u.s.-canadian 4 -luv 8 -bioscience 36 -accepting 32 -load 46 -vast 32 -delfzijl 4 -frx 4 -mug 32 -construction 44 -restricted 32 -carlucci 4 -reduce 36 -epton 4 -remark 32 -peggy 4 -graphics 36 -instance 32 -clemenza 4 -echoed 32 -non-ibm 32 -hospitalization 32 -boogie 32 -cowbells 32 -requirement 36 -jeff 6 -roadways 32 -coca-cola 14 -mishima 4 -men 36 -graphic 6 -spoiler 32 -enteprises 4 -m2 4 -m3 6 -m1 4 -m. 
12 -short-term 38 -debt-backed 32 -ludcke 8 -gypsum 36 -manna 32 -admiral 36 -perform 32 -blundered 32 -fillings 32 -songs 32 -dosage 32 -associated 38 -nashville 14 -truckloads 32 -wound 32 -reorganized 96 -ohio 4 -ranches 32 -sends 32 -hanging 34 -europe 14 -reading 38 -alden 12 -conventional-mortgage 32 -profound 32 -success 32 -medium-duty 32 -parts 100 -thoughout 32 -periodic 32 -five-country 32 -something-for-everyone 32 -salvaging 32 -sumo 4 -scrambling 32 -gleam 32 -long-distance 98 -resembles 32 -vandenburg 4 -unsolicited 32 -seafood 32 -envelopes 32 -color 36 -contraceptive 32 -respond 32 -anyhow 32 -sings 32 -ampersand 4 -extinguishment 32 -farm 46 -imperial 36 -salad 32 -terrace 4 -fare 32 -prides 32 -russell 6 -farming 32 -ahe 12 -memorably 32 -accuses 32 -howes 4 -monitor 36 -brazil 14 -flamingo 4 -cyprus 14 -artillery 32 -waited 32 -memorable 32 -reed 12 -surmises 32 -operational 32 -architectures 32 -pr 4 -accused 32 -contrary 32 -shave 32 -no-tax 32 -replaced 32 -annually 34 -swing 40 -r.i. 4 -private-ownership 32 -toughen 36 -celanese 4 -schlender 8 -blends 32 -tougher 32 -unaffiliated 32 -vira 4 -claudio 4 -expansion 32 -alma 36 -cyclops 14 -moore 12 -saxton 12 -anna 4 -revolving 32 -hostage 32 -fringe 32 -exacerbate 32 -specialists 32 -yellows 32 -watches 32 -capitalized 32 -greeted 32 -stands 32 -pettee 12 -signed 96 -longer 36 -el-barajneh 32 -habits 32 -socialists 4 -misused 32 -smuggled 32 -dreamed 32 -reptiles 32 -appointees 32 -watched 32 -mollusks 32 -reducing 36 -supervise 32 -disrupting 32 -tax-favored 32 -ireland 14 -liabilities 32 -oskar 4 -amenities 32 -afternoons 32 -beginners 4 -fail 36 -box-office 32 -unambiguous 32 -wounded 32 -comptroller 44 -trays 32 -lingus 4 -scrutinizes 32 -below 36 -incompetent 32 -yells 32 -mhp 8 -isbell 4 -hardgoods 4 -interprets 32 -caught 32 -attractively 32 -accosted 32 -insisted 32 -mhc 12 -wolverine 4 -trailing 32 -outpaced 32 -coursed 32 -forming 32 -inability 32 -vergennes 4 -suzdal 4 -miami 14 -steady 32 -compromising 32 -shore-grown 32 -sullivan 12 -chemical-physics 32 -ethnicity 32 -boardroom 4 -hull 32 -standard 46 -predicted 32 -well-informed 32 -rivals 36 -stutters 32 -choreography 32 -income 36 -parks 32 -gerlach 8 -oslo 10 -agriculture 36 -fairfield 12 -telex 32 -premium 32 -infringed 32 -franchised 32 -life-insurance 32 -espionage 32 -bur 8 -belgian 4 -contending 32 -heinike 4 -sharky 32 -bellwether 96 -fining 32 -violate 32 -ambiguities 32 -deflator 32 -paragraphing 4 -forgiven 32 -schrader 12 -amazement 32 -p. 4 -eugenia 4 -ltv 4 -particularly 34 -industrial-systems 32 -film-making 32 -recoup 32 -embarrass 32 -semester 32 -dramatized 32 -co-managing 32 -imports 102 -shine 32 -swung 40 -so-called 32 -pre-recorded 32 -infected 32 -icx 8 -icy 32 -laundering 32 -supply-side 32 -bailing 32 -scrutinize 36 -transmission 32 -leisure 40 -dramatizes 32 -bay-based 4 -camouflage 32 -icn 14 -upon 32 -previous 34 -truth 32 -bearish 32 -off 36 -pianist 2 -swallow 32 -plenty 32 -leaked 32 -campus 36 -remaining 32 -hype 32 -delayed 32 -lt. 
4 -grousing 32 -hunter 36 -write-downs 32 -nano 4 -processor 32 -sedona 4 -opportunities 32 -neared 32 -frontrunners 32 -wrath 32 -laurie 4 -poison-pill 32 -commandos 32 -duthie 4 -drug-industry 32 -engine 36 -conversion 32 -bargain-hunting 32 -strained 32 -dependency 32 -sun-baked 32 -boczek 4 -promote 36 -little 46 -juan 4 -berrill 4 -home-dish 32 -ruled 32 -policies 36 -deadpan 32 -inflicted 32 -affable 32 -carlough 8 -ruler 32 -rules 36 -seventh 34 -melt-through 32 -tapping 32 -aircruisers 6 -gulf 46 -borrowing 32 -stings 32 -schuster 4 -protection 36 -prime-3 4 -prime-1 4 -refinanced 32 -insulated 32 -kneale 4 -those 46 -family-life 32 -conversation 32 -aesthetic 32 -weaver 12 -ah 2 -personal 32 -decorated 32 -boys 32 -sleep 36 -rustler 4 -grace 46 -reneged 32 -abalone-lover 32 -broker-adviser 32 -strengths 32 -halls 32 -re-elected 32 -plaintiffs 32 -discretionary 32 -vfc 8 -substantive 32 -decentralize 32 -australasia 4 -pen 32 -periods 32 -aspect 32 -granting 32 -ride 36 -bury 32 -polish 36 -imminent 32 -incredulous 32 -cooperate 32 -trust 100 -pep 40 -statistics 44 -w.va. 4 -danvers 4 -feasible 32 -refining 32 -odor 32 -rats 32 -frost 32 -securties 4 -f-20 4 -frost-retarding 32 -kamin 4 -deeds 32 -charles 14 -japan 14 -classic 36 -dissident 32 -pre-approved 32 -guarantee 32 -airline 40 -disproportionately 32 -loves 32 -playing 40 -losing 32 -food-processing 32 -live 38 -sustain 32 -incessant 32 -backup 32 -visiting 32 -theme 36 -exempts 32 -incontrovertible 32 -transworld 6 -wps 8 -filling 96 -prudent 32 -abandoning 32 -implications 32 -shackled 32 -loophole 32 -treading 32 -innkeeper 32 -fig 32 -victimized 32 -entrenched 32 -helicopter 32 -staunchly 32 -cujo 4 -fin 4 -jog 32 -camouflaging 32 -fit 32 -venal 32 -abound 36 -ocean 40 -confidence 36 -fis 8 -fir 36 -complex 36 -space-launch 32 -wants 96 -pastore 4 -fix 32 -movies 108 -little-known 32 -cheap-labor 32 -cbot 12 -fourth-quarter 110 -litigation 32 -refreshing 32 -relman 12 -up&l 6 -year-round 32 -austin 6 -suitcase 32 -mideast 4 -cboe 12 -chips 100 -dioxide 32 -u.s.-italian 4 -treated 32 -stringers 32 -funneled 32 -bestowal 32 -dreams 36 -circulated 32 -recalculated 32 -privy 32 -easier-to-use 32 -mid-teens 32 -oncor 2 -theoretical 32 -translates 32 -acceleration 40 -centre 4 -enchant 32 -margin 32 -much 102 -academicians 32 -medicine 36 -challenge 32 -react 36 -housing-finance 32 -admit 36 -appearances 32 -rebuttal 32 -shaib 4 -distortions 32 -bombed 32 -riled 32 -volume 46 -great 46 -alienating 32 -interest-only 32 -tightly 32 -territory 32 -mirror 36 -chart 32 -standards 36 -toughened 4 -hell 36 -objective 32 -motorist 32 -natchez 4 -intemperate 32 -advising 32 -held 44 -udf 4 -brewers 32 -redress 32 -ticketed 32 -reverses 32 -coloreds 32 -two 110 -tremendously 32 -help 36 -fargo 4 -journal/europe 4 -lemon 32 -ingersoll 4 -fledgling 32 -substances 32 -boveri 4 -rebels 36 -antics 32 -tfb 8 -big-time 32 -lovastatin 34 -halcion 4 -swell 32 -competence 32 -bashful 32 -bromberg 12 -upturn 32 -abreast 40 -budweiser 4 -t.pwf 4 -afflicted 32 -reached 38 -someday 32 -delaying 32 -trusts 32 -influenced 32 -overdrafts 32 -citizen 36 -directors 110 -squish 32 -magic 36 -large-screen 32 -eased 34 -reads 32 -indeed 42 -middlemen 32 -insider 46 -generate 32 -listened 32 -disposed 32 -determination 32 -arentsen 8 -express-parcel 32 -save 36 -builder 32 -wyden 4 -guns 32 -helpless 32 -campeau 4 -gunn 12 -settlements 36 -portions 36 -midst 36 -seemed 32 -all-natural 32 -recruiter 32 -monoclonal 32 -marketplace 32 -telegram 32 -raking 32 
-czyrek 12 -columbia-based 4 -rejects 36 -workman 36 -three-hour 32 -due 108 -fine 38 -stacks 32 -sentimental 32 -ala.-based 4 -dun 12 -softwood 96 -grisdela 8 -bowl 4 -pga 8 -consumers 38 -elders 36 -legislators 32 -freezing 32 -walter 14 -steam 32 -pgn 8 -housing 102 -baby-boom 32 -calamari 32 -refuted 32 -financing 44 -reimbursement 32 -odds-makers 32 -steak 4 -deleted 32 -d 44 -quickie 32 -langdon 4 -fierce 32 -enhanced 32 -takeover-trading 32 -population 100 -unabashedly 32 -supplement 32 -levin 14 -marketer 32 -intimately 32 -equally 34 -digger 4 -disadvantage 32 -monica 4 -jacqmin 8 -peterson 4 -jean-pierre 4 -warplanes 32 -broad-based 32 -burdines 4 -t.pow 12 -starts 36 -lyric 32 -corn 36 -weekday 32 -injected 32 -three-cent 32 -lifting 34 -suit 44 -dependence 96 -core 32 -processed 32 -hauck 4 -j.p. 14 -machine-tool 32 -critic 32 -lingo 32 -paths 32 -slots 32 -high-level 32 -dank 32 -molecule 32 -steel-production 32 -vowing 32 -passenger 32 -corp 4 -indexed 32 -painful 34 -carey 4 -cause 32 -carew 8 -did 38 -weinberg 8 -reorganizations 32 -r.k. 4 -across 36 -indexes 36 -helping 32 -mcgraw-edison 4 -fifth 38 -snowstorm 32 -cared 32 -isgur 4 -jersey 4 -rohstoff 4 -kills 32 -ferns 32 -well-managed 32 -watson 12 -streeter 4 -floury 32 -medicines 32 -renovation 32 -scores 38 -scorer 32 -substitute 32 -tomatoes 32 -alicia 4 -monarch 14 -convincing 32 -municipalities 32 -movie 36 -scored 32 -seattle 12 -temper 32 -result 32 -corrugated 32 -creep 32 -creer 4 -lens 32 -enhances 36 -biotechnology 36 -occupied 34 -ordinance 32 -cqx 4 -stint 32 -doubted 32 -ramshackle 32 -delicately 32 -non-family 32 -energy 46 -urgently 32 -dutton 12 -archly 32 -creek 4 -goose 32 -defections 32 -improves 32 -backlash 32 -stanton 4 -splashy 32 -specimens 32 -texcel 10 -auto 102 -tendency 32 -scripting 32 -in-state 32 -polystyrene 32 -arc 8 -rte 10 -relocate 32 -employees 38 -blue-eyed 32 -improved 32 -eyeing 32 -reviving 32 -one-quarter 32 -entregrowth 6 -deteriorate 32 -mastermind 32 -narration 32 -ranging 32 -flooding 32 -reappeared 32 -malfitano 12 -forsythe 12 -swirly 32 -accordance 32 -maladies 32 -whr 8 -share-trading 32 -mature 32 -first-ever 32 -up-and-coming 32 -specialization 32 -senate 14 -pretty 32 -electric 110 -programs 36 -solids 32 -fined 32 -angolan 6 -showed 36 -ramada 12 -unseated 32 -yen-dollar 32 -constituencies 32 -frontier 36 -counts 96 -applause 32 -watch 38 -straits 4 -thou 4 -bare 2 -fines 32 -syntech 14 -introduction 38 -ceiling 36 -oreffice 8 -rye 4 -light-years 32 -westin 6 -ovens 32 -shares 46 -giveaway 32 -laura 4 -large-denomination 32 -small-time 32 -provisional 32 -copyrights 32 -individuals 34 -vs. 
32 -toes 32 -safe 32 -shared 32 -congregations 32 -rubel 8 -discourage 32 -peking 8 -fiefdom 32 -diversifications 32 -unraveling 32 -luxury 32 -retrench 32 -aircraft 100 -bistro 32 -rubes 4 -search 44 -dive 32 -mercurial 32 -endorsing 32 -scotland 4 -wrested 32 -madonna-meets-the-duchess-of-windsor 4 -lachenbruch 4 -judd 4 -problems 40 -hynes 12 -eurodollar 14 -vortmann 8 -x-%chg 32 -retooled 32 -tinged 32 -illegality 32 -redesignation 32 -polaroid 14 -soul 32 -refusal 32 -silently 32 -motivated 32 -tailored 32 -arrives 32 -herwitz 4 -sour 32 -dimensionless 32 -soup 36 -aaron 4 -muller 12 -hear 32 -post-it 4 -lawmakers 34 -dense 32 -basing 36 -truck-assembly 32 -daisy 4 -boldly 32 -delays 40 -termed 32 -brokerage 98 -christian 4 -controversies 32 -comic 32 -kuan 4 -usair 14 -cline 8 -crankshaft 32 -shamrock 14 -compensate 32 -salamon 4 -minicomputers 32 -contradictory 32 -shedding 32 -provincial 32 -lear-siegler 4 -cooker 32 -handkerchief 32 -rel 12 -located 32 -explain 32 -genenchem 4 -controllers 32 -reactivated 32 -boards 32 -untraveled 32 -devastating 32 -renamed 32 -microscopic 32 -swedish 14 -cheese 32 -newsman 32 -inexpensive 32 -rescue 32 -royce 4 -seidman 12 -pathology 32 -mechanicsburg 8 -torrid 32 -inescapable 32 -northeast 36 -shattered 32 -emerson 14 -honeywell 6 -assure 32 -anglican 4 -springfield 4 -panasonic 4 -chamberlain 12 -midcon 4 -friend 32 -less-powerful 32 -appraisal 36 -void 32 -strides 32 -large 36 -fruits 32 -holiday-shortened 32 -non-voting 32 -overheated 32 -ricupero 4 -emporia 12 -unfriendly 40 -advances 38 -estimates 96 -breaded 32 -seattle-based 4 -fugitive 32 -estimated 34 -advanced 46 -furthering 32 -wellington 8 -litan 8 -captured 32 -educators 32 -hydraulic 32 -countries 36 -westinghouse 4 -manages 32 -manager 96 -genicom 6 -detectors 32 -needn 32 -environments 32 -formula 32 -inspectors 32 -replenished 4 -tiaa-cref 4 -diminutive 32 -element 32 -midday 32 -contractor 32 -fantastic 32 -public-works 32 -negotiator 32 -preface 32 -throes 32 -marines 4 -privatization 32 -salute 32 -rodeo 36 -james 14 -associations 32 -spielberg 12 -limbo 32 -exercising 32 -chalmers 4 -hollis 8 -placement 36 -appeasing 32 -smiths 4 -fanfare 32 -wry 32 -roosevelt 4 -butcher 36 -punjab 4 -iroquois 4 -billy 4 -humphrey 4 -foolproof 32 -incorrect 32 -gadhafi 12 -mess 32 -housewife 32 -income-producing 32 -language 32 -textron 10 -newly 32 -pervading 32 -toiling 32 -westamerica 14 -post 36 -pair 32 -kwon 8 -disagreeable 32 -subsides 32 -gangster 32 -foreseeable 32 -exiting 32 -hides 32 -scrupulous 32 -myself 32 -pose 36 -spook 32 -delicacy 32 -toxic-waste 32 -graceful 32 -oversaw 32 -pasadena 12 -phenomenal 32 -satirist 32 -gallery 36 -erase 32 -parent 100 -morella 4 -liquidation 32 -severance 32 -casually 32 -conduits 32 -shortening 32 -fallacies 32 -practices 36 -filming 32 -lady 36 -tags 32 -dictatorial 32 -breakers 32 -calif. 
4 -dubroc 4 -sponsor 32 -tunnel 32 -occurring 32 -afterward 32 -verdict 32 -breakfast 36 -damages 32 -danse 32 -tenancy 32 -positioned 32 -massaged 32 -course 32 -who 108 -fortress 32 -content 32 -adamant 32 -constituting 32 -yuppie 36 -industrial-talc 32 -erich 8 -enforces 32 -practicing 32 -bookkeeping 32 -fired 32 -overcall 32 -commissars 32 -fraudulent 32 -goal 36 -loynd 8 -unit-trust 32 -supervisor 32 -striking 34 -variation 32 -goat 32 -macedo 4 -enforced 32 -promised 32 -symbols 32 -defuse 32 -psychiatrists 32 -courier-journal 8 -therapy 32 -sandwich 32 -schwartz 4 -hinton 4 -guyford 8 -ian 4 -suffering 32 -wrongful-discharge 32 -hubs 32 -redoute 4 -contingency 32 -presidente 4 -calculated 32 -thin 32 -chiles 12 -semiconductor-manufacturing 32 -sick 12 -bucks 32 -this 110 -wrongdoers 32 -wilfred 4 -bulging 32 -glasnost 2 -baritone 32 -hoover-dempsey 4 -emergence 32 -responses 32 -baltimore-washington 8 -flag 32 -recommendations 32 -institutional 102 -fca 4 -union-busting 32 -fires 64 -schroder 4 -crazy 38 -rudin 4 -cologne 4 -flak 32 -mom-and-pop 32 -getting 38 -flat 36 -provoking 32 -barring 36 -craze 32 -intuitively 32 -optional 32 -crickets 32 -target-company 32 -establishes 32 -defrauded 32 -transaction 36 -threatening 32 -feeling 32 -blossoms 32 -southwestern 32 -unjustified 32 -freeman 12 -alexandria 4 -beginnings 32 -polonsky 4 -affiliate 32 -nonpublic 32 -unlikely 32 -cherry 8 -kotlowitz 4 -flamboyant 32 -scoring 32 -passages 32 -all-knowing 32 -t.fcy 4 -protests 32 -ashore 32 -loewi 4 -character 32 -neutralist 32 -silversmiths 32 -impedes 32 -chillicothe 4 -wage-price 32 -bovine 32 -pa. 4 -chromed 32 -plus 36 -insurer 36 -ambiguous 32 -non-financial 32 -maybe 38 -turnout 34 -delta 14 -joints 32 -muscle 32 -sure-fire 32 -blunt 36 -nursery 4 -air-cargo 32 -spoiling 32 -taxpayers 36 -lt 8 -overregulation 32 -trident 4 -bright 32 -drastic 32 -usable 32 -hazardous 32 -atico 14 -begin 96 -chaotic 32 -le 34 -different 40 -specially 32 -multibillion-dollar 32 -deregulating 36 -steering 32 -provider 32 -lo 32 -abbreviation 32 -waste-water 32 -staley 8 -oiler 32 -pac 4 -prodigious 32 -mammoth 36 -kaplan 12 -assisting 32 -horse-racing 32 -doldrums 36 -dicey 32 -theory 32 -car-assembly 96 -big-college 32 -pat 4 -pas 4 -par 32 -funny 32 -pap 4 -question 32 -cook 4 -pleasurable 32 -pay 36 -supermerchant 32 -dissemination 32 -thrilled 32 -poland 6 -trading 46 -misunderstood 4 -coupled 42 -generators 36 -ambulatory 32 -currency-exchange 32 -thriller 32 -vegetable 32 -now-dormant 32 -merchant-banking 32 -various 34 -eye 32 -lawsuits 38 -seneca 4 -superannuation 4 -purposes 32 -reimposition 32 -unleashing 32 -bodies 32 -rouged 32 -f-100 4 -grieux 4 -deliberate 32 -passing 32 -quirky 32 -looming 32 -baldrige 4 -drums 32 -meritless 32 -cnn 4 -compulsory 32 -collectively 32 -falconbridge 4 -runoff 32 -yelled 32 -hormone-treated 32 -bros 4 -lindley 4 -laureate 32 -victory 32 -checking 32 -match 32 -action-forcing 32 -well-capitalized 32 -recent 42 -sanford 12 -arrested 38 -hand-wringing 32 -burry 8 -frantic 32 -residents 32 -fun 36 -pared 4 -residential 34 -neronian 4 -rko 4 -capture 32 -deadlines 32 -clients 36 -fourteen 2 -testified 32 -redeem 36 -pre-trial 32 -hunt 44 -have 100 -simultaneously 32 -testifies 32 -baked 32 -sagging 32 -iraqis 4 -one-cent-a-share 32 -prevost 4 -deux 4 -mestizo 32 -rail-car 32 -rican 4 -breasts 32 -abington 4 -outdo 32 -generals 32 -staffer 32 -sumitomo 4 -savvy 32 -still 42 -gracefully 32 -contributing 34 -rather 38 -negatives 32 -rewritten 32 -public-sector 32 
-agency 46 -nicest 32 -players 36 -wabc-tv 4 -inconclusive 32 -expected 36 -oresman 8 -obliquely 32 -innovation 32 -csx 14 -northrup 4 -getty 12 -calny 14 -dryden 8 -stimulants 32 -purcell 4 -near-infrared-spectroscopy 32 -brings 32 -descendant 32 -rifadin 4 -forth 32 -treats 32 -f.j. 4 -dirks 4 -cen 8 -treaty 36 -weekly-average 32 -roughly 32 -demonstration 32 -expense-reduction 32 -mcgaw 4 -montera 4 -maneuver 32 -rethink 32 -malted 32 -overshot 32 -mulroney 12 -prolong 32 -educate 32 -wars 36 -realities 32 -entrance 32 -discoloring 32 -wary 40 -of 126 -mcgegan 4 -enlightened 32 -ward 44 -water 36 -research 102 -clemens 8 -hedging 32 -warm 32 -warn 32 -montero 4 -home-mortgage 32 -contracted 40 -baer 4 -mamelodi 4 -reproductive 32 -pastel 32 -males 32 -pledging 8 -asian-based 4 -descending 32 -entrepeneurial 32 -walker 4 -poindexter 12 -computer 38 -on 110 -zealously 32 -portion 32 -seized 32 -plastic 32 -nashua 12 -indisputable 32 -uncomfortably 32 -coordinated 32 -promotes 32 -promoter 32 -soldering 36 -exceptional 32 -sell 36 -scrawled 32 -uncomfortable 32 -michael 14 -stockyards 4 -gorki 4 -confections 32 -keynes 8 -attack 40 -attach 32 -sgic 4 -intentionally 32 -alexanders 4 -prohibited 32 -plunges 4 -ashton 4 -bridesmaid 32 -under 110 -constructed 32 -fire 36 -suspense 32 -pool 32 -nabisco 4 -useful 32 -plunged 36 -involving 44 -atlanta 14 -belding 4 -tact 32 -review 44 -lessened 32 -chases 32 -rosenfeld 4 -proliferating 32 -tent-meeting 32 -establishments 32 -response 32 -went 96 -discotheques 32 -vehicle-leasing 40 -private 34 -skimp 32 -preventive 36 -flurry 32 -floyd 6 -sir 46 -chased 32 -cruise 36 -distributing 32 -wyomissing 8 -change 36 -wallace 12 -convertibles 4 -educational 36 -yacht 4 -six-to 32 -sayad 4 -arnold 14 -publicizing 32 -weathered 32 -levittowns 4 -swamp 32 -practical 32 -customer 32 -tortuously 32 -crematories 32 -julius 6 -anthropologists 32 -reagan-proposed 4 -downsizing 32 -less-upholstered 32 -disgrace 40 -arrival 32 -nonferrous 32 -public-broadcasting 32 -speak 32 -tripled 32 -contemporary 32 -dilute 32 -genres 32 -maintains 36 -spear 32 -theatricality 32 -thicker 32 -distinguish 32 -vow 32 -spoke 32 -seems 32 -ky.-based 4 -once-thriving 32 -economists 46 -independent 34 -employee-benefits 32 -curacao-registered 4 -buchanan 12 -ailing 36 -ef-18a 4 -payoffs 32 -closet 32 -washington 14 -lipman 4 -filthy 4 -fine-tuned 32 -tomihiro 4 -aryan 4 -kong-based 4 -representative 36 -dubbed 32 -upham 4 -missions 32 -execute 32 -leakage 32 -neighbor 32 -renters 32 -discriminatory 32 -general-purpose 32 -atlasair 4 -pharmacy 4 -nine 38 -smoke 32 -colony 32 -nassau 4 -controversial 32 -external 32 -leanest 32 -protest 36 -unannounced 32 -covert 36 -worldwide 36 -confesses 32 -pacemaker 96 -pullman 12 -ecology 4 -likens 32 -lawsuit 32 -prognosticators 32 -brown-forman 4 -palestine 4 -conclusive 32 -distiller 32 -stereos 32 -hampshire 4 -lock 32 -missouri 4 -consciences 32 -reaction 32 -project 36 -c.v. 4 -counterpoint 32 -dependent 32 -roux 12 -macintyre 4 -clyde 4 -etched 32 -keep 36 -fresno 4 -choosing 32 -atom 36 -arson 32 -installing 32 -tycoons 32 -directors-and-officers 32 -possessed 32 -nsc-directed 4 -upscale 32 -c. 
6 -co-head 32 -symmetrical 32 -thursday/sunday 4 -garcia 8 -municipal 36 -actors 32 -channell 12 -likenot 32 -perspective 36 -yuppies 32 -botlek 4 -two-year 32 -apartheid 32 -laying 32 -debacle 32 -major-league 32 -graft-vs.-host 32 -alumni 32 -cropping 32 -midrange 32 -richard 14 -left-wing 32 -intensive 32 -causes 32 -cq 8 -worker 36 -suskind 12 -e-6a 4 -call 34 -neatly 32 -cz 8 -mid-1960s 32 -berth 32 -co-artistic 32 -oriental 4 -exotic 32 -worked 32 -ms. 14 -believing 32 -parry 8 -agenda 32 -cl 8 -well-received 32 -filled 32 -revision 32 -setting 32 -majorities 32 -complications 32 -compounded 32 -thiokol 4 -molders 6 -diving 34 -pmsc 4 -activator 32 -following 42 -door-to-door-delivery 32 -rekindle 32 -deterrent 32 -jurors 32 -i`ve 4 -wpob 4 -nasa 2 -downey 4 -much-larger 32 -partners 36 -inhabit 32 -pandora 4 -caused 32 -p.a.m. 4 -henry 14 -marred 32 -golden 44 -arbitrage 36 -inflict 32 -acquiescent 32 -orben 4 -ginning 32 -tuxedos 32 -pre-dated 32 -bedford 12 -deteriorated 32 -fortifying 32 -midgets 32 -b.a.it 8 -hagan 8 -pessimism 32 -nonexistent 32 -bottlers 32 -death 38 -checkers 4 -transportations 4 -arrange 32 -simi 4 -portland 14 -fiber 32 -baucus 12 -flindt 4 -recorded 32 -alasdair 4 -black-and-white 32 -gutzwiller 8 -ratio 32 -ogorodnikov 4 -counting 34 -arriving 32 -elimination 32 -underwrite 32 -drinks 32 -determine 32 -curtailment 32 -recorder 32 -money-market 4 -israelis 4 -osha 4 -glazer 12 -cash 38 -high-tech 96 -consensus 32 -lurked 32 -mitsubishi 4 -cycle 32 -weston 4 -drain 32 -stirred 32 -winegardner 12 -wonders 32 -bollinger 10 -concerned 32 -anti-japanese 32 -vanoff 4 -purchasers 34 -toga 32 -cultural 32 -shipbuilding 36 -philosophy 32 -masse 32 -financial-aid 32 -hayden 4 -priced 98 -societal 32 -elsinore 2 -continuous 32 -doorstep 32 -shirt 32 -ski 32 -miniseries 32 -malvern 4 -trained 34 -sporadic 32 -themselves 32 -valentine 12 -unsold 32 -goings-on 32 -disproportionate 32 -salvo 32 -raiders 32 -viewers 38 -blame 34 -corresponds 32 -eastbound 32 -enga 4 -bombs 32 -gameshow 8 -notified 96 -price-slashing 32 -things 34 -repeatedly 32 -robins 14 -tentatively 96 -linkup 6 -unknowable 32 -creditors 36 -iran 6 -kolber 8 -communications-based 32 -hammered 32 -gets 44 -succession 32 -angle 32 -cameras 38 -low-sulfur 32 -deandome 4 -service-company 32 -record-keeping 32 -caw 4 -leads 96 -snowy 32 -jarring 32 -bayh 4 -iraq 6 -untenable 32 -crows 32 -serves 32 -regular 40 -pre-payment 32 -fremantle 4 -norris 4 -minute 32 -jamie 2 -motives 32 -aids-treatment 4 -deny 32 -bradstreet 4 -elmwood 8 -ak-47 4 -tra 4 -non-oil 36 -out-of-court 32 -bs 8 -heir 32 -screen 36 -bt 4 -two-thirds 32 -separation 32 -by 126 -nondurable-goods 32 -benched 32 -economist 40 -tumbling 32 -mile 36 -bb 4 -ba 12 -bg 4 -be 108 -satisfaction 32 -bk 8 -mill 36 -leased 32 -interstate 46 -bn 8 -milk 32 -hve 8 -clever 32 -monarchs 32 -plumbing 32 -sticker 32 -friction 32 -pro-rata 32 -est 12 -segundo 4 -mckanic 4 -co-op 32 -flexi-van 4 -sending 36 -anti-climactic 32 -vatican 4 -optic 32 -partly 34 -controversy 32 -regrouping 32 -brio 8 -orbiting 32 -supporters 96 -shafts 32 -take 102 -per-minute 32 -slight 32 -minds 32 -sandinista 4 -rim 4 -rio 12 -car-parts 32 -vastness 32 -undetermined 32 -mcmxxv 4 -radio-television 4 -abalone 38 -confirming 32 -kerley 8 -beaver 4 -standstill 32 -distribute 32 -raburn 4 -arms-sales 32 -city-council 32 -crummy 32 -accidental 32 -space 36 -panicky 32 -bethlehem 12 -credit-research 32 -prime 38 -reimposed 32 -slash 32 -exhibitions 32 -pipeline 32 -woodville 8 
-splintered 32 -monopoles 34 -representations 32 -mid-march 32 -best-managed 32 -movements 32 -third-largest 32 -binding 32 -private-property 32 -dislocated 32 -awarded 32 -everyday 32 -hoyt 4 -universities 96 -mortgage 36 -natan 4 -palates 32 -drilling 36 -additive 32 -incremental 32 -boozer 32 -driven 32 -puat 4 -federalism 32 -argue 32 -enrile 4 -mouths 32 -pennzoil 14 -they 46 -portable 44 -regulate 32 -doors 32 -minichain 32 -females 34 -archaic 32 -gloom 4 -stalked 32 -eight-month-old 32 -them 32 -then 126 -setbacks 32 -shoes 32 -orderly 32 -maghdousheh 4 -blond 34 -director-generalship 32 -culver 4 -protectionism 32 -merged 32 -positive 32 -trifle 32 -prophets 32 -underpaid 32 -chevrefeuille 4 -clarinetist 32 -capability 32 -lobbyists 32 -negligible 32 -persist 32 -perversely 32 -yardstick 32 -sacco 4 -classics 32 -deserve 32 -guest 32 -bacha 12 -muscular 32 -foldessy 8 -development 36 -freezes 32 -assistants 32 -cardholder 4 -imagery 32 -introduce 36 -andre 6 -stopping 34 -thick 32 -brackets 32 -asics 4 -arrangement 32 -dominion 14 -desks 32 -imposition 96 -slope 4 -escrow 32 -renault 8 -credibility 32 -superintendents 32 -brilliant 32 -zink 12 -measured 32 -blessing 32 -sent 32 -diego-based 4 -keating 12 -darkly 32 -doctor 32 -coins 32 -apple 6 -relish 32 -organizational 32 -speculates 32 -confection 32 -haunting 32 -mistake 32 -send 32 -pre-agreement 32 -uranium 32 -substitutes 32 -compulsion 32 -cathy 4 -rebellion 32 -trustee 32 -moderately 32 -esther 4 -kaye 2 -details 34 -motors 36 -low-margin 32 -knowledgeable 32 -occidental 2 -increasingly 38 -engage 32 -infectious 36 -twist 32 -jewel 32 -ivory-tower 2 -liberal 36 -distinctly 32 -enjoined 32 -irritating 32 -backdrop 32 -elementary 32 -entire 32 -towing 32 -database 32 -farce 32 -coordinate 32 -hubert 4 -cattle-on-feed 32 -tends 32 -sen. 6 -karen 4 -tests 46 -spinoff 36 -soundness 32 -benton 8 -seven-page 32 -bored 32 -low-light 32 -morton 14 -looks 44 -whittaker 6 -jawbone 32 -innovative 32 -hunkered 32 -horrified 32 -invested 32 -movement 36 -gutsy 32 -investigators 46 -elgie 4 -unforgivable 32 -escalating 32 -dimona 4 -resale 4 -quicker 32 -terse 32 -sara 8 -exterior 32 -amalgam 32 -nomination 40 -debts 36 -chisholm 12 -godiva 4 -substituted 32 -risk-taking 32 -ci 8 -iosif 4 -walhalla 4 -selas 12 -saliva 36 -dolls 32 -ardently 32 -diaper 32 -negotiations 32 -over-the-counter 32 -stakes 36 -dolly 4 -dreaded 32 -created 34 -defend 36 -accountant 32 -anticipating 34 -wilmette 4 -perceptible 32 -fhp 4 -shelter 32 -extracting 32 -aired 32 -alleging 32 -four-way 32 -detailing 32 -subdued 32 -airs 32 -popularize 32 -shopping 36 -relatively 32 -bowing 32 -cosmetic 32 -motels 32 -vied 32 -baseless 32 -brash 32 -porter 4 -redefine 32 -embraced 32 -gatherings 32 -b.f. 
4 -fallacy 32 -grain 32 -asbestos-related 32 -gourmets 36 -pcs 4 -rectify 32 -cro-magnons 4 -embittered 32 -reinterpretation 32 -stockholder 32 -funded 32 -governments 32 -legitimize 32 -mordant 32 -appeared 32 -franklin 12 -ambivalence 32 -pave 32 -infants 32 -thrifts 40 -surfacing 32 -niches 32 -routine 32 -routing 32 -ribaminol 34 -refined 32 -turn 32 -victims 36 -essays 32 -issuance 32 -earings 32 -hostile 40 -turf 32 -landmarks 32 -launch 32 -financial-futures 32 -repeated 34 -early 106 -concentrates 32 -picked 32 -overexposure 32 -properly 32 -caramels 32 -likely 36 -simian 32 -lebanon 4 -restore 32 -spear-throwing 32 -lust 32 -depth 32 -mentor 4 -concentrated 32 -germ-warfare 32 -chess 32 -read 44 -chest 32 -ern 4 -surge 36 -bachelor 32 -real 46 -refrain 32 -rows 32 -innkeepers 32 -ya 32 -oestreich 12 -n.d. 4 -reap 32 -liggio 4 -investigation 100 -firmed 32 -anti-sandinista 32 -cane 36 -dail 4 -voting 36 -trudeau 4 -admirers 32 -scintillation 32 -inmates 32 -relinquish 32 -crack 32 -erg 4 -vilify 32 -higgins 4 -gladly 32 -technologic 4 -mothers 32 -intensity 32 -seven-state 32 -blocked 32 -maximizing 32 -teri 6 -unacceptable 32 -trend 32 -bombers 32 -repulsed 32 -micheal 2 -subscription 32 -dorfman 8 -oh-god-if-i-pull-this-one-off-i 4 -lest 32 -less 38 -cocolat 4 -numbering 32 -non-technical 32 -tourist-drawing 32 -post-colonial 32 -gesture 32 -developed-country 32 -organizes 32 -organizer 32 -lesa 4 -strip-searched 32 -quotes 32 -resolution 34 -renewed 32 -gambler 32 -lines 44 -first-rate 32 -four-wheel-drive 32 -gratitude 34 -massoud 4 -gambled 4 -free-wheeling 32 -executes 32 -quoted 32 -targeted 32 -wasteful 32 -fiery 32 -cops 32 -survey 36 -abuses 32 -parkway 4 -exam-preparation 32 -successes 36 -attendants 32 -windfalls 32 -brace 4 -personal-income 32 -mind 32 -bcl 8 -hoffmann 4 -health-products 32 -bcw 8 -bcv 8 -constituents 32 -duty 100 -cephalosporin 32 -cambridge 4 -bcy 4 -info 4 -brushed 32 -moderate 32 -x 12 -cinematic 32 -hobbies 32 -airing 32 -default 32 -hambrecht 4 -conservancy 32 -alix 12 -five-point 32 -terror 2 -south 44 -breeds 32 -bailed 32 -atuwi 4 -quarterlies 32 -procedural 32 -heirs 32 -psychological 32 -distinctive 32 -likes 32 -england-based 4 -alive 32 -teneff 12 -liked 32 -one-man 32 -fixed-income 2 -unfilled 32 -outerspace 32 -stock-index 32 -ediger 8 -criminal-sentencing 32 -lumber 102 -dismissals 32 -obsessive 32 -primarily 32 -benito 4 -ships 32 -econometric-forecasting 32 -occupation 32 -downturns 32 -luftwaffe 4 -kearney 12 -macmillan 14 -deployed 32 -achievement 36 -fertility 32 -photovoltaic 32 -bolling 4 -actually 38 -baseball 32 -strife 32 -reports 110 -magazines 36 -darwinism 4 -bolsters 32 -verge 32 -manuverings 32 -dial 32 -conspiring 32 -refuses 36 -throw 32 -bailey 4 -factions 36 -processes 32 -checks 36 -rabbits 4 -enex 10 -inhalable 32 -fending 32 -rah 12 -stalling 32 -fleets 32 -handle 32 -mailings 32 -join 36 -eastern 46 -unhappy 40 -accelerate 32 -witching 32 -reference 36 -belove 4 -disparities 32 -redford 4 -marshall 2 -toyoy 4 -frederick 14 -main 36 -fallout 32 -mail 36 -dollar-holders 32 -mair 8 -invaders 32 -nutrition 32 -myg 8 -outgrowths 32 -marshals 96 -payout 44 -comprehend 32 -precipitating 32 -collections 32 -antipathy 32 -cost-effective 32 -testifying 2 -wonderful 32 -quietly 32 -tbs 8 -philhellene 32 -provocative 32 -npt 8 -teachers 46 -honesty 32 -wildlife 4 -typically 34 -punishing 32 -gilgore 12 -capital-intensive 32 -thrombolytic 32 -scattered 32 -wide-body 32 -import 32 -squads 32 -sunny 32 -dillard 4 -obstruction 36 
-ieoc 6 -shell 32 -dangerously 32 -free-trade 32 -sluggish 34 -all-cash 32 -asked 98 -molds 32 -downtown 32 -function 36 -heaven 32 -slack-jawed 32 -deal-making 32 -blood-clot 32 -housewives 32 -galactic 32 -normal 36 -presently 32 -batteries 32 -svb 4 -nevada-las 4 -significant 32 -briefcases 32 -shies 32 -cocktail 32 -tenderness 32 -redeemable 32 -unwieldy 32 -hog 34 -contacted 32 -based 46 -cherished 32 -teicher 4 -february 4 -extraparliamentary 32 -bases 32 -acquisition-minded 32 -immediately 32 -rides 32 -takeover 44 -midtown 32 -endorse 32 -notations 32 -hovers 32 -alluding 32 -sachs 4 -fool 36 -relating 32 -advisers 36 -ic 14 -sachy 8 -white 46 -food 46 -noncaloric 32 -squashed 32 -racking 32 -earnings 46 -impressive 32 -wriv 4 -warring 4 -dinner 44 -karachi 4 -drafting 32 -swinging 32 -frankfurt 12 -protections 32 -lachica 4 -lovable 32 -isaacs 8 -evident 32 -largest-selling 32 -eliminated 32 -condom 32 -camino 4 -individually 32 -december 14 -locate 32 -worsened 32 -building 38 -strife-torn 32 -negotiators 32 -composition 32 -samsung 4 -modify 32 -retailing 40 -truly 32 -photocopier 32 -year-end 34 -consumer 46 -manufacturers 110 -market-by-market 32 -owe 32 -benchmark 32 -superior 36 -production-quota 32 -three-judge 32 -containing 32 -consumed 32 -rover 6 -warrant 96 -abuse 32 -gobain 4 -broadcasting 38 -overstylized 32 -oblivious 32 -higher 38 -americano 4 -two-week 32 -pugh 8 -dalkon 4 -imsi 4 -atkinson 4 -ambj 8 -iq 4 -airbus 6 -rated 32 -differing 32 -profession 32 -alleviate 32 -americans 4 -otter 32 -starring 32 -grass-roots 32 -throwing 32 -covey 32 -drams 4 -crzy 8 -cholesterol-lowering 32 -forestalls 4 -cover 36 -guideline 32 -announces 32 -develop 32 -erroneous 32 -sesit 8 -drama 32 -wessel 4 -irresponsible 32 -champ 32 -basics 32 -unionizing 32 -festering 32 -profit-taking 36 -ether 32 -everybody 38 -t3100 4 -bailly 4 -shiraz 4 -soonest 32 -radioactivity 32 -inaccuracies 32 -houses 32 -crucial 36 -archibald 4 -deducting 32 -stride 32 -readable 32 -mason 14 -disappointment 32 -namely 2 -largest 100 -housed 32 -cameron 4 -opposing 32 -circulating 32 -gorbachev 14 -startled 32 -cptc 4 -videotape 32 -blair 12 -agreed-upon 32 -consolidations 32 -agreeing 32 -tribute 32 -justin 4 -terrence 4 -antonio 4 -antonin 4 -government-owned 32 -three-member 32 -blacklisting 32 -difficulties 32 -accountability 32 -bothering 32 -advent 32 -leggett 8 -uncriticized 32 -re-regulating 32 -anastasio 4 -substantial 32 -mcgraw-hill 6 -notify 32 -appearing 32 -fate 32 -classmate 32 -brighter 36 -partial 32 -reliability 32 -entities 32 -emotions 32 -financiere 14 -sneak 32 -home-equity 34 -companions 32 -bechtel 4 -answered 32 -vern 4 -stalin 4 -discomforts 32 -keeping 38 -kapor 12 -fold 32 -pullouts 32 -purer 32 -drivers 32 -perron 4 -anti-managua 32 -unemployment 46 -netx 4 -reevaluate 32 -ness 4 -very 32 -christensen 14 -coronation 36 -uncover 32 -alberto 4 -untaxed 32 -logicon 4 -rakoff 8 -headline 32 -fund-raisers 32 -especially 34 -salt 44 -debra 4 -spreadsheets 36 -roland 4 -clevite 12 -fulfill 32 -thorny 32 -plo 4 -subway 32 -beast 32 -tightened 32 -hilton 6 -sale 44 -resource 36 -downturn 32 -reassessing 32 -popularity 32 -guides 32 -conference 36 -speaks 32 -greg 4 -forum 46 -elite 36 -unfunded 32 -guided 32 -communicate 32 -could 108 -alltime 32 -saddle 36 -historical 32 -patricia 6 -grew 40 -orion 14 -activities 36 -conceivably 32 -committed 32 -committee 38 -motion-picture 32 -bundesbank 4 -negotiates 32 -tark 4 -referees 32 -mets 8 -wrote 36 -listening 32 -ruling 44 -lefler 8 
[serialized resource data omitted: a long, line-wrapped run of word → integer-flag pairs (e.g. "manufacturing 108", "upkeep 32", "miller 14"), which appears to be a pretrained tokenizer/tagger lexicon bundled with the repository's resource files; the raw dump is not human-readable and is elided here]
32 -diagnosed 32 -n. 4 -regimes 32 -colt 6 -ship-to-stock 32 -shocked 32 -gruenberg 4 -frenzy 32 -met-coil 2 -courthouse 32 -twilight 32 -tishman 4 -salisbury 4 -annuity 36 -composing 32 -usx 12 -guenther 4 -rock 38 -occasions 32 -challengers 34 -usa 12 -r.a. 2 -envoy 32 -pre-school 32 -speculator 32 -kip 4 -earners 32 -harnessing 32 -col. 14 -remained 32 -frivolous 32 -cares 36 -provision 32 -decay 32 -homely 2 -impressed 32 -five-member 32 -un-american 4 -antwerp 4 -discovered 32 -massacre 36 -greek-turkish 4 -prepares 32 -demand 102 -bicycle 32 -nws 4 -injury 32 -deregulated 32 -measles 32 -deposit-taking 32 -conditions 32 -hansen 12 -subpoenas 32 -mid-1950s 32 -deferred-compensation 32 -quickly 36 -kan. 4 -thermal 32 -prescription 32 -avant-garde 32 -bmy 4 -tax-driven 32 -assent 32 -none 36 -prediction 32 -development-park 32 -regime 32 -wait 32 -sitting 34 -wouldn 38 -resents 32 -cfp 4 -consented 32 -wrestlers 32 -bonner 4 -signficant 32 -glitches 32 -double 32 -cries 32 -vonder 8 -originality 32 -amps 4 -separately 42 -mechanical 32 -names 44 -martine 4 -shipping 34 -ports 32 -diversions 4 -one-hour 32 -lebanese 6 -councils 32 -expropriation 32 -showing 32 -educating 32 -flourishes 32 -recital 32 -fuehrer 4 -omitted 100 -crushed 32 -tito 4 -jung/brannen 4 -unfolds 32 -stated 32 -stearns 12 -flexibility 32 -rule 36 -denounce 32 -hoffmann-la 4 -carrots 32 -heartland 36 -conjuring 32 -sankey 8 -volatility 32 -twice 36 -niskanen 4 -columbia 4 -stringent 32 -boulevard 6 -reinventing 32 -treasured 32 -violent 32 -ranking 32 -pretax 32 -failing 32 -coaching 32 -herman 4 -named 102 -detainee 32 -two-way 36 -fundamentalists 32 -urgent-care 32 -naysayers 32 -covertly 32 -desert 36 -guesses 32 -spectacle 40 -disembowel 32 -boardwalk 32 -syndicated 32 -unsealed 32 -rubicam 4 -sucked 32 -consolidating 32 -pulling 32 -robert 14 -compression 32 -resolving 32 -totaled 40 -miniscribe 6 -electrons 32 -syndicates 32 -addition 32 -regulating 32 -fowler 12 -center 36 -remarks 32 -worthless 32 -autumn 32 -nu 8 -payments 36 -s&p 14 -rally 36 -prevails 4 -gastropod 32 -sacrificed 32 -kb 4 -lehigh 6 -dorin 4 -lumbermen 36 -viacom 14 -dunce 32 -friends 38 -marietta 4 -breeders 32 -govett 4 -furnishing 32 -doomed 32 -heng 8 -crystal 32 -industrywide 32 -parcels 32 -luxembourg 4 -doilies 32 -austerity 32 -retention 32 -conglomerates 32 -touchstones 32 -fringes 32 -harassment 32 -sprint 4 -stemming 32 -pistol 32 -crates 32 -appalling 32 -teamed 32 -detention 32 -insulation 32 -chipmakers 36 -legions 32 -gridlock 32 -spring 36 -laughlin 4 -pzl 4 -disappointed 32 -strength 32 -autonomously 32 -vaux 4 -winchester 4 -abrahamson 12 -francesc 4 -purveyors 32 -tel 4 -lamboley 4 -special-interest 32 -burlington 4 -ill-considered 32 -sergeant 32 -device 36 -a-6e 4 -copper 36 -litigated 32 -omcm 4 -cuisine 32 -languages 32 -selections 4 -target 44 -calls 100 -prevailing 32 -kagan 4 -dealings 32 -composite 36 -epilepsy 36 -assitance 32 -cd-v. 
32 -tick 32 -underfinanced 32 -introduces 32 -grasping 32 -communications 46 -hub 36 -experiences 32 -fruit 36 -divided 32 -east-west 36 -suitable 32 -abdication 32 -indict 32 -fascinating 32 -designed 32 -caesarean 4 -strong 46 -ingested 32 -inched 32 -districts 32 -demonstrations 32 -asian-americans 4 -supervised 32 -designer 32 -components 32 -mansfield 4 -mll 8 -supervises 32 -business 110 -e-flats 4 -inches 32 -mater 36 -transportation 46 -billion-plus 32 -altogether 40 -tcr 8 -glasses 32 -injustices 32 -fascinated 32 -disagree 32 -gtch 8 -calculate 32 -doll 4 -entertainers 32 -valuing 32 -avionics 32 -mnco 12 -scarcity 4 -boosted 96 -profit-margin 32 -contadora 4 -omega 4 -evoking 32 -harassed 32 -missile-defense 32 -scholarly 32 -gallon 32 -yield-driven 32 -sisb 4 -horses 36 -surrender 32 -reunify 32 -displeasure 32 -wgn 4 -slowdown 32 -emphasizing 32 -observing 32 -lend 32 -aided 36 -fling 32 -fair 36 -chambers 32 -monroe 4 -joseph 14 -aiden 2 -sharon 14 -celebrities 32 -b 44 -stability 32 -aides 38 -expressing 32 -brunei 14 -flint 36 -cathleen 2 -occurrence 32 -explanations 32 -citicorp 6 -suppose 32 -ecuador 4 -montgomery 12 -publications 36 -cpas 4 -seller 32 -microbes 32 -power-train 32 -computers 108 -lasser 12 -bollenbacher 4 -divers 32 -marine 38 -falsgraf 4 -airmach 4 -rockwell 14 -leinberger 12 -technical 36 -a.s. 4 -boon 32 -emery 4 -lower 46 -legislative 32 -stiffer 32 -eurodollars 8 -momentum 32 -newman 4 -incentives 32 -r. 14 -nazionale 4 -hotels 36 -wurtman 12 -tax-act 32 -toyed 32 -privately 32 -macy 12 -exhausted 32 -m.d.c. 4 -reveres 32 -noses 32 -cherokee 4 -billiard 32 -remedy 32 -one-way 32 -tear 32 -jerrold 4 -kitchen 32 -norrett 4 -hassenfeld 8 -chairmen 32 -fiercely 32 -devaluing 32 -crnr 4 -ridge 4 -prudential-bache 4 -springs 36 -recycle 32 -team 32 -buddy 32 -took 96 -howorth 12 -sinking-fund 32 -sins 32 -and/or 32 -dreary 32 -camper 32 -robotics 36 -populated 32 -semiannual 32 -middle-income 32 -mosquito 4 -winnings 38 -toyco 4 -contest 32 -wed. 10 -widespread 32 -capabilities 32 -large-scale 32 -barricaded 32 -family-planning 32 -respects 32 -ginnie 14 -receding 32 -unrated 32 -telq 8 -succeeds 96 -contributors 34 -feiner 8 -non-german 32 -isotope 32 -checkup 32 -subtracts 32 -seizures 36 -sne 8 -screams 32 -venezuela 4 -controls 100 -public-relations 32 -thinks 32 -w.r. 2 -part 46 -oversubscription 32 -scenarios 32 -outcome 32 -attorney 108 -rare 36 -fled 32 -jailed 32 -self-interest 32 -brakeman 32 -catered 32 -comprises 96 -amity 4 -wove 32 -t.pwa 8 -singer 38 -batschari 12 -modernizing 32 -conflict 32 -mailroom 32 -dole 12 -oecd 12 -reaping 32 -memorial 36 -auto-focus 40 -cassandras 4 -cray 4 -justifies 32 -flee 32 -subjourneymen 32 -spark 32 -overweight 32 -perpetuating 32 -hamel 4 -justified 32 -entices 32 -advance 36 -kingpin 32 -spare 32 -bluth 4 -theodore 6 -inestimable 32 -f. 
4 -cliff 4 -junior 34 -distributors 36 -comedies 32 -understandings 32 -concede 32 -overwhelmingly 32 -tirelessly 32 -jointly 36 -efficient 32 -secret-payments 32 -tell 36 -rodgers 8 -wy 8 -kaul 4 -exists 32 -repairs 32 -battled 32 -personality 32 -liquidity 32 -seizure-prone 32 -refund 36 -pressured 32 -gears 32 -tianjin 4 -remanded 32 -readonly 32 -full-service 32 -disturbed 32 -receivables 32 -compete 32 -truce 32 -audio/video 12 -truck 36 -rudolfo 4 -pressures 40 -out 38 -kiev 4 -sa-7 4 -referee 32 -inherit 32 -rgc 8 -theater 36 -purchases 40 -greyerz 4 -compromise 32 -doubling 32 -ellen 6 -withstood 32 -kiel 4 -maclean 4 -premise 32 -traffic 36 -ring 44 -affidavits 32 -hours 32 -soybeans 36 -lowest-cost 32 -manifesto 32 -arrearages 32 -pioneering 32 -authoritatively 32 -quaintly 32 -unthinkable 32 -convertible-debt 32 -nose 32 -kendrick 4 -balk 32 -ball 44 -balm 32 -emerging 32 -kramer 14 -gmfanuc 10 -brewing 36 -questioned 32 -sneer 32 -targeting 32 -wider 36 -optimists 32 -shearson 14 -dedicated 32 -most-active 32 -boosting 32 -meeting-date 32 -removal 32 -proponent 32 -smaller 42 -opera 36 -temporarily 32 -portrayal 32 -down-and-out 32 -shrinking 32 -ruinous 32 -gilliam 4 -resubmit 32 -awe 32 -generosity 32 -fats 2 -blow 36 -guarini 8 -sonic 4 -censures 4 -recruiting 32 -road 36 -forbid 32 -syrian-backed 4 -souls 32 -headway 36 -dallas-based 4 -biologists 32 -publicist 32 -censured 32 -reprieve 32 -kahn 14 -described 32 -bed 32 -plutonium-producing 32 -baxter 14 -expansion-minded 32 -hewn 32 -aerobics 2 -facilitate 32 -replies 32 -recover 32 -tale 32 -revived 32 -raging 32 -hangars 40 -vault 32 -theatrical 32 -morrison 4 -rankles 32 -breathlessly 32 -armand 2 -image 32 -auditor 40 -trivial 32 -thermos 32 -udayan 4 -obscenity 32 -privee 36 -personal-recognizance 32 -redskin 4 -county 38 -trillion 32 -henkel 12 -wearing 32 -nightclub 32 -clinic 36 -freighter 32 -mordechai 4 -pulpit 32 -finally 34 -programmable 32 -business-type 32 -criticize 32 -bonuses 32 -cyl 8 -stud 36 -often-violent 32 -essay 32 -reuters 14 -renominated 32 -independently 32 -pocahontas 4 -leadership 32 -murphy 12 -advice 32 -comeback 36 -discriminate 32 -fourth-biggest 32 -relentlessly 32 -cm&m 4 -health-care 32 -subcommittee 36 -irancontra 4 -fractured 32 -bud 32 -brook 4 -victoria 4 -display 32 -foods 36 -alike 32 -wooded 32 -disbursed 32 -commrcl 8 -won 108 -woo 32 -properties 36 -sympathetic 32 -metallurgical 32 -although 46 -essential 32 -wooden 32 -buy 36 -govmt 12 -chicago 14 -schoolyard 32 -united 46 -higginbotham 4 -choose 32 -drifted 32 -light 36 -quick-delivery 8 -intermediate 36 -spotting 32 -concretely 32 -closing 32 -endearing 32 -whitney 4 -wachtell 4 -and 126 -resort 32 -spoofs 32 -stripping 32 -coalitions 32 -chair 32 -boosters 32 -bookseller 32 -intensified 32 -foundation 38 -marriage 32 -sakharov 4 -mujahedeen 4 -bank-holding 32 -yielded 32 -premiums 32 -escape 32 -weizhou 4 -lufthansa 6 -whose 36 -kalmus 12 -siege 32 -bank-affiliated 32 -merits 32 -parade 32 -merrill 14 -unify 36 -anglia 8 -loeb 4 -strawberry 32 -mind-set 32 -rotating 32 -raids 36 -lauderdale 12 -vending 32 -ex-military 32 -dune 32 -clothes 32 -bowman 32 -ousting 32 -allegedly 32 -medically 36 -receives 44 -neighbors 32 -moratorium 32 -bmws 4 -floppy 32 -retrovir 4 -halter 8 -continents 32 -allocating 32 -arraignment 32 -club 36 -govern 32 -japanese 14 -lytton 4 -unchanged 40 -uacia 8 -appears 36 -disgorge 32 -meetings 32 -ojay 8 -stipulation 32 -nrc 12 -however 46 -similar 32 -accumulating 32 -alphabet 32 -discount 44 -ruins 
32 -ojai 4 -shape 32 -proclaim 32 -foremen 32 -linden 4 -court-sanctioned 32 -wrongful 32 -irresistible 32 -circulation 32 -ax 32 -stopped 32 -stricharchuk 4 -require 32 -ar 4 -as 110 -ap 4 -skirt 32 -fumes 32 -at 110 -au 34 -inclination 32 -gabriel 4 -procedures 32 -an 46 -tamper 32 -al 6 -outer 32 -ab 4 -ac 4 -one-third 32 -ag 12 -ad 34 -hybrid 32 -pl 4 -hypocrisy 32 -foote 4 -amateurs 32 -unspeakable 32 -unrest 32 -since 110 -harvests 32 -aut 4 -spurted 32 -diagnosis 32 -kudos 32 -sex 32 -order-inflow 32 -beautifully 32 -workforce 32 -privilege 32 -remarkable 32 -meatpacking 32 -outboard 42 -piercing 32 -laurels 32 -self-regulatory 32 -yitzhak 4 -slides 32 -multiple-unit 32 -negatively 32 -derive 32 -veered 32 -requested 32 -a-340 4 -uniform 32 -bargain 36 -arched 32 -father 36 -interior 34 -rein 32 -overdose 32 -confiscatory 32 -chooses 32 -plummeted 32 -bernhard 4 -ex-spouse 4 -khoo-controlled 4 -hectic 32 -lawn-care 32 -basil 32 -demanding 32 -pirrie 8 -offender 32 -right 34 -obviously 36 -obscenely 32 -collins 4 -philip 14 -lesuer 8 -knowledge 40 -businesses-newspaper 32 -onus 32 -crusade 32 -marcom 12 -bltb 8 -colline 4 -saig 8 -dispatched 32 -offended 32 -inns 4 -consisted 32 -treasury 46 -chevrolet 12 -oppose 32 -organized 32 -rifles 32 -follow-up 32 -proposes 44 -first-generation 32 -decrease 32 -million 100 -counterpart 96 -expectation 32 -l 4 -antony 4 -margins 36 -proposed 104 -disclosure 38 -obscure 32 -dirt 32 -calumny 32 -gerald 6 -montfrooy 4 -scheme 36 -businessses 32 -sec 14 -parker 4 -ascent 32 -detect 36 -dire 32 -regulatory 36 -fake 32 -atomic 36 -emulate 32 -regulators 38 -bitterness 32 -ignore 34 -combining 32 -parked 32 -floundered 32 -induce 32 -chains 36 -gen. 14 -steadfast 32 -odd 32 -flamingos 32 -resign 36 -atcor 12 -gates 46 -gunmen 32 -rochester 4 -redirecting 32 -refiners 64 -politburo 32 -mohawk 4 -imf 4 -particles 32 -fromer 8 -maxsaver-type 4 -smoking 32 -silly 2 -peaceful 32 -lugano 4 -joao 4 -joan 6 -dickerson 8 -sills 4 -assessment 36 -studio 34 -opposed 36 -reincorporate 32 -iron 36 -puget 12 -hard-to-please 32 -trampled 32 -opposes 32 -underwent 32 -washed 32 -goebbels 4 -sisters 4 -base 36 -doyle 12 -outperform 32 -chocolate-truffle 32 -gene 12 -guatemala 4 -immuno-modulator 32 -tunes 32 -gena 4 -comes 36 -pronouncements 32 -plausible 32 -creditworthy 32 -bass 12 -hiked 32 -philipp 4 -gdp 4 -tuned 32 -worrying 96 -nonverbal 32 -notch 32 -institutionalized 32 -options 36 -placebo 32 -unattainable 32 -handcuffs 32 -regret 32 -beginning 102 -readily 32 -crossed 32 -washes 32 -black 36 -exciting 32 -lancaster 4 -grandfather 32 -bringing 32 -co-author 32 -securing 32 -lunches 32 -awhile 32 -signaling 34 -satisfy 32 -runs 32 -worker-training 32 -proof 32 -bubbles 32 -philadelphia-based 4 -africa 4 -bought 100 -calculates 32 -noticing 32 -frankfurt-am-main 4 -neurological 32 -tangled 32 -outnumbered 32 -soria 4 -walder 8 -throws 32 -dispose 32 -soviet-turkish 4 -dual 4 -apcar 8 -kenneth 6 -muckrakers 32 -useless 36 -genes 32 -vowed 32 -travel-related 32 -elixirs 32 -financier 36 -tigrs 4 -korea 4 -cleaning 32 -programming 32 -exasperation 32 -unbroken 32 -lynch 4 -fences 32 -alexander 4 -lapsed 32 -kennett 4 -townships 32 -mass-produced 32 -boehringer 6 -salaried 32 -a-1-plus 4 -prisons 32 -shoving 32 -dormitory 32 -tax-exempt 32 -civil 44 -appalachia 4 -unpromising 32 -images 32 -campuses 32 -classified 32 -rotan 4 -turkeys 32 -trillions 32 -motorists 32 -ideal 34 -exceeded 32 -alpha 4 -zinberg 8 -pc 4 -empathy 32 -intractable 32 -involuntary 32 
-editorial-features 32 -jesuit 4 -half-price 32 -psychiatric 36 -bloomingdale 4 -copyright 32 -bullion 2 -sarre 4 -purse-snatching 32 -camping 32 -industry/government/residents 32 -biden 12 -wayne-gossard 8 -upsetting 32 -races 32 -beverly 12 -cannot 32 -professional-management 32 -low-hanging 32 -flemming 4 -tractor 36 -companies 102 -well-financed 32 -revenge 32 -settings 32 -puffs 32 -rank 36 -soars 8 -hatch 12 -stockholm 12 -marsh 12 -proxies 32 -jon 6 -joachim-ernst 4 -channel 38 -dog 36 -biographies 32 -song 36 -takes 44 -don 46 -really 38 -southmark 14 -greats 32 -sons 4 -dot 32 -dow 14 -wallenberg 12 -faltered 32 -prohibiting 32 -bat-lovers 32 -rationing 32 -rooted 32 -taken 102 -westchester 4 -jobless 36 -catalog 32 -preliminary 32 -bans 32 -ordering 32 -hydromatic 32 -walton 12 -mrmt 8 -moody 14 -platforms 32 -cia 12 -kurds 4 -stumps 32 -bang 36 -band 32 -hunger 32 -inform 32 -problem-free 32 -pay-as-we-go 32 -restraint 32 -centerpiece 32 -incomes 32 -borrus 4 -no-smoking 32 -scotto 4 -their 38 -overtures 32 -lloyd 4 -kuwait 4 -wide-eyed 32 -conservatives 36 -bellevue 12 -reputations 32 -rising 34 -samuel 14 -wermiel 4 -reyna 8 -world-renowned 32 -cper 4 -interpret 32 -coups 32 -opera-comique 4 -won-lost 32 -cs1 4 -passive 32 -placements 32 -during 110 -ingenious 32 -bars 32 -bungling 32 -drugged 32 -twelve-meter 2 -achieved 32 -narrates 32 -bramham 4 -began 100 -fastest-growing 32 -taxable 32 -chafing 32 -fullerton 4 -amerada 4 -funding 36 -historic 32 -assignments 32 -labs 4 -gil 4 -fmc 4 -lebaron 4 -catalytic 32 -without 98 -budget-minded 32 -prestigious 32 -syracuse 4 -agreeable 32 -wallich 4 -inspect 32 -rutledge 8 -hypocritical 32 -daniel 14 -afinp 8 -nominee 32 -hitler 4 -harmfully 32 -inefficient 32 -aleman 12 -imperative 32 -hangout 32 -income-tax 32 -same 32 -wildest 32 -arms-sale 32 -olvr 8 -feet 32 -fees 38 -shaping 36 -francis 12 -unique 32 -antediluvian 32 -upbeat 32 -burdened 32 -ni 8 -hundreds 32 -separate 32 -anathema 32 -popular 36 -arbitrager 36 -passports 32 -ratification 32 -attract 36 -piper 4 -aplenty 32 -volcker 12 -transmissions 36 -pipes 32 -mornings 32 -speed 32 -extinguished 32 -co-producer 32 -reopening 36 -stemmed 32 -speculators 32 -ice 36 -continental 14 -princess 4 -broad 36 -aquifers 32 -skiing 32 -endeavor 32 -three-week 32 -klm 4 -kentucky 6 -handgun 32 -worry 36 -teacher-graduates 32 -salary 44 -bko 8 -arraigned 32 -ruffled 36 -contortionists 32 -rookie 32 -lion 32 -disclosing 32 -win 40 -year 46 -fleeing 32 -heritage 38 -sweepstakes 32 -supposedly 32 -sobriety 2 -african 4 -asarco 2 -shimbun 4 -illinois 6 -conjunction 32 -multibusiness 32 -noticeably 32 -lavenders 32 -sugar 46 -helen 4 -seizing 32 -amhowitz 8 -laf 4 -cty 8 -hawaii 4 -fetched 32 -revolted 32 -cooperative 32 -small-lot 40 -seward 12 -multinational 32 -suitor 36 -ailment 32 -attitude 32 -dropped 32 -lobbyist 32 -issuers 32 -disciple 32 -sectors 36 -ms-dos 4 -combination 32 -deposits 32 -mainstay 32 -measuring 32 -rulings 32 -resorted 32 -real-life 32 -trace 32 -never 38 -chang 4 -mediating 32 -goldman-kidder 4 -liable 32 -aggiornamento 32 -stafford 8 -j.b. 
4 -mentz 4 -interesting 32 -ensuring 32 -chant 4 -relied 32 -moreover 42 -relief 34 -norwood 12 -unanimity 32 -boasted 32 -scuttle 32 -movers 4 -least 32 -georgine 8 -designers 32 -curiosity 32 -willems 4 -contained 32 -foreclosed 32 -recommend 32 -civilian 32 -lap 32 -cemetery 40 -anyway 32 -stature 32 -cities 38 -renominate 32 -abruptly 96 -franey 4 -dose 32 -mcgraw 4 -shoeshine 32 -sherwin-williams 4 -settling 34 -safeguard 32 -value 44 -datron 4 -vulnerability 32 -carter 12 -casting 32 -format 32 -headlines 34 -coke 4 -contras 4 -claude 8 -cartel 32 -aback 32 -thrift 36 -marckesano 12 -kingwood 8 -headlined 32 -houari 4 -rendered 32 -popularized 32 -sadly 32 -work-practice 32 -larouche 6 -starting 32 -reluctant 32 -detroit 12 -parking-lot 32 -leapfrogged 32 -overwhelms 32 -axes 32 -small-to-mid-sized 32 -fraud 36 -academic 32 -which 34 -underpricing 32 -one-stop 32 -fomon 12 -outspoken 32 -amusements 4 -stipulate 32 -corner 32 -hmt 12 -campesinas 32 -vows 40 -barrier 32 -hacienda 4 -segments 32 -atpwi 4 -mercantile 6 -derided 32 -sitcoms 32 -remains 100 -holliston 4 -flanked 32 -recommended 32 -examination 32 -metropolis 8 -respected 32 -first-time 32 -letter 44 -helene 4 -pollutants 32 -unintelligent 32 -expressed 32 -rumford 4 -amounts 32 -dramas 32 -finances 32 -selective 32 -differ 32 -expresses 36 -limerick 32 -evidently 2 -l-dopa 4 -repaired 32 -table 36 -meaningful 32 -vivaldi 4 -expelled 32 -backdrops 32 -maturities 32 -tips 32 -pakistani 4 -discusses 4 -faced 34 -principal 40 -serial 32 -superb 32 -brooks 4 -disappear 32 -intervene 32 -rogaine 4 -pragmatic 36 -cuckoo 4 -altered 8 -baby 36 -high-speed 32 -mkt 4 -rekindled 32 -ground-based 32 -notably 32 -bolder 32 -irish 6 -unjust 32 -hendry 12 -pointedly 32 -relevant 32 -reigning 32 -navy 46 -cocaine 32 -tilley 12 -mccarran-ferguson 4 -rioting 32 -general 46 -businessmen 34 -coincided 32 -pclb 8 -eldorado 6 -small-business 32 -hispanic 4 -simplifying 32 -delftaland 4 -skimpy 32 -loaned 32 -coincides 32 -view 36 -caterers 6 -hopkins 12 -arming 34 -added 38 -blanket 32 -degenerate 32 -connection 32 -bones 36 -performed 32 -pointe 32 -lenin 4 -self-dealing 32 -augustine 12 -downgrade 32 -cats 36 -tax-deductible 32 -dante 4 -lyndon 4 -blockade 36 -faces 36 -solved 32 -bronchial 32 -khoo-shearson 4 -wholeheartedly 32 -decorative 32 -untypical 32 -outweighed 32 -syndication 36 -unluckiest 32 -adams 12 -pioneered 32 -picks 36 -adjust 32 -milpitas 4 -latest-quarter 32 -parkinson 4 -ostensible 32 -pandemonium 32 -disk 40 -nominations 32 -heroes 32 -dish 34 -yoshihara 12 -earnest 32 -reader 32 -velasco 4 -czar 32 -immunex 10 -government-guaranteed 32 -plowed 32 -carrier 46 -abated 32 -disagreement 32 -searle 14 -kaiserslautern 12 -insist 32 -ormrod 4 -angiographic 32 -shake-ups 32 -singleton 12 -abolishing 32 -postwar 32 -chula 6 -ibm 14 -hatton 12 -person 32 -hangs 32 -long-misunderstood 32 -conceptual 32 -david 14 -michel 6 -gerstner 4 -uneventful 32 -ted 14 -ssax 12 -tea 64 -compensating 32 -ten 34 -davis 12 -penalties 36 -federal-assisted 32 -viewing 32 -steel-equipment 32 -filene 4 -saddened 32 -agaricus 32 -anytime 32 -rushes 32 -dulles 4 -ceased 32 -becomes 36 -listen 32 -seaman 32 -independent-production 32 -via 104 -listed 32 -iaea 4 -rebounded 32 -arms-control 96 -mother-of-pearl 32 -trajectories 32 -swaziland 4 -bubble 32 -famous 32 -reach 36 -steamship 36 -conditioning 32 -litton 6 -plastics 36 -requests 36 -supplementary 32 -interim 36 -withdrawing 32 -forget 34 -raytheon 10 -saul 4 -errickson 8 -norfolk 14 -sematech 4 
-oppenheimer 12 -rushed 32 -psychologists 34 -vainly 32 -memotec 14 -awaited 32 -moscow 14 -phrase 32 -demurred 32 -euphoria 32 -euphoric 32 -thorp 12 -mayer 4 -draining 32 -retreating 32 -pools 32 -synergies 32 -loud 32 -uic 4 -rightfully 32 -maintain 32 -indirectly 32 -flourish 32 -incident 32 -shrugged 32 -stuffed 32 -restriction 32 -needs 32 -grease 32 -tarrant 2 -province 32 -committees 32 -colleen 2 -various-purpose 32 -colgate 12 -five-day 32 -pickets 32 -outstripped 32 -rico 4 -rich 36 -input 32 -requires 32 -rice 44 -firms 46 -haider 12 -finazzo 8 -rica 4 -vibrant 32 -mid-year 32 -required 32 -satisfactory 32 -unrestricted 32 -underlying 32 -responds 32 -forged 32 -sibling 32 -versions 32 -then-nsc 32 -polite 32 -interpretations 34 -siconolfi 4 -permissible 32 -paunches 32 -note 106 -entailed 32 -injunction 32 -stayed 32 -perseverance 32 -masterly 32 -establishment 32 -figure 32 -grips 32 -penetration 32 -telephone 38 -access 100 -all-seeing 32 -emotion 4 -flynn 8 -phillip 6 -germain 4 -overturned 32 -fishing 32 -decompression 32 -coloratura 32 -firestone 4 -type 32 -accuracy 32 -hemming 12 -stand 32 -near 36 -overthrow 32 -miss 38 -april 4 -one-for-one 32 -takeovers 36 -fox 44 -medical 110 -editing 32 -symbol 36 -rockwool 4 -babies 32 -bbc-brown 2 -differential 32 -psc 4 -maker 100 -makes 36 -weddings 32 -clear-cut 32 -honeysuckle 4 -iowa 4 -aet 8 -choleric 32 -vocabulary 32 -sweetener 40 -circumvent 32 -second-biggest 32 -conception 32 -aversion 32 -stunt 32 -people 38 -rhythm 32 -crybabies 32 -gift 32 -criteria 32 -remembrance 4 -theorist 32 -immune-deficiency 32 -vastola 8 -stung 32 -wildfire 32 -highs 32 -downside 32 -narrow 32 -syndrome 32 -treasures 32 -lykes 4 -sticks 32 -azt 6 -fastest 32 -need 32 -unconfirmed 32 -homeland 32 -ever-changing 32 -cows 32 -singlehandedly 32 -dragnet 32 -material-service 32 -cantata 4 -mcalister 4 -portsmouth 4 -convulse 32 -subscriber 32 -patch 4 -directive 32 -abject 32 -booked 32 -ingredients 32 -despaired 32 -jackie 4 -files 40 -emphasized 32 -sure 38 -tacky 32 -reynolds 14 -nwa 4 -charging 40 -framework 32 -tragic 32 -carver 32 -photograph 32 -unheroic 32 -ebbed 32 -sonja 4 -scholar 4 -customary 32 -furious 32 -interferred 32 -inventor 32 -devise 36 -robot 64 -cash-laden 32 -apt 32 -tennessee 4 -engines 32 -oscars 4 -affords 32 -reserve 38 -palace 36 -abrogation 32 -willfulness 32 -inspection 40 -bullock 4 -scream 32 -teledyne 14 -messages 32 -vbnd 4 -consultant 32 -hans 4 -sophistication 32 -self-taught 32 -maintaining 34 -arcana 32 -hand 36 -hang 32 -four-city 32 -plagued 32 -entreaties 32 -plush 32 -cookie 4 -sideshow 32 -furtherance 32 -rivers 4 -awoke 32 -anatolia 4 -affairs 36 -schubert 4 -aspire 32 -habit 32 -asbestosis 32 -calan 4 -giuliani 12 -prodding 32 -triple-a 32 -industrialist 32 -paralysis 32 -bills 36 -sometime 34 -drained 32 -chops 32 -staked 32 -denver-based 4 -referring 40 -depressing 32 -needles 32 -acceptable 32 -bacterium 32 -honor 34 -mortgages 32 -founding 32 -slap 32 -timbre 32 -inefficiencies 32 -crackpot 32 -universe 36 -mortgaged 32 -plethora 32 -personalities 32 -airport 36 -competing 42 -divests 32 -plum 32 -high-rate 32 -solely 32 -second-front-page 32 -mikva 4 -release 36 -financially 32 -extravagant 32 -chilled 32 -smasher 36 -vacuum 32 -malcontents 32 -realize 32 -vancouver 12 -semans 8 -incomparable 32 -pullout 32 -pemberton 4 -fyffe 4 -painstakingly 32 -notion 32 -enters 8 -triggering 32 -unacceptably 32 -ariz. 
4 -threaten 32 -license 36 -skimming 32 -acronyms 32 -bats 32 -bundesrat 4 -threatens 32 -bother 32 -get 36 -stein 12 -athletic 32 -realtors 32 -conversing 32 -understate 32 -traced 32 -hiking 32 -discloses 32 -anti-missile 32 -well-turned-out 32 -zug 4 -mcdermott 4 -inventory/sales 8 -revenue 110 -sentenced 32 -vincent 4 -swaps 32 -yardsticks 32 -marlin 4 -fissures 32 -criminal 36 -winds 32 -rural 32 -five-ton 32 -doesn 98 -coil 32 -stances 32 -buyout 40 -sprawling 32 -evolved 32 -masterson 8 -vessels 32 -jagry 4 -meta 4 -bloomington 4 -faceless 32 -above 34 -male-pattern 32 -insecure 32 -u 12 -plaguing 32 -universally 32 -disparity 32 -unseen 32 -emanuel 2 -step-up 32 -effects 32 -house-backed 4 -occurred 32 -hover 32 -martin 14 -supportive 32 -enrollments 32 -renegotiated 32 -scandal 44 -anti-tank 32 -ritual 32 -subsequent 32 -ivanov 4 -grumbled 32 -presley 4 -tricks 32 -warren 14 -acted 32 -hosts 32 -edible 32 -amicable 32 -congressman 32 -tricky 32 -confrontation 32 -instruct 32 -heartening 32 -cs-1 6 -allergies 32 -bankruptcy-law 32 -stuffy 32 -thoughtful 8 -unglamorous 32 -solution 32 -auditors 32 -surprise 32 -afford 32 -influenza 32 -ambitious 32 -dc-9 8 -workaday 32 -more-expensive 32 -well-placed 32 -survived 32 -top-notch 32 -discussion 32 -surgical/critical 4 -visual 32 -lower-level 32 -forgo 32 -raves 32 -survives 32 -neil 6 -amal 4 -game 36 -smoot-hawley 4 -eco 4 -overt 32 -celebratory 32 -grosz 4 -patients 34 -pittsburgh 12 -surveyed 96 -notes 100 -animation 32 -patrick 14 -rock-and-roll 32 -specified 32 -suppress 32 -installments 32 -practice 32 -doubtless 32 -highly 32 -prolongs 32 -york 4 -u.s.-based 4 -aspartame 36 -slashed 32 -modeled 32 -drive 36 -veggies 32 -incorporating 32 -partner 96 -republican 4 -drahuschak 4 -distaste 32 -slashes 32 -quotation 32 -renaissance 36 -bribery 32 -constraints 32 -leveling 32 -hume 4 -pennsylvania 14 -zero-sum 32 -tobin 4 -niels 4 -harmless 32 -lenox 4 -taurus 4 -productivity-related 32 -cramming 32 -probes 32 -overstates 32 -bricks-and-mortar 32 -intellectuals 34 -cave 4 -damped 32 -miranda 4 -handcuffing 4 -todd 10 -forefront 32 -meantime 34 -corporations 98 -quarries 32 -redoubling 32 -natural-gas 32 -salespeople 32 -detained 32 -depression 4 -van 44 -cupertino 4 -days 36 -ousts 8 -anti-aircraft 32 -constantly 32 -piedmont 14 -occupational 36 -credible 32 -draft 32 -contagion 32 -deciding 32 -khan 4 -considers 96 -light-duty 32 -maturing 34 -concessions 32 -slipping 32 -well-suited 32 -sympathy 32 -variable 32 -pierre 12 -smelling 32 -chinese 4 -offending 32 -e.f. 
6 -before 110 -incumbent 32 -realty 4 -depositors 32 -firefighters 32 -administrations 32 -foreign-relations 32 -taxi-bureau 32 -challenged 32 -awkward 32 -modulator 32 -factories 36 -joll 4 -forward 32 -vienna 12 -khoo-related 4 -challenger 32 -rodime 6 -intensifying 32 -mann 12 -auction-market 32 -deemed 32 -canda 12 -watchfully 32 -pitch 32 -posner-controlled 4 -marwick 4 -restructures 8 -assembly 96 -preferred 34 -satisfied 32 -assemble 36 -apparently 34 -overcapacity 32 -industrials 44 -inclusion 32 -seasons 32 -stimulus 32 -conclusions 32 -reinforce 32 -dissent 32 -faintest 32 -innovations 32 -nuk 4 -accumulate 32 -disciplines 32 -profit 46 -edgar 12 -favoring 32 -guard 36 -taverns 32 -assistant 36 -crash 32 -thinner 4 -arch-rival 32 -stock-held 32 -originally 40 -networks 32 -franchise 32 -probe 36 -shook 32 -neutrinos 32 -examined 32 -nike 4 -wondrous 32 -accompany 32 -innocuous 32 -dwarfed 32 -contract-suspension 32 -impulse 32 -examiner 32 -examines 32 -alumnus 32 -dissatisfied 32 -tavern 4 -brannock 12 -harvested 32 -regional 32 -rare-book 32 -twenty-first 4 -clare 4 -kinard 12 -hessians 4 -console 32 -diluted 32 -swallowing 32 -mathematics 32 -halmi 4 -acts 44 -misuse 32 -bracher 12 -ogilvy 14 -merchandise 96 -potential 32 -volga 4 -roundup 4 -dilutes 32 -soil 32 -ven 4 -unexpired 32 -squeeze 32 -ried 12 -easter 4 -contend 36 -robinson 12 -imperialism 32 -kilobytes 32 -swift 40 -behavior 32 -inner-city 32 -tipped 32 -hurry 32 -yvette 2 -angus 2 -accrual 32 -quite 38 -alice 14 -umbrella 32 -suffer 32 -bankruptcies 40 -banner 32 -parking-brake 32 -knifepoint 4 -feedlots 32 -case-by-case 32 -reits 4 -altar 32 -on-time 32 -banned 32 -leisure-time 32 -surfaced 32 -hard-nosed 32 -quits 4 -annihilation 32 -roddy 4 -first-class 32 -sylvan 14 -unruffled 32 -dr. 
14 -mid-level 32 -professionsals 32 -correction 32 -perforce 32 -altering 32 -retaliatory 32 -memory-enhancing 32 -fibreboard 6 -rolled 32 -clause 32 -bancroft 14 -syrup 32 -imprisoned 32 -prevent 32 -strongly 32 -referendum 32 -t.td 8 -interferes 32 -corporation 32 -regulator 32 -davidson 4 -aims 32 -guitar 32 -instructing 32 -uneasiness 32 -car-rental 32 -divisions 32 -discussing 36 -adjudicated 32 -thirteen 2 -tourism 36 -oop 4 -remembers 32 -suisse 4 -brrs 4 -regulations 32 -hhs 6 -underestimates 32 -destroying 32 -inherited 32 -administrators 48 -narrowest 32 -psychologist 32 -piasio 4 -appoints 8 -enterprising 4 -agip 4 -three-man 32 -sumitomo-group 4 -european 14 -characterizes 32 -battipaglia 4 -lech 4 -taxes 44 -brobeck 12 -damaged 36 -viewpoint 40 -mulders 12 -end-of-year 32 -swissair 4 -over-the-air 32 -laing 12 -taxed 32 -agio 4 -unable 32 -post-cruzado 32 -humans 32 -millions 32 -weight 32 -zickler 4 -fateful 32 -goldwater 8 -minerals 36 -swim 32 -jay 6 -objections 32 -preeminent 32 -asinof 4 -morse 14 -clowning 32 -just 46 -commute 32 -daniels 4 -constitution 44 -merchant 32 -vernon 14 -alan 6 -insuring 2 -glue 32 -noncompt 32 -caldwell 4 -grapefruit 32 -appreciably 32 -reservists 32 -jan 4 -bells 36 -u.s.-london 8 -mit 4 -loans 44 -property 38 -highway 36 -customized 32 -largest-airline 32 -belly 32 -balanced 32 -excluding 42 -mix 32 -indirect 32 -amassing 32 -jenrette 4 -mid 32 -distrusted 32 -inadequate 32 -payne 12 -min 4 -overblown 32 -mim 4 -balances 32 -earth 36 -debtors 36 -langoni 12 -nichols 4 -ally 32 -workable 32 -garnishing 32 -crafty 32 -vendor 32 -rumor 32 -hoard 32 -printing 36 -hesston 14 -india 14 -carnation 4 -baker 14 -returning 32 -co-equal 32 -coors 6 -initiated 32 -follows 40 -embarrassing 32 -drew 44 -manhattan 4 -protesters 32 -classification 32 -payable 32 -hobhouse 4 -fatalities 4 -beverage 36 -dakota 12 -dreg 32 -heretofore 40 -roger 12 -inventors 32 -pyf 8 -undertaken 32 -chapter 36 -glut 32 -imaginative 32 -worries 36 -sculptor 32 -xp 6 -allied 14 -groupings 32 -frazzano 12 -longer-term 32 -peacetime 32 -similarly 42 -bizarre 32 -sluggishness 32 -librettist 32 -worried 32 -owned-and-operated 32 -infinet 6 -weekend 32 -authorized 32 -minutes 38 -generically 32 -manuel 6 -precious 42 -evading 32 -short-covering 32 -fancier 32 -instant 40 -eisenhardt 8 -pathways 32 -cautiously 32 -marcheschi 12 -intelligence 102 -bhc 8 -diebold 8 -ignoring 32 -frequently 32 -recyclables 32 -advocates 34 -census 12 -earmarked 32 -tx 12 -unsafe 32 -rabbit 36 -kingsborough 4 -emergency 32 -sharpened 32 -tt 8 -tv 12 -molding 32 -th 4 -r-revised 16 -awed 32 -tl 4 -to 126 -fighter 32 -tandem 4 -hammett 12 -expensive 32 -tight 32 -phil 4 -deduct 32 -bashers 4 -idiot 32 -sorting 32 -arthur 4 -biology 32 -field-goals-allowed 32 -presses 36 -when 110 -waste 32 -boasts 32 -exclusionary 32 -referrals 32 -asbestos-containing 32 -steptoe 4 -child 36 -chile 6 -aging 32 -high-income 32 -midwestern 32 -chill 4 -cartoons 4 -sprung 32 -hitherto 32 -swayed 32 -easy-to-use 32 -nonpartisan 32 -opens 32 -owners 100 -happening 32 -frankenstein 4 -first-phase 32 -deposed 32 -t. 4 -rebuffed 32 -suffers 32 -packaging 36 -columnist 32 -pounded 32 -suing 32 -folded 32 -allergist 32 -non-public 32 -derived 32 -firings 32 -aggressive 32 -suspected 32 -elbtf 4 -barriers 32 -semantical 32 -kupinski 4 -else 32 -raoul 4 -derives 32 -towels 32 -felony 32 -transforming 32 -treasuries 36 -felons 32 -sqn 8 -physicists 32 -mines 36 -peck 12 -contacts 32 -poliakoff 12 -strip 32 -texaco 14 -n.y. 
4 -critics 34 -subtracting 32 -owens-illinois 4 -supply 36 -predatory 32 -snow 32 -autobiography 32 -streams 32 -lynne 6 -zaibatsu 32 -giacomo 4 -reebok 14 -august 6 -radio 36 -bsbx 12 -animated 32 -protestant 4 -rumored 32 -lytle 12 -gothic 4 -whizzing 32 -beech 6 -depreciation 36 -judith 4 -bdg 4 -situated 32 -researcher 32 -deficiency 32 -clonidine-hcl 32 -rescheduled 32 -northern 46 -eat 36 -ear 32 -wheaton 4 -eileen 4 -parish 4 -predictable 32 -eaf 4 -institutes 36 -vary 32 -criers 32 -handles 32 -explored 32 -booming 32 -bedroom 32 -devotion 32 -bashing 32 -literary 32 -hastily 32 -mtc 8 -sets 44 -seminary 32 -quest 38 -specifics 40 -gentles 4 -mcgann 4 -objectives 32 -thank-yous 32 -surround 32 -hence 34 -quiet 36 -storrs 4 -super 4 -feb 12 -raikes 12 -password 32 -established 32 -seth 4 -kaiser 14 -full-time 32 -exchanged 32 -innocent 36 -jefferies 4 -affirm 36 -non-food 32 -intellectual 32 -exchanges 32 -prospered 32 -introducing 32 -fosters 32 -stupid 36 -manon 4 -vacations 34 -jeep 36 -mrgo 4 -vicious 32 -edison 4 -beryl 6 -brutal 32 -unchecked 32 -whammy 32 -dofor 12 -semi-obscene 32 -honest 32 -ryder 4 -corrupt 32 -four 46 -per-share 42 -quit 36 -utp 8 -wrlt-fm 4 -unfounded 32 -deductibility 32 -strategically 32 -hall 36 -jamail 12 -criminals 32 -riverside 4 -coup 36 -implicated 32 -clay 6 -panels 32 -slashing 32 -leery 32 -hitachi 14 -nicaraguan 4 -consolidated 46 -sparrow 4 -halt 36 -concealing 32 -hitlin 4 -adapt 32 -forrest 4 -journalistic 32 -gottesman 12 -comparisons 32 -yankee 4 -sentiment 32 -applying 32 -asai 8 -coercion 32 -ervin 12 -export-import 12 -sympathize 32 -redd 12 -gulfstream 4 -ehrman 4 -celtic 6 -standing 32 -axp 12 -rate-of-return 32 -endure 32 -doel 4 -invariably 34 -consumer-electronics 32 -self-policing 32 -hiatus 32 -economics 36 -hoare 4 -decline 36 -hard-currency 32 -pilots 46 -joanne 4 -abrams 4 -preventing 32 -dixons 14 -trumpet-filled 32 -curfew-long 32 -contact 32 -minimum 32 -fischbach 6 -facility 32 -harsher 32 -information 100 -winnick 4 -yellow 4 -gaz 4 -dairy 32 -freeze 32 -jardine 4 -one-week 32 -slv 8 -male 32 -traumas 32 -gas 36 -microcosm 32 -greatest 32 -preferring 32 -revitalize 36 -imposed 96 -generation 36 -california-santa 4 -endangering 32 -secrets 32 -notice 36 -accounting 36 -lawfully 32 -accusations 32 -slr 4 -nsm 4 -bleeding 32 -gentiles 4 -knut 2 -multilayer 32 -hustle 32 -porky 4 -stop-loss 32 -bailout 32 -mismanagement 32 -antitrust-law 32 -newspapers 36 -remainder 32 -ranalli 8 -mylan 14 -comment 32 -lodging 32 -mccaw 4 -margaret 4 -mouse 32 -jenny 4 -assign 32 -martial 32 -commend 32 -forster 12 -reinstate 32 -genentech 14 -transplant 32 -health 38 -invalid 32 -surprised 32 -will 110 -broadly 32 -transparently 32 -crude-steel 32 -wilf 8 -cfps 4 -copolymer 4 -chicago-based 4 -peak 32 -av-8b 4 -peat 36 -monteverdi 6 -arranged 32 -undo 32 -surprises 32 -formulates 32 -furnaces 32 -olivier 6 -registry 32 -playful 32 -vocational 32 -nssi 4 -mr. 14 -pannill 4 -thomson 4 -shooting 34 -andover 4 -co-plaintiffs 2 -thrusting 32 -malls 32 -responsibile 32 -misunderstanding 32 -embattled 32 -l. 
4 -disbanded 32 -manifold 32 -shad 12 -karma 32 -sensed 32 -hodel 12 -unconstrained 32 -underwritings 32 -declaration 32 -abhorrence 32 -catalyzed 32 -valley 4 -rises 36 -shaw 12 -condoned 32 -seemingly 36 -extending 32 -survival 40 -busines 32 -sweden 14 -prudential 4 -graceland 4 -thinly 32 -perplexed 36 -kelton 12 -aggressively 32 -single-day 4 -contrasting 32 -mid-1990s 32 -mrk 8 -borrow 32 -restated 32 -trundles 32 -mrs 8 -home-grown 32 -beds 32 -expedited 32 -pilot 32 -wide-bodies 32 -treason 32 -growl 32 -grown 32 -breckinridge 2 -expanded 32 -former 110 -subsidizes 32 -moss 4 -runnin 4 -insures 32 -citizens 110 -subsidized 32 -hinted 32 -apprehensive 32 -readings 32 -acerbic 32 -funnier 32 -broadman 4 -highest-paid 32 -unobtainable 32 -pronounced 32 -pool-playing 32 -six-year 32 -yesterday 106 -freddie 14 -impoverished 32 -liquidating 32 -split 38 -influential 32 -simplify 4 -costing 4 -buoy 32 -indicators 36 -replace 32 -gen 4 -covent 4 -inquirer 4 -feared 32 -benighted 32 -moments 32 -amused 32 -octave 32 -spaciousness 32 -park 36 -stripped 32 -anticipate 32 -criticizing 32 -premature 32 -closer 36 -abhors 32 -foundations 32 -benefactors 32 -rainer 2 -pioneers 32 -tendering 32 -pfennig 32 -microwave 36 -photos 32 -submitting 32 -almys 12 -sake 32 -half-decade 32 -insured 32 -procter 4 -creature 32 -qualified 32 -palm-sized 32 -loyal 32 -acceded 32 -street 44 -outright 32 -physician 32 -characterized 32 -f-111 12 -pgloy 4 -executives 102 -visitor 32 -fruitful 32 -papandreou 12 -compatible 32 -job 36 -intraday 32 -series 36 -verifying 32 -slumping 32 -signal 46 -wigton 12 -hp 4 -alfred 6 -page-one 32 -anglin 4 -defection 32 -systems 44 -lipper 14 -w. 4 -creeps 32 -aquino 6 -businessman 32 -durables 32 -socialist 36 -impatiens 32 -x-rays 4 -suggested 32 -caps 32 -contemplating 32 -ps 4 -views 36 -unspecified 32 -niceties 32 -engen 4 -acres 32 -ponnet 4 -intravenous 36 -few 38 -spurgeon 8 -strunk 8 -shortfall 32 -swept 32 -admittedly 32 -undertreatment 32 -pre-election 32 -attractiveness 32 -wx 4 -implication 32 -accommodated 32 -managed 32 -describes 32 -wu 8 -glemp 4 -lip 32 -climb 32 -covenants 32 -actual 36 -stimulated 32 -policy-making 32 -press 36 -lio 8 -cruz 12 -helane 2 -stems 32 -we 46 -parallelism 32 -sprinkel 12 -bred 32 -ticking 32 -lie 32 -savannah 4 -acquiesced 32 -la 46 -stratified 32 -zones 32 -sentence 32 -remedies 32 -benefits 36 -woodside 4 -studiously 32 -acronym 32 -cardillo 4 -each 46 -eliminate 32 -dipping 32 -soady 8 -shish 2 -praise 32 -clamoring 32 -oak 4 -knight-ridder 4 -analyses 32 -lure 32 -sea 46 -ousey 4 -aiding 34 -catering 32 -see 38 -overreacting 32 -wasteland 32 -fixed-rate 32 -candidate 96 -assets 110 -parochial 32 -wealth 32 -accob 8 -firepower 32 -semantics 36 -received 96 -replacing 32 -set 44 -unlawfully 32 -fertilizers 32 -dirty 32 -horseshoe 32 -lk 4 -survivors 32 -harbingers 32 -ascribed 32 -lurk 4 -bally 14 -wholesalers 40 -emily 4 -adieu 32 -damage 36 -clabir 14 -lake-front 32 -together 34 -numerous 34 -populations 32 -merion 8 -consecutive 32 -eisner 12 -finds 36 -cynr 4 -jeanne 4 -house 46 -shoulders 32 -proposal 44 -palmer 4 -patterns 32 -efh 8 -motion 36 -opponent 32 -ideas 36 -soothe 32 -equipment 36 -kebab 32 -export 32 -questions 32 -conflicting 32 -southern 46 -asset- 32 -angiographics 32 -jacob 4 -gary 14 -avondale 2 -infections 32 -citations 32 -systematic 32 -virus 40 -comply 32 -harmful 32 -forstmann 4 -foray 32 -micro 4 -brashness 32 -skew 32 -pan 14 -pants 32 -malongo 4 -radical 32 -repairmen 32 -clock 32 -fault 32 -persio 
4 -protecting 32 -mechanically 32 -austria 4 -angolans 4 -league 36 -preservers 32 -misapplied 32 -remind 32 -participating 34 -checkoff 4 -gencorp 14 -pacific 14 -merge 32 -low-priced 32 -denominated 32 -data-base 32 -subordinated 32 -crises 32 -disadvantaged 32 -bibles 4 -uglich 4 -circles 32 -single-family 34 -emigrated 32 -nuclear 108 -propped 32 -circled 32 -brethren 32 -critically 32 -footwear 32 -subordinates 32 -disgraceful 32 -mpi 4 -knitting 36 -rinehart 4 -media 36 -ponnelle 12 -prevalent 32 -victim 32 -helps 32 -foreigners 38 -rowan 4 -unveils 12 -formations 32 -underweighted 32 -denounced 32 -performer 32 -vrdolyak 12 -spill 36 -craziness 32 -bout 32 -claimant 32 -briefly 32 -prompting 32 -stalls 8 -undercutting 32 -portfolio 32 -aspects 34 -karr 8 -seven-member 32 -wrongdoing 32 -dublin 12 -wall-to-wall 32 -brought 36 -extremist 32 -plainclothes 32 -innovate 32 -offerings 36 -subsidizing 32 -chevy 4 -priority 32 -schulz-koehn 4 -pegasus 14 -discharge 32 -engaged 32 -swapped 96 -r.h. 6 -nora 4 -marpac 4 -nameplate 32 -substitution 32 -hertz 6 -seconds 32 -wesley 6 -dogs 32 -vacancy 96 -contender 32 -crooks 12 -outfitted 32 -inextricably 32 -eroding 32 -came 40 -editor 36 -comanche 4 -carbide 4 -nervous 32 -quick-reaction 32 -followerfish 32 -craft 32 -harrisburg 8 -amid 110 -contended 32 -calculators 32 -upset 32 -camp 36 -giants 36 -toilets 32 -simultaneous 32 -reinforcing 32 -diana 4 -cost 32 -suspending 32 -boca 12 -stone 46 -facilitating 32 -electrocardiogram 32 -pacman 4 -fast 44 -doublespeak 32 -contrasted 32 -methods 36 -proved 32 -two-story 32 -lets 32 -nicaragua 6 -debtor-country 32 -lifeless 32 -baryshnikov 4 -chilton 12 -pervasive 32 -elbit 4 -grenada 4 -co-managed 32 -permits 32 -investigating 32 -port 32 -suter 12 -erdman 4 -leominster 4 -owns 96 -jules 4 -prankster 32 -looked 32 -saudi 4 -endeavors 32 -co-manager 32 -invade 32 -avoided 36 -requirements 32 -alnt 8 -bnk 4 -emi 4 -install 32 -tancredo 4 -manne 4 -randy 4 -emerge 36 -delinquent 4 -disappearance 32 -suburb 32 -deficit 44 -mimicking 32 -sutezo 4 -shafer 8 -retton 4 -willcox 8 -you-know-what 32 -turbo 32 -broadcaster 32 -lincoln 14 -recall 36 -anthony 6 -cleveland-based 4 -extended-stay 32 -supplemented 32 -patient 32 -violets 32 -unusually 32 -kelley 4 -koester 8 -assembling 32 -contrasts 32 -conglomerateur 32 -central 44 -scholarship 32 -obscene 32 -erratic 32 -pistols 32 -conveyed 32 -cpq 12 -extends 44 -dresser 4 -dresses 32 -cpt 6 -unload 32 -hci 8 -louis 14 -jerusalem 4 -miny 8 -low-income 32 -perquisites 32 -dressed 32 -revolutionary 32 -huffy 32 -signals 32 -rationed 32 -duplicate 32 -hike 32 -cph 4 -mountain 36 -carpenter 12 -genetski 12 -bradt 8 -arco 14 -inadequately 32 -averaging 32 -attribute 32 -confounded 32 -liebowitz 4 -employs 32 -brady 4 -intensely 32 -sorbonne 4 -buyer 34 -airplane 32 -lexington 8 -strenuous 32 -guyon 4 -rationale 32 -holders 102 -volz 8 -investors 46 -primary 36 -penobscot 12 -photographically 32 -advertising 110 -micron 4 -euan 8 -settled 32 -wine 100 -wind 36 -wing 36 -primark 12 -hardening 32 -parlow 12 -wins 36 -group 46 -argonaut 4 -publish 32 -international 110 -healey 4 -monopole 32 -newsprint 36 -turnaround 36 -cadre 32 -retail-entertainment 32 -broderbund 4 -double-a/a-1-plus 32 -mend 32 -peers 36 -beckett 4 -ventilation 32 -indianapolis-based 4 -mscc 4 -georgetown 4 -waters 32 -short-maturity 32 -forbidden 32 -menu 32 -unlawful 32 -monopoly 32 -news 38 -nios 4 -sailors 32 -stolen 32 -differently 32 -nickname 32 -maquilas 42 -overseeing 32 -represent 32 
-confirmation 40 -survivor 32 -jewelry 32 -elects 4 -steel-fabrication 32 -households 96 -squawk 32 -brent 6 -reacting 32 -long-term 34 -combipress 6 -denison 14 -resulted 32 -jurists 32 -fraction 32 -puzzles 32 -feistritzer 4 -union 46 -revved 32 -otherwise-flattering 32 -cornetti 4 -farmington 4 -schooling 32 -machine 36 -abdul-haadee 4 -puzzled 32 -mine 32 -vague 32 -scared 36 -abbe 4 -iowa-based 4 -lakes 32 -olenick 4 -seemala 6 -repaid 36 -deductions 32 -bottler 32 -bottles 32 -airways 4 -spur-of-the-moment 32 -employ 32 -entrepreneurial 32 -ciba-geigy 4 -jean 6 -repair 36 -screamed 32 -cigarette 36 -well-connected 32 -dahl 4 -fiancee 32 -valuable 32 -company-owned 32 -wealthier 32 -bridges 4 -keenan 12 -otis 4 -collaborated 32 -air-freight 64 -drill-bit 32 -ruffle-front 32 -condominium 32 -meets 32 -recordings 32 -den.a 8 -sacrifice 4 -editors 32 -duck 32 -nev.-based 4 -ribavirin 32 -respect 32 -hairs 32 -evokes 32 -brentwood 4 -herald 4 -gathered 32 -adults 32 -networking 36 -independence 32 -clifton 2 -differences 34 -isolated 36 -plunge 36 -money-transfer 32 -scaled-back 32 -southeast 14 -frequent 34 -ahonoora 4 -wheeler-dealers 2 -spirited 32 -chosen 32 -scares 32 -linda 12 -serviceable 32 -somebody 36 -metropolitan 44 -twin-engine 32 -immediate 32 -ads 36 -swamping 32 -ida 4 -approaches 32 -consultants 36 -unfazed 32 -deaths 32 -bikinis 32 -wriv-tv 2 -eons 32 -loucks 8 -everything 36 -recreating 32 -vice 108 -straw 32 -silk 32 -vulnerable 32 -ex-cia 32 -locations 32 -resins 32 -looms 36 -financement 4 -smashing 32 -unravel 32 -cary 4 -math 32 -casseb 4 -black-market 32 -excessively 32 -herculean 32 -care 100 -card 38 -trucking 32 -antidote 32 -alarming 32 -shortages 32 -carl 6 -semiconductor 110 -higher-than-anticipated 32 -method 32 -planners 36 -involved 36 -moines 4 -sprinkles 32 -declared 96 -controller 32 -prevents 32 -drescher 4 -u.s.-dominated 4 -pastures 4 -muster 32 -psychologically 32 -bylaws 32 -jacksonville 4 -limiteds 32 -superiority 96 -sprinkled 32 -low-budget 32 -departure 36 -vaginal 32 -compunction 32 -ethical 32 -muted 32 -protected 32 -projection 32 -steelmakers 32 -spanier 8 -squirms 32 -contain 32 -overturn 32 -pore 32 -correspondent 32 -josephthal 4 -median 32 -slapping 32 -stevenson 4 -attributes 32 -sleeves 32 -guts 32 -smelters 32 -disappearing 32 -linked 32 -realigned 32 -di 44 -newsstands 32 -endo 8 -northrop 14 -extreme 32 -phillips 14 -sort 32 -steppes 8 -steeper 32 -affirmed 32 -whereabouts 32 -foster 44 -invite 32 -decker 4 -sport-fishing 32 -bland 32 -john 14 -parents 38 -hazlitt 4 -guardian 32 -lackluster 32 -enjoying 32 -do 38 -recast 32 -dividing 32 -notorious 32 -transfering 32 -lebow 4 -preparing 32 -entered 32 -stabilization 36 -arrived 32 -straws 32 -marty 6 -ottone 4 -commendable 32 -auschwitz 4 -marts 32 -lining 32 -batter 8 -giudici 4 -practical-joking 32 -honduras 4 -copycat 4 -ability 100 -autonomous 32 -upgrade 36 -de 100 -inouye 4 -shipper 32 -post-big 32 -thursday 6 -institutions 100 -multiply 32 -objected 32 -guano 32 -hodgepodge 32 -settle 44 -wayne 8 -inception 32 -unclear 32 -high-energy 32 -printable 32 -sweetner 8 -shield 36 -front-runner 32 -gregory 12 -traders 46 -maj. 
4 -riddled 32 -usw 4 -high-flier 32 -bethesda 4 -job-training 32 -parnes 4 -sias 8 -sociological 32 -deprive 32 -metro 14 -vitro 2 -riddles 32 -bouquet 32 -underwriters 38 -apparent 32 -voters 34 -jerome 4 -subcontractors 32 -margo 2 -supplanting 32 -overcome 32 -sighting 32 -well-known 32 -cnvx 4 -wdc 8 -description 32 -george 14 -t.npi 4 -left-wingers 32 -uep 8 -jaded 32 -athens 12 -fenner 4 -forties 32 -fortier 4 -fulfilled 32 -tie 32 -obligatory 32 -well-versed 32 -unaware 32 -extravagance 32 -holdings 44 -skill 32 -sales 110 -new$ 4 -barnett 12 -maryinsky 4 -prospective 32 -bankrupt 32 -bah 2 -mingle 32 -inflated 32 -prosecute 32 -literally 32 -use 36 -subsidy 32 -exactly 32 -symptom 32 -withheld 32 -richer 36 -d.h. 4 -motown 4 -calendar 36 -millennia 32 -india-rubber 4 -supersaver 32 -victor 14 -refinances 32 -now-defunct 32 -cautious 40 -enforcement 36 -subside 32 -fearon 4 -consumer-goods 2 -instances 32 -midlands 4 -skipped 32 -bill 44 -pinto 4 -tastes 36 -suggestions 32 -hard-charging 32 -elegant 32 -homework 32 -goldhammer 4 -cornerstone 32 -taylor 12 -millar 4 -sup 4 -fianna 4 -suggestion 32 -thousand 32 -exchange 102 -nippon 14 -larger 32 -noncompetitive 32 -recognizance 32 -star-topped 32 -satisfying 32 -repay 32 -almost-certain 32 -industry-financed 32 -reconsideration 32 -stray 32 -pork 38 -meg 4 -sokolow 12 -middleman 32 -deregulation 36 -barometer 32 -marquee 32 -waived 32 -flatly 32 -both 46 -pfe 12 -iverson 4 -bipartisan 34 -epidemiology 32 -dmbk 8 -humphreys 4 -offering 100 -article 32 -higher-than-expected 32 -fodder 32 -source 42 -shot 32 -expense 32 -gatsby 4 -ill-advised 32 -bottomed 32 -jean-louis 4 -punished 32 -wacky 32 -ferry 32 -skilled 32 -ries 4 -stock-cash 32 -five-year 32 -illustration 32 -carry-forward 32 -resigned 108 -adopted 40 -betty-jo 2 -bomb 32 -object 32 -guero 32 -user-charge 32 -burgee 4 -six-cylinder 32 -kinburn 4 -backing 32 -combustion 32 -perfect 32 -vanderbilt 12 -maloof 4 -saturn 6 -commanders 32 -renunciation 32 -sasso 4 -ill-designed 32 -bauer 12 -hug 40 -buildings 36 -hud 2 -administering 32 -bunches 32 -goodwin 4 -entrenching 32 -thorough 32 -iscs 4 -wsj 4 -gerhard 4 -hut 4 -unfortunate 32 -hawkeye 14 -passion 32 -shp 8 -announcements 32 -shadowy 32 -non-discretionary 32 -currently 42 -co-production 32 -only 38 -initially 96 -exposed 32 -ado 32 -wharton 14 -telecasts 32 -tropical 32 -bottoms 32 -bank-secrecy 32 -weights 32 -linerboard 38 -mills 36 -bls 4 -pierce 44 -resume 36 -recollection 32 -consolidation 32 -loosens 32 -commonplace 32 -supports 32 -mckinley 8 -pause 32 -cited 36 -futility 32 -robin 4 -cardin 4 -changed 36 -chieftains 32 -high-end 32 -m&a 4 -beaubien 4 -bendel 4 -stifles 32 -alabama 4 -cites 96 -elliott 4 -massacres 32 -similarities 32 -bombarded 32 -admittance 32 -leader 36 -utah-based 4 -donning 32 -secretaries 32 -taiwan 4 -nationality 32 -spreadsheet 32 -transitional 32 -realists 32 -guidance 32 -blessed 32 -demure 32 -dismissing 36 -stunning 32 -entrepreneur 32 -cheerful 32 -arab 4 -work-places 32 -thievery 32 -complication 32 -cry 36 -reckoned 32 -complicating 32 -coasts 32 -sobbing 32 -expressionist 4 -asking 32 -residence 4 -peregrine 32 -premeditation 32 -mighty 32 -tones 32 -sylvania 4 -shimon 4 -frail 32 -lks 8 -resourceful 32 -switched 32 -mysteriously 32 -toned 32 -rts 4 -mightn 32 -hose 32 -sensors 32 -three-ring 32 -sustaining 32 -zoltan 4 -lanes 32 -resentment 32 -utilities 46 -adv 8 -overaddiction 32 -tinier 32 -slower 32 -forcing 32 -along 42 -alone 36 -rousseau 4 -spilled 32 -miserable 32 -gaffe 32 
-slowed 32 -peugeot 8 -newsletter 32 -schumer 4 -swooping 32 -wasn 44 -occidential 8 -employment 36 -stifling 32 -career 36 -afghanistan 4 -canada 14 -movie-studio 32 -dropout 32 -s.p.a. 4 -secretly 32 -may 110 -nunn 12 -enjoyed 32 -seeped 32 -shops 36 -frazier 4 -brookfield 8 -doing 40 -exe 4 -moffett 4 -norsk 14 -observance 32 -units 102 -transporation 4 -kisses 32 -hinduism 4 -gaughan 8 -blumenthal 4 -excesses 32 -involves 32 -jumblatt 4 -sounding 32 -local 38 -distanced 32 -royster 12 -u.s.-trained 4 -river 4 -pains 32 -demonic 32 -oxford 4 -glory 32 -unilever 4 -paint 32 -saharan 4 -bartels 4 -sufficiently 32 -databases 32 -child-bearing 32 -acknowledgement 32 -richest 32 -connotation 32 -cuff 12 -geochemistry 32 -skidded 32 -brotherhood 4 -informant 32 -recorders 32 -cheating 96 -supervision 32 -spreckels 4 -rebs 4 -tables 32 -retired 34 -carolee 2 -roth 12 -seemann 8 -kyushu 4 -suffered 32 -falcons 32 -ga. 4 -mirth 32 -dearborn 6 -crowne 4 -warsaw 4 -dana 4 -all-irish 32 -classicist 32 -becor 12 -deposited 32 -trickling 32 -brand-name 32 -galen 4 -orate 32 -prospering 32 -defraud 32 -includes 36 -paribas 14 -visitors 34 -armed 36 -halcyon 32 -orders 36 -special-interest-group 32 -beane 12 -shakeout 32 -premises 32 -bernie 4 -tubing 32 -double-a-minus 32 -confined 32 -burden 32 -test 36 -loosely 32 -chapstick 4 -withholding 32 -pays 32 -stylistic 32 -louisiana-pacific 14 -warp 32 -enlist 32 -fa-18 4 -ballpark 32 -arguments 32 -averted 32 -stainless 32 -shipyard 36 -interruptions 32 -grossed 32 -abt 14 -lenders 32 -misappropriation 32 -charlie 4 -gingerbread 32 -bbec 8 -abc 14 -sickness 32 -aba 12 -sahara 4 -resultant 32 -abf 8 -oxy 8 -tinkering 32 -soliciting 32 -thrust 32 -eligible 32 -commodore 4 -innovating 32 -bbc 4 -braverman 12 -avez 32 -feels 32 -bbk 4 -jumps 32 -veiled 32 -chimpanzees 32 -medieval 32 -jour 32 -legal-ethics 32 -martini 4 -combat 32 -leak 32 -training 36 -lybrand 4 -coney 4 -outsells 32 -refer 32 -rang 32 -welcome 32 -glass-steagall 4 -turns 40 -impede 32 -offense 32 -successful 32 -unisys 4 -supplemental 32 -urge 36 -state-owned 32 -kinda 4 -gang 32 -wilmington 12 -safeway 4 -unwelcome 32 -abner 4 -good 38 -crosby 12 -lafarge 14 -weeklong 32 -nights 32 -intimacy 32 -airliners 32 -blackboard 32 -kansas 4 -rubens 4 -contribution 32 -then-secret 32 -older 32 -granite 32 -high 46 -policy-setting 32 -ballets 34 -entiche 4 -well-orchestrated 32 -day-to-day 32 -humanity 32 -buford 8 -thunberg 4 -bathers 32 -babson 8 -day-care 34 -dutch 4 -inquiries 32 -encased 32 -ought 32 -clashed 32 -druse 4 -automotive 32 -begetters 32 -gaudy 32 -examinations 32 -unreleased 32 -chg. 
[Data hunk: several thousand removed lexicon entries, one per line, each pairing a lowercased token with an integer value (e.g. "-acquirers 32", "-gemayel 4", "-prof. 6").]
4 -southfield 8 -insurance 44 -supervisory 32 -hanoi 4 -swear 32 -projecting 32 -observes 32 -observer 36 -wavering 32 -cozied 32 -comanches 4 -bollerer 4 -sadder 2 -c.o.m.b. 6 -billing 32 -victories 32 -lowry 4 -observed 32 -bourj 4 -unresolved 32 -open-market 32 -anti-dumping 42 -stoves 32 -price 110 -collateral 32 -sneaker 32 -moniker 32 -shark 36 -frenzied 32 -blustery 32 -persuasive 32 -bristol-myers 4 -expropriated 32 -spurned 32 -sharp 44 -pranksterism 32 -ethics 36 -contingent 32 -resources 36 -rua 4 -aids 38 -absolutely 32 -smiled 32 -draftsman 32 -recipients 32 -bloom 4 -custom 32 -newsweek 6 -monteith 8 -struts 32 -blood 32 -shifted 32 -solicitor 32 -analysts 46 -scant 32 -lighting 32 -syria 4 -simon 12 -amend 32 -memorandum 32 -jurisdictional 32 -fri. 2 -nevertheless 34 -didn 32 -highest 32 -severe 32 -heal 32 -decisions 32 -full-sized 32 -skyscraper 32 -single-digit 32 -bottlenecks 32 -financed 96 -second 46 -aviv 4 -post-split 32 -guessing 32 -woodcuts 32 -armin 4 -season 32 -stever 4 -ernst 6 -hot-blow 32 -schwarz 4 -light-filled 32 -gained 96 -lead-recycling 32 -three-day 32 -utilization 32 -cloud 32 -gartner 4 -executor 32 -spurt 32 -treadway 4 -affiliates 36 -birmingham 14 -clout 32 -marshal 32 -problem-loan 32 -tvla 8 -chicago-area 4 -defense 108 -implies 32 -indictment 32 -bertrand 4 -critical 36 -salaries 38 -lavender 32 -caliber 32 -bloodstock 32 -buick 8 -somehow 42 -schwartzman 4 -leaking 36 -bigwig 32 -allows 96 -marche 32 -nervousness 34 -tune 32 -arose 32 -inscrutable 32 -wound-healing 32 -imagine 32 -seventh-largest 32 -consumer-group 32 -hal 14 -okla.-based 4 -castings 32 -tails 32 -carrying 32 -developing 32 -digest 4 -convoluted 32 -had 108 -froze 32 -condoms 32 -hay 4 -staggers 32 -insult 32 -frankel 4 -out-of-favor 32 -has 108 -toadstool 32 -hereditary 32 -refused 32 -dishwashers 32 -transamerica 4 -sleazy 32 -speakers 32 -managements 32 -healy 12 -k-12 4 -carroll 8 -staff 46 -telecommunications 46 -failure 36 -supervisors 32 -kit 32 -endangered 32 -flooded 32 -redistribution 32 -attempts 34 -dhl 4 -allegations 34 -strenuously 32 -endanger 32 -halsor 8 -penetrate 32 -re 36 -above-average 32 -resolve 100 -rj 10 -implementing 32 -gilts 32 -kempe 4 -t.j. 2 -newmark 14 -mathematically 32 -jamaica 4 -rt 4 -clutch 32 -weinger 12 -andrews 4 -centrifuge 32 -propping 32 -dosages 32 -doubly 32 -ponce 4 -savin 4 -market-oriented 32 -panicked 2 -lentivirus 4 -would-be 32 -mortages 32 -talent 32 -principals 32 -outlawing 32 -emits 32 -debated 32 -rid 32 -kilmer 4 -variability 32 -technician 32 -walkie-talkies 32 -philosophical 32 -considerably 32 -clerk 32 -charged 36 -imbroglio 4 -voicing 32 -liberate 32 -metall 6 -tighten 32 -winner 32 -one-time 32 -appalachian 4 -consumer-oriented 32 -monday 14 -considerable 32 -telecasting 32 -share-for-share 32 -charges 102 -novel 32 -kfv 4 -needing 32 -maid 32 -pharmaceutical 36 -a.t. 
4 -maneuvers 32 -smr 4 -sms 4 -balloons 40 -bypass 32 -posted 100 -vans 36 -marvels 32 -norwegian 6 -return 36 -nonunion 36 -healthy 32 -plaintive 32 -illegitimate 32 -disguised 32 -smf 12 -outstanding 32 -shelves 32 -portfolios 32 -bicentennial 32 -roommate 32 -rapid-fire 32 -chronology 36 -clues 32 -nurse 36 -shulman 12 -valve 32 -disgruntled 32 -quell 32 -flora 2 -gte 14 -radio-controlled 32 -f 12 -shoring 32 -week 38 -chimpanzee 32 -semi-annual 32 -shuffling 32 -formless 32 -daiwa 4 -acqusitions 32 -shelley 4 -intentions 32 -think 32 -ballyhooed 4 -thing 36 -gainers 34 -andreas 4 -unitek 4 -alliance 36 -hesitate 32 -raphael 4 -range 32 -lacks 32 -chain 36 -marilyn 6 -tchaikovsky 4 -disagreements 32 -rebel 36 -colleague 32 -colodny 12 -likud 4 -deliberately 32 -kinetic 32 -mayor 38 -r.j. 2 -faring 32 -convergent 4 -someone 34 -printemps 6 -invitation 32 -latitude 32 -paranoid 32 -cleveland-cliffs 10 -paranoia 32 -network-news 32 -thank 32 -boats 32 -conjures 32 -framingham 4 -arbitrage-trading 32 -dataquest 4 -toyoo 4 -baffled 32 -objection 32 -protective 32 -blackstone 4 -whichever 32 -bartlett 4 -buying 44 -uncharacteristically 32 -gathering 32 -klein 12 -rubber 44 -d-ram 4 -isles 4 -specifications 32 -fellas 32 -depositary 36 -reverted 32 -factual 32 -unguaranteed 2 -curtain 36 -consternation 32 -lately 34 -greenberg 12 -idolized 32 -publicized 32 -a-discounted 64 -dethrone 32 -jasinowski 12 -instrumental 32 -irritations 32 -bowery 4 -delaware 14 -somewhat 32 -infancy 32 -arbiter 4 -maxxum 4 -buoyed 32 -beesley 12 -ottoman 4 -accident 32 -legitimate 36 -haughty 32 -comparable 32 -unveil 32 -lonsdale 4 -mellifluously 32 -industries 108 -designing 32 -liberation 4 -initial 42 -orleans 4 -regulation 38 -allots 32 -ava 4 -howls 32 -surpassed 32 -ventron 4 -informed 34 -anti-ulcer 32 -monday-night 4 -mcgee 12 -slated 44 -legislation 40 -generating 32 -indulgence 32 -thus 42 -location 32 -continuation 32 -interleukin-2 32 -erect 32 -stamp 32 -interview 36 -eastdil 4 -anguished 32 -ups 6 -uniformly 32 -puffy 32 -wanted 32 -furor 32 -wallboard 36 -hughes 14 -hobor 12 -centronics 14 -insurers 36 -female 32 -sniff 32 -door 32 -delano 4 -baroque 4 -workings 32 -kiss 40 -disdaining 32 -upj 8 -spoken 32 -sculpture 4 -unsupported 32 -bulgarians 4 -ensures 32 -jones-irwin 4 -swamped 32 -baptista 12 -galoob 4 -mutuals 32 -new 46 -collapse 36 -investigate 32 -interest 110 -masterpieces 32 -revolve 32 -easier 32 -second-fiddle 32 -pso 8 -paying 36 -conflict-of-interest 32 -spinning 34 -rendering 32 -balloon 32 -psd 8 -complexities 32 -orchard 4 -supporting 32 -coming 46 -oakland 4 -regardless 42 -predicting 32 -revise 32 -lardner 4 -noon 32 -shortly 34 -resigning 32 -auto-company 32 -slumped 32 -big-bucks 32 -salzburg 4 -correspondents 32 -banking 108 -making 44 -caleb 4 -plantations 32 -commensurate 32 -endemic 32 -embroiled 32 -trying 34 -reputed 32 -money-supply 32 -multimillion-dollar 32 -usefulness 32 -colonel 32 -macfadden-bartell 4 -retailers 40 -discovery 32 -inquire 64 -fung 12 -fund 36 -route 36 -kodak 14 -coherence 32 -forge 32 -ameritech 4 -moammar 12 -forgn 12 -flashing 32 -slew 32 -honoring 32 -leaped 32 -jerk 32 -kemmons 6 -ncnb 4 -tastiest 32 -catastrophe 32 -paulette 4 -variety 32 -linchpin 32 -undoubtedly 34 -barclays 4 -tenure 32 -mobilize 4 -anderson 12 -montreal 12 -editor-in-chief 32 -bajarin 12 -flaky 32 -comparative 32 -perils 32 -subcommittees 32 -specializing 32 -price-fixing 32 -c 36 -grabbing 32 -estes 8 -schools 36 -westbound 32 -elizabeth 4 -offensive 32 -fights 36 
-double-digit 32 -beni 4 -nine-to-10-month 32 -headed 96 -instability 32 -north 44 -amr 14 -fertilizer 32 -thought 32 -hydro 4 -launched 98 -chaos 32 -amy 4 -amf 4 -gaining 32 -amd 4 -foals 2 -tcf 4 -amc 14 -ama 8 -bakes 4 -amo 12 -launches 36 -societe 4 -swings 36 -fast-food 32 -automation 32 -barrage 32 -inquiry 36 -exemption 32 -puts 32 -hadlock 4 -globalization 32 -seats 46 -takeda 4 -patrons 32 -prepared 32 -explosives 32 -ridgefield 8 -latam 12 -powerhouse 32 -dallas-fort 4 -dorm 6 -vidal 4 -tax-overhaul 32 -dingell 12 -ltd. 4 -octavia 4 -proportional 32 -possibility 32 -profits 38 -customizing 32 -double-a-minus/a-1-plus 32 -gecm 4 -fslic 4 -freely 32 -recreational 32 -alliant 4 -soldiers 32 -adorjan 8 -pole-vaulter 32 -hays 2 -fitzwater 12 -lazarus 4 -long-run 32 -inference 32 -freezeframe 4 -t-lymphotropic 4 -prominently 32 -toll 36 -freeport-mcmoran 4 -infringe 32 -acceptance 32 -conservation 36 -told 36 -record 44 -far-ranging 32 -tagliabue 4 -brick 32 -hanged 32 -accepts 40 -cofide 12 -liquor 34 -barry 12 -videos 32 -spell 32 -telequest 14 -milwaukee 12 -collected 32 -belief 32 -run-down 32 -definitive 34 -cci 12 -keyboards 32 -ambassadors 32 -bank 110 -ccb 12 -ccc 4 -intrigue 32 -cce 12 -evaporation 32 -mining 46 -provinces 32 -politician 32 -mound 32 -billionaire 32 -awake 32 -successor 32 -half-step 32 -duds 32 -identical 32 -waiving 32 -declare 32 -mount 36 -irritants 32 -wales 4 -taste 32 -association 36 -dispute 36 -updated 32 -ticket-purchase 32 -increasing 100 -builds 32 -windhaven 4 -graze 32 -directorial 32 -balls 32 -inexplicably 32 -centuries-old 32 -addicts 32 -inexplicable 32 -sea-land 4 -editorial-page 32 -updates 32 -soar 32 -refinance 32 -soap 32 -symposium 32 -willingness 32 -learn 32 -norm 32 -ever 34 -stately 32 -consequential 32 -editorial 36 -break-up 32 -anticipated 32 -over 110 -peanuts 32 -warns 32 -tasks 36 -adam 4 -remote 32 -technological 34 -adverse 32 -anticipates 32 -statue 32 -paris 12 -nomura 4 -settles 44 -hca 12 -gradison 4 -shindig 32 -erria 4 -third-ranked 32 -edelman 12 -reservations 36 -crude-oil 32 -palestinians 4 -mainstream 32 -biting 32 -thinking 32 -needham 4 -entertained 32 -restrictions 32 -contributions 32 -anaheim 12 -helpful 32 -memo 32 -thunderous 32 -classes 32 -possible 36 -was 108 -farmer-borrowers 32 -roped 32 -postpone 32 -unesco 4 -known 34 -subcabinet-level 32 -counterparts 32 -preserving 32 -basement 32 -pickens 12 -discounts 32 -latent 32 -effort 36 -minneapolis-based 4 -third-highest 32 -s.g. 4 -stardom 32 -possibly 34 -disconnect 32 -quarters 32 -iranian-syrian 4 -founders 32 -deficit-reduction 32 -itek 4 -mobile-home 32 -abalone-processing 32 -nations 36 -item 32 -pete 4 -brusqueness 32 -guerrillas 32 -hiccuped 32 -assertions 34 -barrios 4 -equity 36 -discussions 32 -daylong 32 -u.s.-soviet 4 -mid-40s 32 -minoxidil 32 -devices 36 -overabundance 32 -faulted 32 -hyper-reactivity 32 -jaws 32 -lyle 4 -unsuccessful 32 -waddell 8 -testify 32 -shifting 32 -cooperman 12 -speidell 4 -intention 32 -krug 12 -aln 8 -regarded 32 -audits 36 -glaser 4 -fourth 38 -motel 32 -apollo 4 -oversubscribed 32 -toothpaste 32 -small-appliance 32 -columbia-registered 4 -polypropylene 32 -spree 32 -diverting 32 -e-estimated 64 -pliable 32 -cashing 32 -therapeutic 32 -ldcs 4 -cabinet 32 -flounder 32 -antibody 36 -neo-darwinists 32 -burger 12 -active 32 -knows 36 -might 100 -feb. 
14 -safety-related 32 -weighty 32 -implored 32 -instructive 32 -violated 96 -mad 4 -slips 32 -mac 4 -rarity 32 -professing 32 -depositor 32 -deathbed 32 -parasitical 32 -max 12 -germany 12 -measures 32 -ain 32 -hungary 14 -arkin 4 -ridiculous 32 -germans 6 -s.a. 4 -tax 110 -demonstrate 32 -brand 32 -canadian 14 -tar 4 -it 110 -tap 32 -on-site 34 -kidneys 32 -middletown 8 -tan 4 -remember 34 -veress 4 -centigrade 4 -lewis 12 -duration 32 -appeals 44 -triple 32 -segment 32 -laced 32 -menus 32 -photo-processing 32 -difficult 32 -occupy 32 -long-awaited 32 -seamen 100 -inducements 32 -placate 32 -mass-manufactured 2 -degrees 32 -sidestep 32 -assistance 100 -eager 32 -animals 32 -amplifications 4 -incomplete 32 -persuade 32 -doctors 46 -desmond 6 -calgary 12 -immigrant 32 -tucson 4 -reallocation 32 -chairs 32 -securities-law 32 -retail 110 -board 110 -ave. 4 -schumann 4 -short-haul 32 -block 44 -redmond 4 -generated 32 -retain 32 -elma 4 -tommaso 8 -catholic 4 -says 110 -information-driven 32 -calvert 4 -fuzziness 32 -atmosphere 32 -denationalized 32 -resuscitate 32 -satellite-beamed 32 -talked 32 -f.g. 4 -jeans 34 -legends 32 -merit 32 -food-store 32 -raw-material 36 -acquistion 32 -ledger 32 -umm 4 -peace 36 -katz 4 -assembled 32 -six-packs 32 -bach 4 -tariff 32 -preston 12 -ballet 36 -speyer 4 -eurobonds 12 -specifically 42 -kids 32 -prohibition 32 -donations 32 -prior 106 -northwest 4 -fast-moving 32 -soon 98 -per-capita 10 -represented 32 -back 46 -pertinent 32 -monsanto 6 -labib 4 -preparation 32 -sandra 4 -long-form 32 -forest-products 32 -jose 4 -lancet 4 -rehnquist 4 -rubin 4 -converted 32 -foot 32 -ducks 32 -easton 4 -capital-spending 32 -wallow 32 -bedside 32 -dumez 6 -dilemma 36 -shortage 36 -meddlesome 32 -overhead 32 -heated 32 -stock 110 -afraid 32 -slacks 32 -six-figure 32 -wheelchair 32 -outskirts 32 -clouded 32 -pay-fone 14 -fanatics 32 -softdrink 32 -clev 4 -tasted 32 -savage 44 -capital-gains 32 -regulates 32 -boxes 32 -highways 32 -office-temporary 32 -bing 4 -objectionable 32 -hospital 46 -kaufman 14 -drawings 32 -nordmann 12 -outbursts 32 -self-consciously 32 -top-secret 32 -hastings 2 -deterioration 32 -regulated 32 -status 32 -conspicuous 32 -again 38 -oct. 
6 -justices 40 -virtually 32 -high-limit 32 -tractors 32 -stereo 32 -artitragers 32 -backfired 32 -dated 32 -proper 32 -vital 36 -propel 32 -gourmet 36 -level 100 -malt 32 -dates 32 -bids 34 -deer 32 -belted 32 -structure 32 -blur 32 -boat 36 -index 36 -ogil 4 -fleming 4 -covington 4 -bahamas 4 -doorbells 32 -ate 32 -blue 38 -bronze 32 -messianic 32 -cat-and-mouse 32 -hazards 32 -government-sponsored 34 -appealed 32 -impetus 32 -everyone 46 -pace 36 -provisions 34 -trilling 32 -draw 32 -spiced 32 -reassure 36 -pack 32 -intricacy 32 -renegade 32 -raked 32 -brokers 34 -dram 4 -bidu 4 -welfare 32 -drag 32 -neco 12 -harder 32 -neck 32 -maurier 12 -simpson 12 -superstitions 38 -container 36 -death-penalty 32 -burbank 4 -vindication 32 -reversed 32 -strategic-arms 32 -czm 8 -fully 34 -containment 32 -effective 32 -feed 36 -ineffective 32 -overbuilding 32 -compressor 32 -official 108 -confess 32 -single-engine 32 -bureaucratically 32 -yelling 32 -paratungstate 32 -berlin 4 -ourselves 32 -fpa 8 -why 38 -architects 36 -aware 32 -award 44 -whites 32 -rayner 8 -increased 42 -bti 8 -stronger-than-expected 40 -introductory 32 -reproduce 32 -defended 32 -hyped 32 -rotondo 12 -scandal-ridden 32 -increases 44 -carlo 4 -permit 32 -line 36 -bty 4 -margoshes 4 -garret 4 -illuminate 32 -away 36 -emphasize 36 -global 34 -small-minded 32 -discussed 32 -drake 4 -nearing 32 -liters 32 -deflated 32 -ga.-based 4 -cohens 4 -traveling 32 -cure 32 -profilic 32 -auto-plant 32 -garza 4 -blithe 32 -janet 4 -alternating 32 -palestinian 4 -balkanized 4 -significantly 32 -handsome 32 -model 36 -diego 4 -wake 36 -intones 32 -facilities 100 -unreasonable 32 -finders 32 -raising 44 -belgium 6 -refurbishing 32 -doron 4 -one-tenth 32 -joining 32 -presumptions 32 -ore.-based 4 -television-type 32 -superstations 32 -zico 6 -already 98 -broadcasts 32 -subscriber-line 32 -halliburton 12 -cheers 32 -ribbons 32 -versatile 38 -rope-like 32 -charge 40 -cynthia 4 -routinely 32 -beat 32 -domestic-appliance 32 -summit 42 -volunteered 32 -phased 32 -chry-plym 8 -underwritten 32 -sox 4 -incentive 36 -chartered 36 -sow 32 -ekofisk 4 -heightening 32 -represents 32 -son 34 -pumped 32 -tadeusz 8 -thrown 32 -vocal 36 -kimball 8 -loquacity 32 -catalyst 4 -reno 12 -made 44 -battery-powered 32 -leaned 32 -concluding 32 -bristol 4 -broadcastmail 32 -durenberger 4 -adrar 4 -orleans-style 4 -theresa 8 -government-recognized 32 -apostles 32 -reser 12 -chart-guided 32 -jamming 32 -teeth 36 -into 96 -woodhead 8 -stockpiles 32 -underwear 32 -censor 32 -calif-based 4 -intc 12 -including 34 -roadside 32 -conceived 32 -transformers 4 -tons 32 -nostalgia 32 -conversely 2 -judiciously 32 -poisonous 32 -tony 4 -tong 4 -tone 40 -kouril 12 -construed 32 -existing 32 -plaid 32 -coveted 32 -considered 34 -beseech 32 -frontal 32 -athletes 32 -u.s.-backed 4 -bendjedid 4 -reversion 32 -onetime 32 -memos 36 -cross 44 -airports 32 -hits 32 -goetabanken 4 -persists 32 -eroded 32 -n.j.-based 4 -alarmed 32 -product-liability 32 -misappropriated 32 -uncertain 32 -runners 32 -lowering 34 -assigned 34 -virtual 32 -mcauliffe 8 -ceo 4 -foreign-debt 32 -clothing 32 -algeria 12 -aviator 32 -waukegan 8 -creation 32 -liquid 32 -weighted 32 -cents 32 -imasco 4 -afghan 4 -stablilizers 32 -defense-electronics 32 -tyranny 32 -waves 32 -rust 36 -advest 6 -lee 14 -mercury 4 -crisanti 4 -semi-negative 32 -heart-shaped 32 -juries 32 -representation 32 -unleaded 34 -calder 4 -perot 4 -harris 14 -hint 32 -celestre 8 -governing 36 -silent 36 -animator 32 -laptop 32 -congressmen 32 -outgrowth 
32 -adequately 32 -fluidly 32 -abortions 36 -clucks 32 -munoz 12 -supplying 32 -cities/abc 4 -rapport 32 -amsterdam 8 -romance 34 -folly 32 -hasbro 4 -novels 36 -meteoric 32 -give 44 -statesmanlike 32 -pages 32 -urgings 32 -lindner 8 -blitz 32 -hardest 2 -guinea 32 -gonzalez 4 -kidder 14 -clear 44 -old-fashioned 32 -belt 36 -prove 34 -gain 44 -dangling 32 -reproducing 32 -kevin 6 -shotgun 32 -bell 46 -bramco 4 -nondurable 40 -atfc 8 -outcry 32 -clean 36 -tools 36 -television-newsroom 4 -supersedes 32 -reinhardt 4 -santa 12 -houghton 4 -political 98 -unified 36 -pounds 32 -accustomed 32 -presence 32 -achieving 34 -escalate 4 -unimaginable 32 -managers 36 -wits 32 -coffee 46 -mistress 32 -resorts 44 -marketeering 32 -intends 32 -kohlberg 4 -high-intensity 32 -polk 4 -trade 108 -excludes 32 -polo 36 -parody 32 -armageddon 4 -excluded 32 -stresses 32 -noisy 32 -clubs 32 -outlook 36 -absolve 32 -pipe 32 -chagrined 34 -pacts 36 -guards 32 -abrasives 32 -guaranty 4 -drusilla 6 -mailbox 32 -computer-aided 32 -authorization 32 -interbank 36 -excise 32 -accomplishments 32 -jerry 4 -alderman 4 -hootch 32 -whoop 32 -sported 32 -feds 32 -operationally 32 -suggest 32 -impelled 32 -withered 32 -tripling 32 -dinette 32 -libya 4 -chattanooga 8 -sailer 8 -accelerator 32 -coolidge 12 -dollars 96 -lorimar-telepictures 8 -disagreeing 32 -killing 32 -nazi 4 -arturo 4 -farther 32 -toyota 8 -fundamentals 32 -democrats 14 -slogan 32 -allow 32 -climbing 32 -freres 4 -aegean 4 -routes 32 -nicholas 4 -mom 32 -convenient 32 -abdicating 32 -vous 36 -fascination 32 -routed 4 -stringfellow 4 -analysis 32 -dubofsky 4 -corps 32 -priest 32 -lessening 32 -jungle 32 -fort 12 -himself 32 -reflect 96 -latest 44 -deterring 32 -chirac 4 -appoint 32 -backs 32 -perfection 32 -ford 14 -zooming 32 -problem 40 -gnp 4 -form 44 -exported 32 -caution 32 -wheeling 12 -qualify 36 -class 36 -fork 32 -boutique 34 -unsharklike 32 -ravages 32 -federal 110 -shake 32 -blustein 4 -emigrate 32 -integon 6 -shaky 32 -peace-loving 32 -silo 4 -screamers 32 -sufficient 34 -do-it-yourself 34 -astonishing 32 -seekers 32 -stimulating 32 -around 44 -selwyn 4 -danube 4 -fondy 8 -lean 34 -married 36 -guardia 4 -rudani 4 -consumption 98 -expansion-related 32 -leon 6 -khoo 4 -smuggling 32 -edmund 4 -intermediary 32 -adair 12 -anything 32 -paving 32 -intertwined 32 -cheaper 36 -congreso 4 -fonda 4 -ing 4 -cynical 32 -journalist 32 -ind 4 -oil 110 -rest 36 -bearing 32 -keran 4 -pih 4 -inn 4 -windfall 32 -ink 32 -mcentee 8 -ini 4 -fails 44 -headaches 32 -foreign 46 -divest 32 -int 4 -ins 4 -secord 12 -chemical 44 -embarked 32 -babylonian 2 -motorbiking 32 -journalism 32 -devaluation 32 -baghdad 4 -uncollectable 32 -dnb 4 -man 36 -pig 32 -verified 32 -barakat 4 -writing 32 -fungi 36 -mar 4 -islam 4 -informative 32 -forefathers 32 -withdrawals 32 -submersible 96 -arctic 4 -hoping 34 -some 110 -catskills 4 -close 44 -rial 8 -llx 4 -damp 32 -reopen 32 -presumably 34 -alcohol-related 32 -silver 36 -australian-based 4 -stun 4 -faster-growing 32 -circus 32 -pronouncing 32 -figuring 32 -pierced 32 -rentals 32 -fixed 32 -annuities 32 -residual 32 -staged 32 -labor 46 -cng 4 -southwest 14 -experimental 32 -bode 32 -excel 6 -susan 6 -affects 32 -eisenstadt 8 -discontinuing 32 -covertible 4 -bonus 32 -madcap 10 -expertise 32 -professional 38 -disappears 32 -uniforms 32 -zehli 4 -bogeymen 32 -overproducing 32 -yields 34 -flights 36 -laszlo 4 -confidential 34 -fasteners 32 -boycotts 32 -essentially 32 -assignment 32 -places 34 -mid-may 32 -well-done 32 -trimmed 96 -nov. 
14 -lurks 32 -climax 32 -clara 4 -curb 36 -snowstorms 32 -s 36 -discretion 32 -adm. 6 -list 32 -asserting 32 -insolvency 32 -unexpected 32 -exponent 32 -transports 32 -yourself 32 -lacy 38 -reagan 14 -wildly 32 -corruption 32 -impact 36 -singapore-based 4 -flx 4 -fly 32 -pont 4 -frey 12 -doctorate 32 -hereabouts 32 -pretoria 4 -perhaps 46 -drum-shaped 32 -suburban 32 -flu 32 -mainframes 32 -offered 96 -clossey 4 -aged 32 -disorders 32 -mysteries 32 -averaged 32 -sneakers 32 -anatoly 4 -tighter 32 -vista-based 4 -spontaneously 32 -detrick 4 -aftershocks 32 -proximity 32 -truffles 38 -memory-chip 32 -fundamentally 32 -hatcher 4 -sorry 32 -pursued 32 -layoff 32 -tire 46 -letters 40 -microphone 32 -outside 102 -b-week 16 -levels 36 -hollywood 4 -mascots 32 -thrived 32 -regrettable 32 -multiplied 32 -afonso 4 -storming 32 -morin 4 -goldsmith 12 -nato 4 -carrington 4 -ziemer 8 -branford 4 -restitution 32 -results 58 -mmr 8 -returns 44 -drummer-athletes 32 -kleenex 4 -reporting 32 -nutritious 32 -decade 32 -declassified 32 -aficionados 32 -fence-sitting 32 -goetzl 12 -absent 32 -grabs 32 -washington-based 6 -news-service 32 -nickeling 32 -elected 100 -proponents 34 -roughed 32 -impeding 32 -teacher 32 -teaches 32 -critique 32 -gangster-and-love 32 -effigy 32 -subjects 32 -inclined 32 -cotton 44 -sensibility 32 -irons 32 -myl 8 -irony 32 -unfair 32 -tronic 4 -cowboy 32 -feature 32 -squad 32 -kicking 32 -jaap 4 -spin 32 -committing 32 -neves 4 -erratically 4 -slowest 32 -saturated 32 -raw-materials 32 -nuclear-weapons 32 -hesitant 32 -iranians 4 -scrap 32 -reduced 96 -damon 4 -diceon 14 -nurtured 32 -sonnenblick 4 -mae 4 -apparatus 32 -organizing 32 -sacrosanct 32 -reduces 36 -two-decade-old 32 -sachs/kidder 4 -sufferers 32 -full-scale 32 -mediation 32 -commitment 96 -sat 32 -hired 32 -accommodations 32 -saw 32 -european-looking 4 -narrative 32 -dependable 32 -greenmail 32 -started 32 -bleak 32 -say 100 -sad 42 -incurring 32 -sag 32 -rabushka 12 -lampooned 32 -fiasco 36 -sam 12 -san 12 -sao 4 -sai 8 -examining 32 -preview 32 -wear 32 -paramount 36 -finely 32 -trouble 36 -tcw 8 -s.c. 
4 -nebulous 32 -skeptics 34 -whinney 4 -two-month 32 -illegal 32 -tubes 32 -deficiencies 32 -temperature 32 -weak 38 -stoicism 32 -industrie 4 -undershoot 32 -surpluses 32 -accomplishes 32 -starter 36 -discouraged 32 -ratepayers 32 -humphries 8 -footlick 4 -attracts 32 -crisis 36 -from 110 -x-rayed 4 -errors 32 -christianity 4 -inflationary 32 -carry-forwards 32 -assisted 34 -riots 32 -compare 32 -objects 32 -french-sounding 4 -train 32 -spawn 32 -kiichi 4 -diapers 32 -daffynition 8 -forwarders 32 -pix 4 -roots 32 -saudia 4 -bathing 32 -percent 40 -moonlighting 4 -sunbathing 32 -preoccupied 32 -refers 32 -skims 32 -frisch 12 -disputed 36 -sean 4 -saudis 4 -constitutes 32 -decorous 32 -conceal 32 -cacophonous 32 -machinery 38 -scoff 32 -introspective 32 -teradata 8 -logistics 32 -ammunition 32 -refuse 32 -francisville 8 -paralyzed 32 -write-down 32 -baltimore-based 6 -diverted 40 -japanese-designed 4 -lazard 4 -pessimistic 36 -unpleasant 32 -gousha 12 -vigor 32 -acquaint 32 -naval 40 -matrimony 32 -major 46 -egyptian 4 -biggest 38 -gringo 32 -western 46 -billion 36 -knowingly 32 -choate 4 -evenhandedly 32 -tax-cutting 32 -heyday 32 -further 46 -absorbing 4 -explicitly 32 -potatoes 32 -dollar-denominated 32 -counsels 32 -bankers 102 -fibers 32 -finesse 32 -diagrams 32 -coffee-growing 32 -believes 96 -nyt 8 -carnivorous 32 -listeners 32 -stadiums 32 -underestimate 32 -nyn 8 -recyclers 32 -phyb 8 -precipitous 32 -standby 32 -nye 4 -finkielstain 12 -stoughton 8 -retaliation 96 -believed 32 -cards 36 -commit 32 -bent 32 -motive 32 -crow 36 -et 32 -inexperienced 32 -immigrants 32 -revenue-raising 32 -campaigns 36 -adviser 36 -witty 32 -panache 2 -bend 36 -cabinet-level 32 -obligations 32 -concentration 32 -first-ranked 32 -world-wide 104 -complaints 32 -swearingen 4 -kline 4 -corporately 32 -appointee 32 -siegels 4 -nest 36 -topped 32 -pakistanis 4 -couldn 36 -streamline 32 -a-330 4 -rumors 32 -parenting 32 -franyo 4 -op-ed 32 -coal-burning 32 -unmitigated 32 -assume 34 -celebrity-laden 32 -adolescents 32 -rockford 6 -frequency 32 -t.nor 4 -witchlike 32 -inspected 32 -sellers 32 -mosle 4 -ious 4 -online 4 -made-for-television 32 -heroic 32 -marina 4 -agent 32 -fidel 4 -readied 32 -vierdanck 4 -hausfeld 12 -mcdonald 12 -graduates 32 -celebrate 32 -harpsichord 32 -gentlemanly 32 -jolted 32 -shantytown 32 -ministry 36 -fide 32 -governmental 32 -graduated 32 -engineer 32 -wholesale 36 -cash-and-stock 32 -thumb 32 -ballistic 32 -m.b.a. 4 -deliverable 32 -robitussin 4 -well-paid 32 -harden 36 -even 46 -tennis-racquets 32 -milburn 4 -iss 40 -coin 32 -sucralose 34 -seniority 32 -depicting 32 -torrent 32 -three-quarters 32 -discord 32 -return-on-investment 32 -beach 36 -obtained 32 -resent 32 -vodavi 4 -ata 4 -heileman 8 -mandated 32 -matched 32 -britain 14 -excessive 32 -inflation 46 -weakness 32 -advises 32 -tacit 32 -tcoma 4 -lotteries 32 -liquids 32 -swingers 4 -ransom 32 -griswold 8 -unshaken 32 -investigative 32 -formal 32 -dabbling 32 -college 46 -advised 32 -precision 32 -divestitures 32 -reasonable 32 -downgrading 32 -jockey 36 -wstf 8 -homeowner 32 -garrasi 4 -byzantine 4 -litigators 32 -red-haired 32 -hangover 32 -subpoena 32 -elvis 4 -side 100 -carajas 4 -pre-apprentices 32 -struggles 32 -authorities 98 -anti-lawyer 32 -compulsively 32 -alberta 4 -costly 34 -struggling 32 -florio 8 -ok. 
4 -attache 32 -pedal 32 -vigorous 32 -neanderthals 4 -reportedly 32 -senator 104 -yutaka 4 -carted 32 -publication 32 -securities-services 32 -catastrophic 32 -misguided 32 -fee-for-service 32 -furthermore 34 -abbes 4 -cswc 4 -unforeseen 32 -harmed 32 -steel-ingot 32 -accrue 32 -accost 32 -white-collar 32 -lsi 12 -terrorism 32 -sweet 36 -dancing 32 -elkay 4 -stages 32 -regions 40 -swiss-based 4 -uk 4 -uh 32 -formerly 98 -ut 4 -anti-black 32 -thatcher 4 -us 34 -up 46 -terrorist 32 -j&l 4 -expression 32 -non-recoverable 32 -taffeta 32 -jonathan 4 -litchfield 4 -h.m. 4 -disconnected 32 -matters 36 -oki 4 -rusch 4 -chunk 32 -chamber 36 -limitations 32 -leverage 32 -whims 32 -dixieland 4 -adequate 32 -analyst 100 -attachments 32 -friendships 32 -confrontationism 32 -rugged 32 -bamberger 12 -computervision 14 -maintenance 32 -melamed 12 -multifamily 32 -soviets 4 -decisive 32 -investment-bank 64 -hires 32 -undertook 32 -academia 32 -outline 36 -video 38 -rosenthal 4 -economies 32 -biocraft 10 -widening 44 -falsify 32 -holiday 46 -leahy 12 -confectionery 32 -meierfeld 8 -cooking 32 -gtx 8 -dryburgh 8 -spigot 32 -nearly 46 -distasteful 32 -hamburger 32 -crop 32 -bart 4 -preoccupies 32 -installations 32 -heightened 32 -headlining 32 -blissful 32 -questioner 32 -squander 32 -dispersed 32 -soldier 32 -edged 32 -enacted 32 -superscope 4 -cgp 4 -freed 32 -reasonably 32 -clarence 2 -bohai 4 -indignities 32 -amos 12 -cge 4 -edges 4 -shaken 32 -peelings 32 -amok 32 -dancers 32 -sheen 4 -dominating 32 -foreign-policy 32 -hire 32 -freer 32 -none-too-subtle 32 -kra 8 -centurion 4 -chileans 4 -lafayette 8 -station 36 -education 46 -nauslar 12 -decides 32 -christmas 4 -overproduction 32 -negotiated 32 -g. 4 -cie. 6 -discover 32 -rohatyn 4 -vaccines 32 -decided 36 -conciliatory 32 -accurate 32 \ No newline at end of file diff --git a/resources/nltk_data/tokenizers/punkt_tab/english/sent_starters.txt b/resources/nltk_data/tokenizers/punkt_tab/english/sent_starters.txt deleted file mode 100644 index 478fd833..00000000 --- a/resources/nltk_data/tokenizers/punkt_tab/english/sent_starters.txt +++ /dev/null @@ -1,39 +0,0 @@ -most -he -since -so -both -these -it -nevertheless -this -indeed -however -instead -under -similarly -some -though -while -when -in -despite -although -nonetheless -thus -there -if -the -nor -separately -moreover -but -they -yet -many -according -sales -among -meanwhile -even -i \ No newline at end of file diff --git a/scripts/generate.sh b/scripts/generate.sh deleted file mode 100644 index be0bee9b..00000000 --- a/scripts/generate.sh +++ /dev/null @@ -1 +0,0 @@ -python3 -m graphgen.generate --config_file graphgen/configs/graphgen_config.yaml --output_dir cache/ diff --git a/scripts/judge.sh b/scripts/judge.sh deleted file mode 100644 index f6fc134e..00000000 --- a/scripts/judge.sh +++ /dev/null @@ -1,2 +0,0 @@ -python3 -m graphgen.judge --input cache \ - --output cache/output/new_graph.graphml \ diff --git a/setup.py b/setup.py index 3dee7f8b..2b30f103 100644 --- a/setup.py +++ b/setup.py @@ -1,30 +1,31 @@ +# pylint: skip-file import os from setuptools import find_packages, setup pwd = os.path.dirname(__file__) -version_file = 'graphgen/version.py' +version_file = "graphgen/_version.py" def readme(): - with open(os.path.join(pwd, 'README.md'), encoding='utf-8') as f: + with open(os.path.join(pwd, "README.md"), encoding="utf-8") as f: content = f.read() return content def get_version(): - with open(os.path.join(pwd, version_file), 'r') as f: - exec(compile(f.read(), version_file, 'exec')) - 
return locals()['__version__'] + with open(os.path.join(pwd, version_file), "r") as f: + exec(compile(f.read(), version_file, "exec")) + return locals()["__version__"] def read_requirements(): lines = [] - with open('requirements.txt', 'r') as f: + with open("requirements.txt", "r") as f: for line in f.readlines(): - if line.startswith('#'): + if line.startswith("#"): continue - if 'textract' in line: + if "textract" in line: continue if len(line) > 0: lines.append(line) @@ -33,32 +34,27 @@ def read_requirements(): install_packages = read_requirements() -if __name__ == '__main__': +if __name__ == "__main__": setup( - name='graphg', + name="graphg", version=get_version(), - url='https://github.com/open-sciencelab/GraphGen', - description= # noqa E251 - 'GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation', # noqa E501 + url="https://github.com/open-sciencelab/GraphGen", + description="GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation", long_description=readme(), - long_description_content_type='text/markdown', - author='open-sciencelab', - author_email='open-sciencelab@pjlab.org.cn', + long_description_content_type="text/markdown", + author="open-sciencelab", + author_email="open-sciencelab@pjlab.org.cn", packages=find_packages(exclude=["models"]), - package_data={ - 'GraphGen': ['configs/*'] - }, + package_data={"GraphGen": ["configs/*"]}, include_package_data=True, install_requires=install_packages, classifiers=[ - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: 3.12', - 'Intended Audience :: Developers', - 'Intended Audience :: Education', - 'Intended Audience :: Science/Research', + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", ], - entry_points={'console_scripts': ['graphgen=graphgen.generate:main']}, + entry_points={"console_scripts": ["graphg=graphgen.generate:main"]}, ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/e2e_tests/__init__.py b/tests/e2e_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/e2e_tests/conftest.py b/tests/e2e_tests/conftest.py new file mode 100644 index 00000000..39cc4100 --- /dev/null +++ b/tests/e2e_tests/conftest.py @@ -0,0 +1,63 @@ +import json +import os +import subprocess +from pathlib import Path + + +def run_generate_test(tmp_path: Path, config_name: str): + """ + Run the generate test with the given configuration file and temporary path. + + Args: + tmp_path: pytest temporary path + config_name: configuration file name (e.g. 
"atomic_config.yaml") + + Returns: + tuple: (run_folder, json_files[0]) + """ + repo_root = Path(__file__).resolve().parents[2] + os.chdir(repo_root) + + config_path = repo_root / "graphgen" / "configs" / config_name + output_dir = tmp_path / "output" + output_dir.mkdir(parents=True, exist_ok=True) + + result = subprocess.run( + [ + "python", + "-m", + "graphgen.generate", + "--config_file", + str(config_path), + "--output_dir", + str(output_dir), + ], + capture_output=True, + text=True, + check=False, + ) + assert result.returncode == 0, f"Script failed with error: {result.stderr}" + + data_root = output_dir / "data" / "graphgen" + assert data_root.exists(), f"{data_root} does not exist" + run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True) + assert run_folders, f"No run folders found in {data_root}" + run_folder = run_folders[0] + + config_saved = run_folder / "config.yaml" + assert config_saved.exists(), f"{config_saved} not found" + + json_files = list(run_folder.glob("*.json")) + assert json_files, f"No JSON output found in {run_folder}" + + log_files = list(run_folder.glob("*.log")) + assert log_files, "No log file generated" + + with open(json_files[0], "r", encoding="utf-8") as f: + data = json.load(f) + assert ( + isinstance(data, list) and len(data) > 0 + ), "JSON output is empty or not a list" + + return run_folder, json_files[0] + diff --git a/tests/e2e_tests/test_generate_aggregated.py b/tests/e2e_tests/test_generate_aggregated.py new file mode 100644 index 00000000..faebf3ac --- /dev/null +++ b/tests/e2e_tests/test_generate_aggregated.py @@ -0,0 +1,7 @@ +from pathlib import Path + +from .conftest import run_generate_test + + +def test_generate_aggregated(tmp_path: Path): + run_generate_test(tmp_path, "aggregated_config.yaml") diff --git a/tests/e2e_tests/test_generate_atomic.py b/tests/e2e_tests/test_generate_atomic.py new file mode 100644 index 00000000..26e47532 --- /dev/null +++ b/tests/e2e_tests/test_generate_atomic.py @@ -0,0 +1,7 @@ +from pathlib import Path + +from .conftest import run_generate_test + + +def test_generate_atomic(tmp_path: Path): + run_generate_test(tmp_path, "atomic_config.yaml") diff --git a/tests/e2e_tests/test_generate_cot.py b/tests/e2e_tests/test_generate_cot.py new file mode 100644 index 00000000..b1ee74d9 --- /dev/null +++ b/tests/e2e_tests/test_generate_cot.py @@ -0,0 +1,7 @@ +from pathlib import Path + +from .conftest import run_generate_test + + +def test_generate_cot(tmp_path: Path): + run_generate_test(tmp_path, "cot_config.yaml") diff --git a/tests/e2e_tests/test_generate_multi_hop.py b/tests/e2e_tests/test_generate_multi_hop.py new file mode 100644 index 00000000..709f5918 --- /dev/null +++ b/tests/e2e_tests/test_generate_multi_hop.py @@ -0,0 +1,7 @@ +from pathlib import Path + +from .conftest import run_generate_test + + +def test_generate_multi_hop(tmp_path: Path): + run_generate_test(tmp_path, "multi_hop_config.yaml") diff --git a/tests/e2e_tests/test_generate_vqa.py b/tests/e2e_tests/test_generate_vqa.py new file mode 100644 index 00000000..796bc286 --- /dev/null +++ b/tests/e2e_tests/test_generate_vqa.py @@ -0,0 +1,7 @@ +from pathlib import Path + +from .conftest import run_generate_test + + +def test_generate_vqa(tmp_path: Path): + run_generate_test(tmp_path, "vqa_config.yaml") diff --git a/tests/integration_tests/__init__.py b/tests/integration_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration_tests/models/llm/api/test_http_client.py 
b/tests/integration_tests/models/llm/api/test_http_client.py
new file mode 100644
index 00000000..d2996d1c
--- /dev/null
+++ b/tests/integration_tests/models/llm/api/test_http_client.py
@@ -0,0 +1,143 @@
+# pylint: disable=protected-access
+import math
+
+import pytest
+
+from graphgen.models.llm.api.http_client import HTTPClient
+
+
+class DummyTokenizer:
+    def encode(self, text: str):
+        # simple tokenization: split on spaces
+        return text.split()
+
+
+class _MockResponse:
+    def __init__(self, data):
+        self._data = data
+
+    def raise_for_status(self):
+        return None
+
+    async def json(self):
+        return self._data
+
+
+class _PostCtx:
+    def __init__(self, data):
+        self._resp = _MockResponse(data)
+
+    async def __aenter__(self):
+        return self._resp
+
+    async def __aexit__(self, exc_type, exc, tb):
+        return False
+
+
+class MockSession:
+    def __init__(self, data):
+        self._data = data
+        self.closed = False
+
+    def post(self, *args, **kwargs):
+        return _PostCtx(self._data)
+
+    async def close(self):
+        self.closed = True
+
+
+class DummyLimiter:
+    def __init__(self):
+        self.calls = []
+
+    async def wait(self, *args, **kwargs):
+        self.calls.append((args, kwargs))
+
+
+@pytest.mark.asyncio
+async def test_generate_answer_records_usage_and_uses_limiters():
+    # arrange
+    data = {
+        "choices": [{"message": {"content": "Hello world!"}}],
+        "usage": {"prompt_tokens": 3, "completion_tokens": 2, "total_tokens": 5},
+    }
+    client = HTTPClient(model="m", base_url="http://test")
+    client._session = MockSession(data)
+    client.tokenizer = DummyTokenizer()
+    client.system_prompt = "sys"
+    client.temperature = 0.0
+    client.top_p = 1.0
+    client.max_tokens = 10
+    client.filter_think_tags = lambda s: s.replace("<think>", "").replace(
+        "</think>", ""
+    )
+    rpm = DummyLimiter()
+    tpm = DummyLimiter()
+    client.rpm = rpm
+    client.tpm = tpm
+    client.request_limit = True
+
+    # act
+    out = await client.generate_answer("hi", history=["u1", "a1"])
+
+    # assert
+    assert out == "Hello world!"
+ assert client.token_usage[-1] == { + "prompt_tokens": 3, + "completion_tokens": 2, + "total_tokens": 5, + } + assert len(rpm.calls) == 1 + assert len(tpm.calls) == 1 + + +@pytest.mark.asyncio +async def test_generate_topk_per_token_parses_logprobs(): + # arrange + # create two token items with top_logprobs + data = { + "choices": [ + { + "logprobs": { + "content": [ + { + "token": "A", + "logprob": math.log(0.6), + "top_logprobs": [ + {"token": "A", "logprob": math.log(0.6)}, + {"token": "B", "logprob": math.log(0.4)}, + ], + }, + { + "token": "B", + "logprob": math.log(0.2), + "top_logprobs": [ + {"token": "B", "logprob": math.log(0.2)}, + {"token": "C", "logprob": math.log(0.8)}, + ], + }, + ] + } + } + ] + } + client = HTTPClient(model="m", base_url="http://test") + client._session = MockSession(data) + client.tokenizer = DummyTokenizer() + client.system_prompt = None + client.temperature = 0.0 + client.top_p = 1.0 + client.max_tokens = 10 + client.topk_per_token = 2 + + # act + tokens = await client.generate_topk_per_token("hi", history=[]) + + # assert + assert len(tokens) == 2 + # check probabilities and top_candidates + assert abs(tokens[0].prob - 0.6) < 1e-9 + assert abs(tokens[1].prob - 0.2) < 1e-9 + assert len(tokens[0].top_candidates) == 2 + assert tokens[0].top_candidates[0].text == "A" + assert tokens[0].top_candidates[1].text == "B" diff --git a/tests/integration_tests/models/llm/api/test_ollama_client.py b/tests/integration_tests/models/llm/api/test_ollama_client.py new file mode 100644 index 00000000..b20bc44c --- /dev/null +++ b/tests/integration_tests/models/llm/api/test_ollama_client.py @@ -0,0 +1,91 @@ +# pylint: disable=redefined-outer-name +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from graphgen.models import OllamaClient + + +# ----------------- fixture ----------------- +@pytest.fixture +def mock_ollama_pkg(): + """ + mock ollama + """ + ollama_mock = MagicMock() + ollama_mock.AsyncClient = AsyncMock + with patch.dict("sys.modules", {"ollama": ollama_mock}): + yield ollama_mock + + +@pytest.fixture +def ollama_client(mock_ollama_pkg) -> OllamaClient: + """ + Returns a default-configured OllamaClient with client.chat mocked + """ + cli = OllamaClient(model="gemma3", base_url="http://test:11434") + cli.tokenizer = MagicMock() + cli.tokenizer.encode = MagicMock(side_effect=lambda x: x.split()) + cli.client.chat = AsyncMock( + return_value={ + "message": {"content": "hi from ollama"}, + "prompt_eval_count": 10, + "eval_count": 5, + } + ) + return cli + + +@pytest.mark.asyncio +async def test_generate_answer_basic(ollama_client: OllamaClient): + ans = await ollama_client.generate_answer("hello") + assert ans == "hi from ollama" + ollama_client.client.chat.assert_awaited_once() + call = ollama_client.client.chat.call_args + assert call.kwargs["model"] == "gemma3" + assert call.kwargs["messages"][-1]["content"] == "hello" + assert call.kwargs["stream"] is False + + +@pytest.mark.asyncio +async def test_generate_answer_with_history(ollama_client: OllamaClient): + hist = [{"role": "user", "content": "prev"}] + await ollama_client.generate_answer("now", history=hist) + msgs = ollama_client.client.chat.call_args.kwargs["messages"] + assert msgs[-2]["content"] == "prev" + assert msgs[-1]["content"] == "now" + + +@pytest.mark.asyncio +async def test_token_usage_recorded(ollama_client: OllamaClient): + await ollama_client.generate_answer("test") + assert len(ollama_client.token_usage) == 1 + assert ollama_client.token_usage[0]["prompt_tokens"] == 
10 + assert ollama_client.token_usage[0]["completion_tokens"] == 5 + assert ollama_client.token_usage[0]["total_tokens"] == 15 + + +@pytest.mark.asyncio +async def test_rpm_tpm_limiter_called(ollama_client: OllamaClient): + ollama_client.request_limit = True + with patch.object(ollama_client.rpm, "wait", AsyncMock()) as rpm_mock, patch.object( + ollama_client.tpm, "wait", AsyncMock() + ) as tpm_mock: + + await ollama_client.generate_answer("limited") + rpm_mock.assert_awaited_once_with(silent=True) + tpm_mock.assert_awaited_once_with( + ollama_client.max_tokens + len("limited".split()), silent=True + ) + + +def test_import_error_when_ollama_missing(): + with patch.dict("sys.modules", {"ollama": None}): + with pytest.raises(ImportError, match="Ollama SDK is not installed"): + OllamaClient() + + +@pytest.mark.asyncio +async def test_generate_inputs_prob_not_implemented(ollama_client: OllamaClient): + with pytest.raises(NotImplementedError): + await ollama_client.generate_inputs_prob("any") diff --git a/tests/integration_tests/models/llm/local/test_hf_wrapper.py b/tests/integration_tests/models/llm/local/test_hf_wrapper.py new file mode 100644 index 00000000..ae23ce11 --- /dev/null +++ b/tests/integration_tests/models/llm/local/test_hf_wrapper.py @@ -0,0 +1,43 @@ +from unittest.mock import MagicMock + +import pytest + +from graphgen.models.llm.local.hf_wrapper import HuggingFaceWrapper + + +@pytest.fixture(autouse=True) +def mock_hf(monkeypatch): + mock_tokenizer = MagicMock() + mock_tokenizer.pad_token = None + mock_tokenizer.eos_token = "" + mock_tokenizer.eos_token_id = 0 + mock_tokenizer.decode.return_value = "hello" + mock_tokenizer.encode.return_value = [1, 2, 3] + monkeypatch.setattr( + "graphgen.models.llm.local.hf_wrapper.AutoTokenizer.from_pretrained", + lambda *a, **kw: mock_tokenizer, + ) + + mock_model = MagicMock() + mock_model.device = "cpu" + mock_model.generate.return_value = MagicMock( + __getitem__=lambda s, k: [0, 1, 2, 3], shape=(1, 4) + ) + mock_model.eval.return_value = None + monkeypatch.setattr( + "graphgen.models.llm.local.hf_wrapper.AutoModelForCausalLM.from_pretrained", + lambda *a, **kw: mock_model, + ) + + monkeypatch.setattr( + "graphgen.models.llm.local.hf_wrapper.torch.no_grad", MagicMock() + ) + + return mock_tokenizer, mock_model + + +@pytest.mark.asyncio +async def test_generate_answer(): + wrapper = HuggingFaceWrapper("fake-model") + result = await wrapper.generate_answer("hi") + assert isinstance(result, str) diff --git a/tests/integration_tests/models/partitioner/test_bfs_partitioner.py b/tests/integration_tests/models/partitioner/test_bfs_partitioner.py new file mode 100644 index 00000000..48558cc1 --- /dev/null +++ b/tests/integration_tests/models/partitioner/test_bfs_partitioner.py @@ -0,0 +1,81 @@ +import tempfile + +import pytest + +from graphgen.bases.datatypes import Community +from graphgen.models import BFSPartitioner, NetworkXStorage + + +@pytest.mark.asyncio +async def test_empty_graph(): + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="empty") + partitioner = BFSPartitioner() + communities = await partitioner.partition(storage, max_units_per_community=5) + assert communities == [] + + +@pytest.mark.asyncio +async def test_single_node(): + nodes = [("A", {"desc": "alone"})] + edges = [] + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="single_node") + + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + for src, 
tgt, edata in edges: + await storage.upsert_edge(src, tgt, edata) + + partitioner = BFSPartitioner() + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=5 + ) + assert len(communities) == 1 + assert communities[0].nodes == ["A"] + assert communities[0].edges == [] + + +@pytest.mark.asyncio +async def test_small_graph(): + """ + 0 - 1 - 2 + | | | + 3 - 4 - 5 + 6 nodes & 7 edges, max_units=4 => at least 3 communities + """ + nodes = [(str(i), {"desc": f"node{i}"}) for i in range(6)] + edges = [ + ("0", "1", {"desc": "e01"}), + ("1", "2", {"desc": "e12"}), + ("0", "3", {"desc": "e03"}), + ("1", "4", {"desc": "e14"}), + ("2", "5", {"desc": "e25"}), + ("3", "4", {"desc": "e34"}), + ("4", "5", {"desc": "e45"}), + ] + + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="small_graph") + + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + for src, tgt, edata in edges: + await storage.upsert_edge(src, tgt, edata) + + partitioner = BFSPartitioner() + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=4 + ) + + assert len(communities) <= 5 + + all_nodes = set() + all_edges = set() + for c in communities: + assert len(c.nodes) + len(c.edges) <= 4 + all_nodes.update(c.nodes) + all_edges.update(c.edges) + + assert all_nodes == {str(i) for i in range(6)} + assert len(all_edges) == 7 diff --git a/tests/integration_tests/models/partitioner/test_dfs_partitioner.py b/tests/integration_tests/models/partitioner/test_dfs_partitioner.py new file mode 100644 index 00000000..536e139a --- /dev/null +++ b/tests/integration_tests/models/partitioner/test_dfs_partitioner.py @@ -0,0 +1,89 @@ +import tempfile + +import pytest + +from graphgen.bases.datatypes import Community +from graphgen.models import DFSPartitioner, NetworkXStorage + + +@pytest.mark.asyncio +async def test_empty_graph(): + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage( + working_dir=tmpdir, + namespace="empty", + ) + partitioner = DFSPartitioner() + communities = await partitioner.partition(storage, max_units_per_community=5) + assert communities == [] + + +@pytest.mark.asyncio +async def test_single_node(): + nodes = [("A", {"desc": "alone"})] + edges = [] + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage( + working_dir=tmpdir, + namespace="single_node", + ) + + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + for src, tgt, edata in edges: + await storage.upsert_edge(src, tgt, edata) + + partitioner = DFSPartitioner() + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=5 + ) + assert len(communities) == 1 + assert communities[0].nodes == ["A"] + assert communities[0].edges == [] + + +@pytest.mark.asyncio +async def test_small_graph(): + """ + 0 - 1 - 2 + | | | + 3 - 4 - 5 + 6 nodes & 7 edges, max_units=4 => 3 communities + """ + nodes = [(str(i), {"desc": f"node{i}"}) for i in range(6)] + edges = [ + ("0", "1", {"desc": "e01"}), + ("1", "2", {"desc": "e12"}), + ("0", "3", {"desc": "e03"}), + ("1", "4", {"desc": "e14"}), + ("2", "5", {"desc": "e25"}), + ("3", "4", {"desc": "e34"}), + ("4", "5", {"desc": "e45"}), + ] + + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage( + working_dir=tmpdir, + namespace="small_graph", + ) + + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + for src, tgt, edata in edges: + await storage.upsert_edge(src, 
tgt, edata) + + partitioner = DFSPartitioner() + + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=4 + ) + + assert len(communities) <= 5 + all_nodes = set() + all_edges = set() + for c in communities: + assert len(c.nodes) + len(c.edges) <= 4 + all_nodes.update(c.nodes) + all_edges.update(c.edges) + assert all_nodes == {str(i) for i in range(6)} + assert len(all_edges) == 7 diff --git a/tests/integration_tests/models/partitioner/test_ece_partitioner.py b/tests/integration_tests/models/partitioner/test_ece_partitioner.py new file mode 100644 index 00000000..93fc2ea1 --- /dev/null +++ b/tests/integration_tests/models/partitioner/test_ece_partitioner.py @@ -0,0 +1,202 @@ +import tempfile + +import pytest + +from graphgen.bases.datatypes import Community +from graphgen.models import ECEPartitioner, NetworkXStorage + + +@pytest.mark.asyncio +async def test_ece_empty_graph(): + """ECE partitioning on an empty graph should return an empty community list.""" + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="empty") + partitioner = ECEPartitioner() + communities = await partitioner.partition( + storage, max_units_per_community=5, unit_sampling="random" + ) + assert communities == [] + + +@pytest.mark.asyncio +async def test_ece_single_node(): + """A single node must be placed in exactly one community under any edge-sampling strategy.""" + nodes = [("A", {"desc": "alone", "length": 10, "loss": 0.1})] + + for strategy in ("random", "min_loss", "max_loss"): + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage( + working_dir=tmpdir, namespace=f"single_{strategy}" + ) + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + + partitioner = ECEPartitioner() + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=5, unit_sampling=strategy + ) + assert len(communities) == 1 + assert communities[0].nodes == ["A"] + assert communities[0].edges == [] + + +@pytest.mark.asyncio +async def test_ece_small_graph_random(): + """ + 2x3 grid graph: + 0 — 1 — 2 + | | | + 3 — 4 — 5 + 6 nodes & 7 edges, max_units=4 => at least 3 communities expected with random sampling. 
+ """ + nodes = [(str(i), {"desc": f"node{i}", "length": 10}) for i in range(6)] + edges = [ + ("0", "1", {"desc": "e01", "loss": 0.1, "length": 5}), + ("1", "2", {"desc": "e12", "loss": 0.2, "length": 5}), + ("0", "3", {"desc": "e03", "loss": 0.3, "length": 5}), + ("1", "4", {"desc": "e14", "loss": 0.4, "length": 5}), + ("2", "5", {"desc": "e25", "loss": 0.5, "length": 5}), + ("3", "4", {"desc": "e34", "loss": 0.6, "length": 5}), + ("4", "5", {"desc": "e45", "loss": 0.7, "length": 5}), + ] + + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="small_random") + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + for src, tgt, edata in edges: + await storage.upsert_edge(src, tgt, edata) + + partitioner = ECEPartitioner() + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=4, unit_sampling="random" + ) + + # Basic integrity checks + all_nodes = set() + all_edges = set() + for c in communities: + assert len(c.nodes) + len(c.edges) <= 4 + all_nodes.update(c.nodes) + all_edges.update((u, v) if u < v else (v, u) for u, v in c.edges) + assert all_nodes == {str(i) for i in range(6)} + assert len(all_edges) == 7 + + +@pytest.mark.asyncio +async def test_ece_small_graph_min_loss(): + """ + Same grid graph, but using min_loss sampling. + Edges with lower loss should be preferred during community expansion. + """ + nodes = [ + (str(i), {"desc": f"node{i}", "length": 10, "loss": i * 0.1}) for i in range(6) + ] + edges = [ + ("0", "1", {"desc": "e01", "loss": 0.05, "length": 5}), + ("1", "2", {"desc": "e12", "loss": 0.10, "length": 5}), + ("0", "3", {"desc": "e03", "loss": 0.15, "length": 5}), + ("1", "4", {"desc": "e14", "loss": 0.20, "length": 5}), + ("2", "5", {"desc": "e25", "loss": 0.25, "length": 5}), + ("3", "4", {"desc": "e34", "loss": 0.30, "length": 5}), + ("4", "5", {"desc": "e45", "loss": 0.35, "length": 5}), + ] + + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="small_min") + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + for src, tgt, edata in edges: + await storage.upsert_edge(src, tgt, edata) + + partitioner = ECEPartitioner() + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=4, unit_sampling="min_loss" + ) + + all_nodes = set() + all_edges = set() + for c in communities: + assert len(c.nodes) + len(c.edges) <= 4 + all_nodes.update(c.nodes) + all_edges.update((u, v) if u < v else (v, u) for u, v in c.edges) + assert all_nodes == {str(i) for i in range(6)} + assert len(all_edges) == 7 + + +@pytest.mark.asyncio +async def test_ece_small_graph_max_loss(): + """ + Same grid graph, but using max_loss sampling. + Edges with higher loss should be preferred during community expansion. 
+ """ + nodes = [ + (str(i), {"desc": f"node{i}", "length": 10, "loss": (5 - i) * 0.1}) + for i in range(6) + ] + edges = [ + ("0", "1", {"desc": "e01", "loss": 0.35, "length": 5}), + ("1", "2", {"desc": "e12", "loss": 0.30, "length": 5}), + ("0", "3", {"desc": "e03", "loss": 0.25, "length": 5}), + ("1", "4", {"desc": "e14", "loss": 0.20, "length": 5}), + ("2", "5", {"desc": "e25", "loss": 0.15, "length": 5}), + ("3", "4", {"desc": "e34", "loss": 0.10, "length": 5}), + ("4", "5", {"desc": "e45", "loss": 0.05, "length": 5}), + ] + + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="small_max") + for nid, ndata in nodes: + await storage.upsert_node(nid, ndata) + for src, tgt, edata in edges: + await storage.upsert_edge(src, tgt, edata) + + partitioner = ECEPartitioner() + communities: list[Community] = await partitioner.partition( + storage, max_units_per_community=4, unit_sampling="max_loss" + ) + + all_nodes = set() + all_edges = set() + for c in communities: + assert len(c.nodes) + len(c.edges) <= 4 + all_nodes.update(c.nodes) + all_edges.update((u, v) if u < v else (v, u) for u, v in c.edges) + assert all_nodes == {str(i) for i in range(6)} + assert len(all_edges) == 7 + + +@pytest.mark.asyncio +async def test_ece_max_tokens_limit(): + """Ensure max_tokens_per_community is respected.""" + # node id -> data + node_data = {"A": {"length": 3000}, "B": {"length": 3000}, "C": {"length": 3000}} + # edge list + edges = [("A", "B", {"loss": 0.1, "length": 2000})] + + with tempfile.TemporaryDirectory() as tmpdir: + storage = NetworkXStorage(working_dir=tmpdir, namespace="token_limit") + for nid, ndata in node_data.items(): + await storage.upsert_node(nid, ndata) + for src, tgt, edata in edges: + await storage.upsert_edge(src, tgt, edata) + + partitioner = ECEPartitioner() + communities: list[Community] = await partitioner.partition( + storage, + max_units_per_community=10, + max_tokens_per_community=5000, # 1 node (3000) + 1 edge (2000) = 5000 + unit_sampling="random", + ) + + # With a 5000-token budget we need at least two communities + assert len(communities) >= 2 + + # helper: quick edge lookup + edge_lens = {(u, v): d["length"] for u, v, d in edges} + edge_lens.update({(v, u): d["length"] for u, v, d in edges}) # undirected + + for c in communities: + node_tokens = sum(node_data[n]["length"] for n in c.nodes) + edge_tokens = sum(edge_lens[e] for e in c.edges) + assert node_tokens + edge_tokens <= 5000 diff --git a/tests/integration_tests/models/reader/test_mineru_parser.py b/tests/integration_tests/models/reader/test_mineru_parser.py new file mode 100644 index 00000000..c30863f8 --- /dev/null +++ b/tests/integration_tests/models/reader/test_mineru_parser.py @@ -0,0 +1,41 @@ +import os +from pathlib import Path + +from graphgen.models.reader.pdf_reader import MinerUParser + + +def test_check_bin(): + """Ensure mineru CLI is available.""" + MinerUParser() + + +def test_parse_pdf(): + """Parse a real PDF and verify basic structure.""" + repo_root = Path(__file__).resolve().parents[4] + + sample_pdf = os.path.join(repo_root, "resources", "input_examples", "pdf_demo.pdf") + parser = MinerUParser() + blocks = parser.parse_pdf(sample_pdf, device="cpu", method="auto") + + assert isinstance(blocks, list) + assert blocks, "At least one block expected" + + text_blocks = [b for b in blocks if b.get("type") == "text"] + assert text_blocks, "No text block found" + + first = text_blocks[0] + assert "text" in first + assert isinstance(first["content"], 
str) + assert first["content"].strip(), "Empty text content" + + +def test_empty_pdf(tmp_path: Path) -> None: + """Gracefully handle blank PDF.""" + empty = tmp_path / "empty.pdf" + empty.write_bytes(b"%PDF-1.4\n%%EOF\n") # syntactically valid, no content + + parser = MinerUParser() + blocks = parser.parse_pdf(empty, device="cpu") + + # Empty list or list with empty text block are both acceptable + assert isinstance(blocks, list) diff --git a/tests/integration_tests/models/splitter/test_character_splitter.py b/tests/integration_tests/models/splitter/test_character_splitter.py new file mode 100644 index 00000000..5d41547d --- /dev/null +++ b/tests/integration_tests/models/splitter/test_character_splitter.py @@ -0,0 +1,30 @@ +import pytest + +from graphgen.models.splitter.character_splitter import CharacterSplitter + + +@pytest.mark.parametrize( + "text,chunk_size,chunk_overlap,expected", + [ + ( + "This is a test.\n\nThis is only a test.\n\nIn the event of an actual emergency...", + 25, + 5, + [ + "This is a test.", + "This is only a test.", + "In the event of an actual emergency...", + ], + ), + ], +) +def test_character_splitter(text, chunk_size, chunk_overlap, expected): + splitter = CharacterSplitter( + separator="\n\n", + is_separator_regex=False, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + keep_separator=False, + ) + chunks = splitter.split_text(text) + assert chunks == expected diff --git a/tests/integration_tests/models/splitter/test_markdown_splitter.py b/tests/integration_tests/models/splitter/test_markdown_splitter.py new file mode 100644 index 00000000..8d02e1b2 --- /dev/null +++ b/tests/integration_tests/models/splitter/test_markdown_splitter.py @@ -0,0 +1,40 @@ +from graphgen.models.splitter.markdown_splitter import MarkdownTextRefSplitter + + +def test_split_markdown_structures(): + md = ( + "# Header1\n\n" + "Some introduction here.\n\n" + "## Header2\n\n" + "```python\nprint('hello')\n```\n" + "Paragraph under code block.\n\n" + "***\n" + "### Header3\n\n" + "More text after horizontal rule.\n\n" + "#### Header4\n\n" + "Final paragraph." + ) + + splitter = MarkdownTextRefSplitter( + chunk_size=120, + chunk_overlap=0, + keep_separator=True, + is_separator_regex=True, + ) + chunks = splitter.split_text(md) + assert len(chunks) > 1 + + for chk in chunks: + assert len(chk) <= 120 + + assert any("## Header2" in c for c in chunks) + assert any("***" in c for c in chunks) + assert any("```" in c for c in chunks) + + +def test_split_size_less_than_single_char(): + """Edge case: chunk_size is smaller than any segment; should still return the original text.""" + short = "# A\n\nB" + splitter = MarkdownTextRefSplitter(chunk_size=1, chunk_overlap=0) + chunks = splitter.split_text(short) + assert "".join(chunks) == short diff --git a/tests/integration_tests/models/splitter/test_recursive_character_splitter.py b/tests/integration_tests/models/splitter/test_recursive_character_splitter.py new file mode 100644 index 00000000..7d104f3e --- /dev/null +++ b/tests/integration_tests/models/splitter/test_recursive_character_splitter.py @@ -0,0 +1,49 @@ +from graphgen.models.splitter.recursive_character_splitter import ( + ChineseRecursiveTextSplitter, + RecursiveCharacterSplitter, +) + + +def test_split_english_paragraph(): + text = ( + "Natural language processing (NLP) is a subfield of linguistics, computer science, " + "and artificial intelligence. It focuses on the interaction between computers and " + "humans through natural language. 
The ultimate objective of NLP is to read, decipher, " + "understand, and make sense of human languages in a manner that is valuable.\n\n" + "Most NLP techniques rely on machine learning." + ) + + splitter = RecursiveCharacterSplitter( + chunk_size=150, + chunk_overlap=0, + keep_separator=True, + is_separator_regex=False, + ) + chunks = splitter.split_text(text) + + assert len(chunks) > 1 + for chk in chunks: + assert len(chk) <= 150 + + +def test_split_chinese_with_punctuation(): + text = ( + "自然语言处理是人工智能的重要分支。它研究能实现人与计算机之间用自然语言" + "进行有效通信的各种理论和方法!融合语言学、计算机科学、数学于一体?" + "近年来,深度学习极大推动了NLP的发展;Transformer、BERT、GPT等模型层出不穷," + ",,,甚至出现了多模态大模型。\n\n" + "未来,NLP 将继续向通用人工智能迈进。" + ) + + splitter = ChineseRecursiveTextSplitter( + chunk_size=60, + chunk_overlap=0, + keep_separator=True, + is_separator_regex=True, + ) + chunks = splitter.split_text(text) + + assert len(chunks) > 1 + for chk in chunks: + assert len(chk) <= 60 + assert "\n\n\n" not in chk diff --git a/tests/integration_tests/test_engine.py b/tests/integration_tests/test_engine.py new file mode 100644 index 00000000..6a389e42 --- /dev/null +++ b/tests/integration_tests/test_engine.py @@ -0,0 +1,78 @@ +import pytest + +from graphgen.engine import Context, Engine, op + +engine = Engine(max_workers=2) + + +def test_simple_dag(capsys): + """Verify the DAG A->B/C->D execution results and print order.""" + ctx = Context() + + @op("A") + def op_a(self, ctx): + print("Running A") + ctx.set("A", 1) + + @op("B", deps=["A"]) + def op_b(self, ctx): + print("Running B") + ctx.set("B", ctx.get("A") + 1) + + @op("C", deps=["A"]) + def op_c(self, ctx): + print("Running C") + ctx.set("C", ctx.get("A") + 2) + + @op("D", deps=["B", "C"]) + def op_d(self, ctx): + print("Running D") + ctx.set("D", ctx.get("B") + ctx.get("C")) + + # Explicitly list the nodes to run; avoid relying on globals(). + ops = [op_a, op_b, op_c, op_d] + engine.run(ops, ctx) + + # Assert final results. + assert ctx["A"] == 1 + assert ctx["B"] == 2 + assert ctx["C"] == 3 + assert ctx["D"] == 5 + + # Assert print order: A must run before B and C; D must run after B and C. 
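+ # B and C both depend only on A, so with Engine(max_workers=2) they may be scheduled
+ # concurrently and print in either relative order; only "A first" and "D last" are asserted.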
+ captured = capsys.readouterr().out.strip().splitlines() + assert "Running A" in captured + assert "Running B" in captured + assert "Running C" in captured + assert "Running D" in captured + + a_idx = next(i for i, line in enumerate(captured) if "Running A" in line) + b_idx = next(i for i, line in enumerate(captured) if "Running B" in line) + c_idx = next(i for i, line in enumerate(captured) if "Running C" in line) + d_idx = next(i for i, line in enumerate(captured) if "Running D" in line) + + assert a_idx < b_idx + assert a_idx < c_idx + assert d_idx > b_idx + assert d_idx > c_idx + + +def test_cyclic_detection(): + """A cyclic dependency should raise ValueError.""" + ctx = Context() + + @op("X", deps=["Y"]) + def op_x(self, ctx): + pass + + @op("Y", deps=["X"]) + def op_y(self, ctx): + pass + + ops = [op_x, op_y] + with pytest.raises(ValueError, match="Cyclic dependencies"): + engine.run(ops, ctx) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/uv.lock b/uv.lock new file mode 100644 index 00000000..a02a6a37 --- /dev/null +++ b/uv.lock @@ -0,0 +1,3 @@ +version = 1 +revision = 3 +requires-python = ">=3.10" diff --git a/webui/app.py b/webui/app.py index 7e6f4ebb..98b02601 100644 --- a/webui/app.py +++ b/webui/app.py @@ -1,27 +1,26 @@ +import json import os import sys -import json import tempfile +from importlib.resources import files -import pandas as pd import gradio as gr - -from gradio_i18n import Translate, gettext as _ - -from base import GraphGenParams -from test_api import test_api_connection -from cache_utils import setup_workspace, cleanup_workspace -from count_tokens import count_tokens - -# pylint: disable=wrong-import-position -root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +import ray +from dotenv import load_dotenv + +from graphgen.engine import Engine +from graphgen.operators import operators +from graphgen.utils import CURRENT_LOGGER_VAR, set_logger +from webui.base import WebuiParams +from webui.i18n import Translate +from webui.i18n import gettext as _ +from webui.test_api import test_api_connection +from webui.utils import cleanup_workspace, count_tokens, preview_file, setup_workspace + +root_dir = files("webui").parent sys.path.append(root_dir) -from graphgen.graphgen import GraphGen -from graphgen.models import OpenAIModel, Tokenizer, TraverseStrategy -from graphgen.models.llm.limitter import RPM, TPM -from graphgen.utils import set_logger - +load_dotenv() css = """ .center-row { @@ -32,220 +31,213 @@ """ -def init_graph_gen(config: dict, env: dict) -> GraphGen: - # Set up working directory - log_file, working_dir = setup_workspace(os.path.join(root_dir, "cache")) - - set_logger(log_file, if_stream=False) - graph_gen = GraphGen( - working_dir=working_dir - ) - - # Set up LLM clients - graph_gen.synthesizer_llm_client = OpenAIModel( - model_name=env.get("SYNTHESIZER_MODEL", ""), - base_url=env.get("SYNTHESIZER_BASE_URL", ""), - api_key=env.get("SYNTHESIZER_API_KEY", ""), - request_limit=True, - rpm= RPM(env.get("RPM", 1000)), - tpm= TPM(env.get("TPM", 50000)), - ) - - graph_gen.trainee_llm_client = OpenAIModel( - model_name=env.get("TRAINEE_MODEL", ""), - base_url=env.get("TRAINEE_BASE_URL", ""), - api_key=env.get("TRAINEE_API_KEY", ""), - request_limit=True, - rpm= RPM(env.get("RPM", 1000)), - tpm= TPM(env.get("TPM", 50000)), - ) - - graph_gen.tokenizer_instance = Tokenizer( - config.get("tokenizer", "cl100k_base")) - - strategy_config = config.get("traverse_strategy", {}) - graph_gen.traverse_strategy = 
TraverseStrategy( - qa_form=config.get("qa_form"), - expand_method=strategy_config.get("expand_method"), - bidirectional=strategy_config.get("bidirectional"), - max_extra_edges=strategy_config.get("max_extra_edges"), - max_tokens=strategy_config.get("max_tokens"), - max_depth=strategy_config.get("max_depth"), - edge_sampling=strategy_config.get("edge_sampling"), - isolated_node_strategy=strategy_config.get("isolated_node_strategy"), - loss_strategy=str(strategy_config.get("loss_strategy")) - ) +def _get_partition_params(params: WebuiParams): + method = params.partition_method + if method == "dfs": + return { + "max_units_per_community": params.dfs_max_units, + } + if method == "bfs": + return { + "max_units_per_community": params.bfs_max_units, + } + if method == "leiden": + return { + "max_size": params.leiden_max_size, + "use_lcc": params.leiden_use_lcc, + "random_seed": params.leiden_random_seed, + } + # ece + return { + "max_units_per_community": params.ece_max_units, + "min_units_per_community": params.ece_min_units, + "max_tokens_per_community": params.ece_max_tokens, + "unit_sampling": params.ece_unit_sampling, + } - return graph_gen # pylint: disable=too-many-statements -def run_graphgen(params, progress=gr.Progress()): - def sum_tokens(client): - return sum(u["total_tokens"] for u in client.token_usage) - - config = { - "if_trainee_model": params.if_trainee_model, - "input_file": params.input_file, - "tokenizer": params.tokenizer, - "qa_form": params.qa_form, - "web_search": False, - "quiz_samples": params.quiz_samples, - "traverse_strategy": { - "bidirectional": params.bidirectional, - "expand_method": params.expand_method, - "max_extra_edges": params.max_extra_edges, - "max_tokens": params.max_tokens, - "max_depth": params.max_depth, - "edge_sampling": params.edge_sampling, - "isolated_node_strategy": params.isolated_node_strategy, - "loss_strategy": params.loss_strategy +def run_graphgen(params: WebuiParams, progress=gr.Progress()): + # 1. Setup Workspace + log_file, working_dir = setup_workspace(os.path.join(root_dir, "cache")) + driver_logger = set_logger(log_file, "GraphGeb", if_stream=True) + CURRENT_LOGGER_VAR.set(driver_logger) + + # 2. Setup Environment Variables for Ray Actors/LLM Init + # The refactored code relies on env vars in graphgen/common/init_llm.py + os.environ["SYNTHESIZER_BACKEND"] = "openai_api" # Assuming OpenAI compatible API + os.environ["SYNTHESIZER_BASE_URL"] = params.synthesizer_url + os.environ["SYNTHESIZER_API_KEY"] = params.api_key + os.environ["SYNTHESIZER_MODEL"] = params.synthesizer_model + os.environ["RPM"] = str(params.rpm) + os.environ["TPM"] = str(params.tpm) + os.environ["TOKENIZER_MODEL"] = params.tokenizer + + if params.if_trainee_model: + os.environ["TRAINEE_BACKEND"] = "openai_api" + os.environ["TRAINEE_BASE_URL"] = params.trainee_url + os.environ["TRAINEE_API_KEY"] = params.trainee_api_key + os.environ["TRAINEE_MODEL"] = params.trainee_model + + # 3. 
Construct Pipeline Configuration (DAG) + nodes = [ + { + "id": "read", + "op_name": "read", + "type": "source", + "dependencies": [], + "params": { + "input_path": [params.upload_file], + }, }, - "chunk_size": params.chunk_size, - } - - env = { - "SYNTHESIZER_BASE_URL": params.synthesizer_url, - "SYNTHESIZER_MODEL": params.synthesizer_model, - "TRAINEE_BASE_URL": params.trainee_url, - "TRAINEE_MODEL": params.trainee_model, - "SYNTHESIZER_API_KEY": params.api_key, - "TRAINEE_API_KEY": params.trainee_api_key, - "RPM": params.rpm, - "TPM": params.tpm, - } + { + "id": "chunk", + "op_name": "chunk", + "type": "map_batch", + "dependencies": ["read"], + "execution_params": {"replicas": 1}, + "params": { + "chunk_size": params.chunk_size, + "chunk_overlap": params.chunk_overlap, + }, + }, + { + "id": "build_kg", + "op_name": "build_kg", + "type": "map_batch", + "dependencies": ["chunk"], + "execution_params": {"replicas": 1, "batch_size": 128}, + }, + ] + + last_node_id = "build_kg" + + # Optional: Quiz and Judge + if params.if_trainee_model: + nodes.append( + { + "id": "quiz", + "op_name": "quiz", + "type": "aggregate", # QuizService uses aggregate in config + "dependencies": ["build_kg"], + "execution_params": {"replicas": 1, "batch_size": 128}, + "params": { + "quiz_samples": params.quiz_samples, + "concurrency_limit": 200, + }, + } + ) - # Test API connection - test_api_connection(env["SYNTHESIZER_BASE_URL"], - env["SYNTHESIZER_API_KEY"], env["SYNTHESIZER_MODEL"]) - if config['if_trainee_model']: - test_api_connection(env["TRAINEE_BASE_URL"], - env["TRAINEE_API_KEY"], env["TRAINEE_MODEL"]) + nodes.append( + { + "id": "judge", + "op_name": "judge", + "type": "map_batch", + "dependencies": ["quiz"], + "execution_params": {"replicas": 1, "batch_size": 128}, + } + ) + last_node_id = "judge" + + # Node: Partition + nodes.append( + { + "id": "partition", + "op_name": "partition", + "type": "aggregate", # PartitionService uses aggregate + "dependencies": [last_node_id], + "params": { + "method": params.partition_method, + "method_params": _get_partition_params(params), + }, + } + ) - # Initialize GraphGen - graph_gen = init_graph_gen(config, env) - graph_gen.clear() + # Node: Generate + nodes.append( + { + "id": "generate", + "op_name": "generate", + "type": "map_batch", + "dependencies": ["partition"], + "execution_params": {"replicas": 1, "batch_size": 128}, + "params": { + "method": params.mode, + "data_format": params.data_format, + }, + } + ) - graph_gen.progress_bar = progress + config = {"global_params": {"working_dir": working_dir}, "nodes": nodes} try: - # Load input data - file = config['input_file'] - if isinstance(file, list): - file = file[0] - - data = [] - - if file.endswith(".jsonl"): - data_type = "raw" - with open(file, "r", encoding='utf-8') as f: - data.extend(json.loads(line) for line in f) - elif file.endswith(".json"): - data_type = "chunked" - with open(file, "r", encoding='utf-8') as f: - data.extend(json.load(f)) - elif file.endswith(".txt"): - # 读取文件后根据chunk_size转成raw格式的数据 - data_type = "raw" - content = "" - with open(file, "r", encoding='utf-8') as f: - lines = f.readlines() - for line in lines: - content += line.strip() + " " - size = int(config.get("chunk_size", 512)) - chunks = [ - content[i:i + size] for i in range(0, len(content), size) - ] - data.extend([{"content": chunk} for chunk in chunks]) - else: - raise ValueError(f"Unsupported file type: {file}") - - # Process the data - graph_gen.insert(data, data_type) - - if config['if_trainee_model']: - # Generate quiz - 
graph_gen.quiz(max_samples=config['quiz_samples']) - - # Judge statements - graph_gen.judge() + # 4. Initialize and Run Engine + # Initialize Ray if not already running (Engine handles this mostly, but good for safety) + if not ray.is_initialized(): + ray.init(ignore_reinit_error=True, log_to_driver=True) + + engine = Engine(config, operators) + + # Start with an empty dataset to kick off the pipeline + ds = ray.data.from_items([]) + + # Execute pipeline + results = engine.execute(ds) + + # 5. Process Output + # Extract the result from the 'generate' node + if "generate" in results: + result_ds = results["generate"] + + # Create a temporary file to save the output + with tempfile.NamedTemporaryFile( + mode="w", suffix=".jsonl", delete=False, encoding="utf-8" + ) as tmpfile: + # Iterate over rows and write to file + for row in result_ds.iter_rows(): + json.dump(row, tmpfile, ensure_ascii=False) + tmpfile.write("\n") + output_file = tmpfile.name else: - graph_gen.traverse_strategy.edge_sampling = "random" - # Skip judge statements - graph_gen.judge(skip=True) - - # Traverse graph - graph_gen.traverse() - - # Save output - output_data = graph_gen.qa_storage.data - with tempfile.NamedTemporaryFile( - mode="w", - suffix=".jsonl", - delete=False, - encoding="utf-8") as tmpfile: - json.dump(output_data, tmpfile, ensure_ascii=False) - output_file = tmpfile.name - - synthesizer_tokens = sum_tokens(graph_gen.synthesizer_llm_client) - trainee_tokens = sum_tokens(graph_gen.trainee_llm_client) if config['if_trainee_model'] else 0 - total_tokens = synthesizer_tokens + trainee_tokens - - data_frame = params.token_counter - try: - _update_data = [ - [ - data_frame.iloc[0, 0], - data_frame.iloc[0, 1], - str(total_tokens) - ] - ] - new_df = pd.DataFrame( - _update_data, - columns=data_frame.columns - ) - data_frame = new_df + raise gr.Error("Generation step failed to produce output.") - except Exception as e: - raise gr.Error(f"DataFrame operation error: {str(e)}") + # Note: Dynamic token counting from distributed actors is not directly available + # via client properties in the new architecture. We return the estimated stats from input. 
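If a concrete "Token Used" figure is still wanted here, one pragmatic workaround is to re-tokenize the generated JSONL once the pipeline finishes. This is only a sketch: `estimate_output_tokens` is a hypothetical helper, and tiktoken is assumed to be available (the webui's `cl100k_base` default suggests it is):

```python
import json

import tiktoken  # assumed dependency; "cl100k_base" is the webui's default tokenizer


def estimate_output_tokens(jsonl_path: str, encoding_name: str = "cl100k_base") -> int:
    """Rough lower bound: counts tokens in the generated rows, not in the prompts that produced them."""
    enc = tiktoken.get_encoding(encoding_name)
    total = 0
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            total += len(enc.encode(json.dumps(json.loads(line), ensure_ascii=False)))
    return total
```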
- return output_file, gr.DataFrame(label='Token Stats', - headers=["Source Text Token Count", "Expected Token Usage", "Token Used"], - datatype="str", - interactive=False, - value=data_frame, - visible=True, - wrap=True) + return output_file, params.token_counter except Exception as e: # pylint: disable=broad-except raise gr.Error(f"Error occurred: {str(e)}") finally: # Clean up workspace - cleanup_workspace(graph_gen.working_dir) + cleanup_workspace(working_dir) # Optional: keep for debugging or enable -with (gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), - css=css) as demo): + +with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo: # Header - gr.Image(value=os.path.join(root_dir, 'resources', 'images', 'logo.png'), - label="GraphGen Banner", - elem_id="banner", - interactive=False, - container=False, - show_download_button=False, - show_fullscreen_button=False) + gr.Image( + value=os.path.join(root_dir, "resources", "images", "logo.png"), + label="GraphGen Banner", + elem_id="banner", + interactive=False, + container=False, + show_download_button=False, + show_fullscreen_button=False, + ) lang_btn = gr.Radio( choices=[ ("English", "en"), ("简体中文", "zh"), ], value="en", - # label=_("Language"), render=False, container=False, elem_classes=["center-row"], ) - gr.HTML(""" + gr.HTML( + """ - """) + """ + ) with Translate( - os.path.join(root_dir, 'webui', 'translation.json'), - lang_btn, - placeholder_langs=["en", "zh"], - persistant= - False, # True to save the language setting in the browser. Requires gradio >= 5.6.0 + os.path.join(root_dir, "webui", "translation.json"), + lang_btn, + placeholder_langs=["en", "zh"], + persistant=False, ): lang_btn.render() - gr.Markdown( - value = "# " + _("Title") + "\n\n" + \ - "### [GraphGen](https://github.com/open-sciencelab/GraphGen) " + _("Intro") - ) + gr.Markdown(value=_("Title") + _("Intro")) - if_trainee_model = gr.Checkbox(label=_("Use Trainee Model"), - value=False, - interactive=True) + if_trainee_model = gr.Checkbox( + label=_("Use Trainee Model"), value=False, interactive=True + ) with gr.Accordion(label=_("Model Config"), open=False): - synthesizer_url = gr.Textbox(label="Synthesizer URL", - value="https://api.siliconflow.cn/v1", - info=_("Synthesizer URL Info"), - interactive=True) - synthesizer_model = gr.Textbox(label="Synthesizer Model", - value="Qwen/Qwen2.5-7B-Instruct", - info=_("Synthesizer Model Info"), - interactive=True) - trainee_url = gr.Textbox(label="Trainee URL", - value="https://api.siliconflow.cn/v1", - info=_("Trainee URL Info"), - interactive=True, - visible=if_trainee_model.value is True) + tokenizer = gr.Textbox( + label="Tokenizer", value="cl100k_base", interactive=True + ) + synthesizer_url = gr.Textbox( + label="Synthesizer URL", + value="https://api.siliconflow.cn/v1", + info=_("Synthesizer URL Info"), + interactive=True, + ) + synthesizer_model = gr.Textbox( + label="Synthesizer Model", + value="Qwen/Qwen2.5-7B-Instruct", + info=_("Synthesizer Model Info"), + interactive=True, + ) + trainee_url = gr.Textbox( + label="Trainee URL", + value="https://api.siliconflow.cn/v1", + info=_("Trainee URL Info"), + interactive=True, + visible=if_trainee_model.value is True, + ) trainee_model = gr.Textbox( label="Trainee Model", value="Qwen/Qwen2.5-7B-Instruct", info=_("Trainee Model Info"), interactive=True, - visible=if_trainee_model.value is True) + visible=if_trainee_model.value is True, + ) trainee_api_key = gr.Textbox( - label=_("SiliconCloud Token for Trainee Model"), - 
type="password", - value="", - info="https://cloud.siliconflow.cn/account/ak", - visible=if_trainee_model.value is True) - - - with gr.Accordion(label=_("Generation Config"), open=False): - chunk_size = gr.Slider(label="Chunk Size", - minimum=256, - maximum=4096, - value=512, - step=256, - interactive=True) - tokenizer = gr.Textbox(label="Tokenizer", - value="cl100k_base", - interactive=True) - qa_form = gr.Radio(choices=["atomic", "multi_hop", "aggregated"], - label="QA Form", - value="aggregated", - interactive=True) - quiz_samples = gr.Number(label="Quiz Samples", - value=2, - minimum=1, - interactive=True, - visible=if_trainee_model.value is True) - bidirectional = gr.Checkbox(label="Bidirectional", - value=True, - interactive=True) - - expand_method = gr.Radio(choices=["max_width", "max_tokens"], - label="Expand Method", - value="max_tokens", - interactive=True) - max_extra_edges = gr.Slider( - minimum=1, - maximum=10, - value=5, - label="Max Extra Edges", - step=1, - interactive=True, - visible=expand_method.value == "max_width") - max_tokens = gr.Slider(minimum=64, - maximum=1024, - value=256, - label="Max Tokens", - step=64, - interactive=True, - visible=(expand_method.value - != "max_width")) - - max_depth = gr.Slider(minimum=1, - maximum=5, - value=2, - label="Max Depth", - step=1, - interactive=True) - edge_sampling = gr.Radio( - choices=["max_loss", "min_loss", "random"], - label="Edge Sampling", - value="max_loss", - interactive=True, - visible=if_trainee_model.value is True) - isolated_node_strategy = gr.Radio(choices=["add", "ignore"], - label="Isolated Node Strategy", - value="ignore", - interactive=True) - loss_strategy = gr.Radio(choices=["only_edge", "both"], - label="Loss Strategy", - value="only_edge", - interactive=True) + label=_("SiliconFlow Token for Trainee Model"), + type="password", + value="", + info="https://cloud.siliconflow.cn/account/ak", + visible=if_trainee_model.value is True, + ) with gr.Row(equal_height=True): with gr.Column(scale=3): api_key = gr.Textbox( - label=_("SiliconCloud Token"), + label=_("SiliconFlow Token"), type="password", value="", - info="https://cloud.siliconflow.cn/account/ak") + info=_("SiliconFlow Token Info"), + ) with gr.Column(scale=1): test_connection_btn = gr.Button(_("Test Connection")) + with gr.Row(equal_height=True): + with gr.Column(scale=1): + with gr.Blocks(): + with gr.Row(equal_height=True): + with gr.Column(scale=1): + upload_file = gr.File( + label=_("Upload File"), + file_count="single", + file_types=[".txt", ".json", ".jsonl", ".csv"], + interactive=True, + ) + examples_dir = os.path.join(root_dir, "webui", "examples") + gr.Examples( + examples=[ + [os.path.join(examples_dir, "txt_demo.txt")], + [os.path.join(examples_dir, "jsonl_demo.jsonl")], + [os.path.join(examples_dir, "json_demo.json")], + [os.path.join(examples_dir, "csv_demo.csv")], + ], + inputs=upload_file, + label=_("Example Files"), + examples_per_page=4, + ) + with gr.Column(scale=1): + with gr.Blocks(): + preview_code = gr.Code( + label=_("File Preview"), + interactive=False, + visible=True, + elem_id="preview_code", + ) + preview_df = gr.DataFrame( + label=_("File Preview"), + interactive=False, + visible=False, + elem_id="preview_df", + ) + + with gr.Accordion(label=_("Split Config"), open=False): + gr.Markdown(value=_("Split Config Info")) + with gr.Row(equal_height=True): + with gr.Column(scale=1): + chunk_size = gr.Slider( + label=_("Chunk Size"), + minimum=256, + maximum=4096, + value=1024, + step=256, + interactive=True, + info=_("Chunk Size 
Info"), + ) + with gr.Column(scale=1): + chunk_overlap = gr.Slider( + label=_("Chunk Overlap"), + minimum=0, + maximum=500, + value=100, + step=100, + interactive=True, + info=_("Chunk Overlap Info"), + ) + + with gr.Accordion( + label=_("Quiz & Judge Config"), open=False, visible=False + ) as quiz_accordion: + gr.Markdown(value=_("Quiz & Judge Config Info")) + quiz_samples = gr.Number( + label=_("Quiz Samples"), + value=2, + minimum=1, + interactive=True, + info=_("Quiz Samples Info"), + ) + + with gr.Accordion(label=_("Partition Config"), open=False): + gr.Markdown(value=_("Partition Config Info")) + + partition_method = gr.Dropdown( + label=_("Partition Method"), + choices=["dfs", "bfs", "ece", "leiden"], + value="ece", + interactive=True, + info=_("Which algorithm to use for graph partitioning."), + ) + + # DFS method parameters + with gr.Group(visible=False) as dfs_group: + gr.Markdown(_("DFS intro")) + dfs_max_units = gr.Slider( + label=_("Max Units Per Community"), + minimum=1, + maximum=100, + value=5, + step=1, + interactive=True, + info=_("Max Units Per Community Info"), + ) + # BFS method parameters + with gr.Group(visible=False) as bfs_group: + gr.Markdown(_("BFS intro")) + bfs_max_units = gr.Slider( + label=_("Max Units Per Community"), + minimum=1, + maximum=100, + value=5, + step=1, + interactive=True, + info=_("Max Units Per Community Info"), + ) + + # Leiden method parameters + with gr.Group(visible=False) as leiden_group: + gr.Markdown(_("Leiden intro")) + leiden_max_size = gr.Slider( + label=_("Maximum Size of Communities"), + minimum=1, + maximum=100, + value=20, + step=1, + interactive=True, + info=_("Maximum Size of Communities Info"), + ) + leiden_use_lcc = gr.Checkbox( + label=_("Use Largest Connected Component"), + value=False, + interactive=True, + info=_("Use Largest Connected Component Info"), + ) + leiden_random_seed = gr.Number( + label=_("Random Seed"), + value=42, + precision=0, + interactive=True, + info=_("Random Seed Info"), + ) + + # ECE method parameters + with gr.Group(visible=True) as ece_group: + gr.Markdown(_("ECE intro")) + ece_max_units = gr.Slider( + label=_("Max Units Per Community"), + minimum=1, + maximum=100, + value=20, + step=1, + interactive=True, + info=_("Max Units Per Community Info"), + ) + ece_min_units = gr.Slider( + label=_("Min Units Per Community"), + minimum=1, + maximum=100, + value=3, + step=1, + interactive=True, + info=_("Min Units Per Community Info"), + ) + ece_max_tokens = gr.Slider( + label=_("Max Tokens Per Community"), + minimum=512, + maximum=20_480, + value=10_240, + step=512, + interactive=True, + info=_("Max Tokens Per Community Info"), + ) + ece_unit_sampling = gr.Radio( + label=_("Unit Sampling Strategy"), + choices=["random"], + value="random", + interactive=True, + info=_("Unit Sampling Strategy Info"), + ) + + def toggle_partition_params(method): + dfs = method == "dfs" + bfs = method == "bfs" + leiden = method == "leiden" + ece = method == "ece" + return ( + gr.update(visible=dfs), # dfs_group + gr.update(visible=bfs), # bfs_group + gr.update(visible=leiden), # leiden_group + gr.update(visible=ece), # ece_group + ) + + partition_method.change( + fn=toggle_partition_params, + inputs=partition_method, + outputs=[dfs_group, bfs_group, leiden_group, ece_group], + ) + + with gr.Accordion(label=_("Generation Config"), open=False): + gr.Markdown(value=_("Generation Config Info")) + mode = gr.Radio( + choices=["atomic", "multi_hop", "aggregated", "CoT"], + label=_("Mode"), + value="aggregated", + interactive=True, + 
info=_("Mode Info"), + ) + data_format = gr.Radio( + choices=["Alpaca", "Sharegpt", "ChatML"], + label=_("Output Data Format"), + value="Alpaca", + interactive=True, + info=_("Output Data Format Info"), + ) + + with gr.Blocks(): + token_counter = gr.DataFrame( + label="Token Stats", + headers=[ + "Source Text Token Count", + "Estimated Token Usage", + "Token Used", + ], + datatype="str", + interactive=False, + visible=False, + wrap=True, + ) + with gr.Blocks(): with gr.Row(equal_height=True): with gr.Column(): @@ -392,7 +550,8 @@ def sum_tokens(client): value=1000, step=100, interactive=True, - visible=True) + visible=True, + ) with gr.Column(): tpm = gr.Slider( label="TPM", @@ -401,41 +560,16 @@ def sum_tokens(client): value=50000, step=1000, interactive=True, - visible=True) - - - with gr.Blocks(): - with gr.Row(equal_height=True): - with gr.Column(scale=1): - upload_file = gr.File( - label=_("Upload File"), - file_count="single", - file_types=[".txt", ".json", ".jsonl"], - interactive=True, - ) - examples_dir = os.path.join(root_dir, 'webui', 'examples') - gr.Examples(examples=[ - [os.path.join(examples_dir, "txt_demo.txt")], - [os.path.join(examples_dir, "raw_demo.jsonl")], - [os.path.join(examples_dir, "chunked_demo.json")], - ], - inputs=upload_file, - label=_("Example Files"), - examples_per_page=3) - with gr.Column(scale=1): - output = gr.File( - label="Output(See Github FAQ)", - file_count="single", - interactive=False, + visible=True, ) with gr.Blocks(): - token_counter = gr.DataFrame(label='Token Stats', - headers=["Source Text Token Count", "Estimated Token Usage", "Token Used"], - datatype="str", - interactive=False, - visible=False, - wrap=True) + with gr.Column(scale=1): + output = gr.File( + label=_("Output File"), + file_count="single", + interactive=False, + ) submit_btn = gr.Button(_("Run GraphGen")) @@ -443,32 +577,48 @@ def sum_tokens(client): test_connection_btn.click( test_api_connection, inputs=[synthesizer_url, api_key, synthesizer_model], - outputs=[]) + outputs=[], + ) if if_trainee_model.value: - test_connection_btn.click(test_api_connection, - inputs=[trainee_url, api_key, trainee_model], - outputs=[]) + test_connection_btn.click( + test_api_connection, + inputs=[trainee_url, api_key, trainee_model], + outputs=[], + ) - expand_method.change(lambda method: - (gr.update(visible=method == "max_width"), - gr.update(visible=method != "max_width")), - inputs=expand_method, - outputs=[max_extra_edges, max_tokens]) + if_trainee_model.change( + lambda use_trainee: [gr.update(visible=use_trainee)] * 4, + inputs=if_trainee_model, + outputs=[ + trainee_url, + trainee_model, + trainee_api_key, + quiz_accordion, + ], + ) if_trainee_model.change( - lambda use_trainee: [gr.update(visible=use_trainee)] * 5, + lambda on: ( + gr.update( + choices=["random"] + if not on + else ["random", "max_loss", "min_loss"], + value="random", + ) + ), inputs=if_trainee_model, - outputs=[trainee_url, trainee_model, quiz_samples, edge_sampling, trainee_api_key]) + outputs=ece_unit_sampling, + ) upload_file.change( - lambda x: (gr.update(visible=True)), - inputs=[upload_file], - outputs=[token_counter], + preview_file, inputs=upload_file, outputs=[preview_code, preview_df] + ).then( + lambda x: gr.update(visible=True), inputs=upload_file, outputs=token_counter ).then( count_tokens, inputs=[upload_file, tokenizer, token_counter], - outputs=[token_counter], + outputs=token_counter, ) # run GraphGen @@ -479,41 +629,41 @@ def sum_tokens(client): ) submit_btn.click( - lambda *args: 
run_graphgen(GraphGenParams( - if_trainee_model=args[0], - input_file=args[1], - tokenizer=args[2], - qa_form=args[3], - bidirectional=args[4], - expand_method=args[5], - max_extra_edges=args[6], - max_tokens=args[7], - max_depth=args[8], - edge_sampling=args[9], - isolated_node_strategy=args[10], - loss_strategy=args[11], - synthesizer_url=args[12], - synthesizer_model=args[13], - trainee_model=args[14], - api_key=args[15], - chunk_size=args[16], - rpm=args[17], - tpm=args[18], - quiz_samples=args[19], - trainee_url=args[20], - trainee_api_key=args[21], - token_counter=args[22], - )), + lambda *args: run_graphgen( + WebuiParams(**dict(zip(WebuiParams.__annotations__, args))) + ), inputs=[ - if_trainee_model, upload_file, tokenizer, qa_form, - bidirectional, expand_method, max_extra_edges, max_tokens, - max_depth, edge_sampling, isolated_node_strategy, - loss_strategy, synthesizer_url, synthesizer_model, trainee_model, - api_key, chunk_size, rpm, tpm, quiz_samples, trainee_url, trainee_api_key, token_counter + if_trainee_model, + upload_file, + tokenizer, + synthesizer_model, + synthesizer_url, + trainee_model, + trainee_url, + api_key, + trainee_api_key, + chunk_size, + chunk_overlap, + quiz_samples, + partition_method, + dfs_max_units, + bfs_max_units, + leiden_max_size, + leiden_use_lcc, + leiden_random_seed, + ece_max_units, + ece_min_units, + ece_max_tokens, + ece_unit_sampling, + mode, + data_format, + rpm, + tpm, + token_counter, ], outputs=[output, token_counter], ) if __name__ == "__main__": demo.queue(api_open=False, default_concurrency_limit=2) - demo.launch(server_name='0.0.0.0') + demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False) diff --git a/webui/base.py b/webui/base.py index 32f3ed10..1d20f181 100644 --- a/webui/base.py +++ b/webui/base.py @@ -1,31 +1,37 @@ -from dataclasses import dataclass -from typing import Any - -@dataclass -class GraphGenParams: - """ - GraphGen parameters - """ - if_trainee_model: bool - input_file: str - tokenizer: str - qa_form: str - bidirectional: bool - expand_method: str - max_extra_edges: int - max_tokens: int - max_depth: int - edge_sampling: str - isolated_node_strategy: str - loss_strategy: str - synthesizer_url: str - synthesizer_model: str - trainee_model: str - api_key: str - chunk_size: int - rpm: int - tpm: int - quiz_samples: int - trainee_url: str - trainee_api_key: str - token_counter: Any +from dataclasses import dataclass +from typing import Any + + +@dataclass +class WebuiParams: + """ + GraphGen parameters + """ + + if_trainee_model: bool + upload_file: Any # gr.File + tokenizer: str + synthesizer_model: str + synthesizer_url: str + trainee_model: str + trainee_url: str + api_key: str + trainee_api_key: str + chunk_size: int + chunk_overlap: int + quiz_samples: int + partition_method: str + dfs_max_units: int + bfs_max_units: int + leiden_max_size: int + leiden_use_lcc: bool + leiden_random_seed: int + ece_max_units: int + ece_min_units: int + ece_max_tokens: int + ece_unit_sampling: str + mode: str + data_format: str + rpm: int + tpm: int + token_counter: Any diff --git a/webui/examples/chunked_demo.json b/webui/examples/chunked_demo.json deleted file mode 100644 index ad7219a3..00000000 --- a/webui/examples/chunked_demo.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - [ - {"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} - ], - [ - {"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 
hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} - ], - [ - {"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} - ], - [ - {"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} - ] -] diff --git a/webui/examples/csv_demo.csv b/webui/examples/csv_demo.csv new file mode 100644 index 00000000..e6c1b521 --- /dev/null +++ b/webui/examples/csv_demo.csv @@ -0,0 +1,5 @@ +type,content +text,云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。 +text,隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。 +text,"Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. 
Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity." +text,"Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture." diff --git a/webui/examples/json_demo.json b/webui/examples/json_demo.json new file mode 100644 index 00000000..a53101da --- /dev/null +++ b/webui/examples/json_demo.json @@ -0,0 +1,6 @@ +[ + {"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}, + {"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}, + {"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. 
In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}, + {"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} +] diff --git a/webui/examples/jsonl_demo.jsonl b/webui/examples/jsonl_demo.jsonl new file mode 100644 index 00000000..2b743578 --- /dev/null +++ b/webui/examples/jsonl_demo.jsonl @@ -0,0 +1,4 @@ +{"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} +{"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} +{"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. 
Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} +{"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} diff --git a/webui/examples/raw_demo.jsonl b/webui/examples/raw_demo.jsonl deleted file mode 100644 index 024559a6..00000000 --- a/webui/examples/raw_demo.jsonl +++ /dev/null @@ -1,4 +0,0 @@ -{"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} -{"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} -{"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. 
Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} -{"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} diff --git a/webui/examples/vqa_demo.json b/webui/examples/vqa_demo.json new file mode 100644 index 00000000..a53101da --- /dev/null +++ b/webui/examples/vqa_demo.json @@ -0,0 +1,6 @@ +[ + {"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}, + {"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}, + {"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}, + {"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. 
TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} +] diff --git a/webui/i18n.py b/webui/i18n.py new file mode 100644 index 00000000..ce6bb40e --- /dev/null +++ b/webui/i18n.py @@ -0,0 +1,395 @@ +# pylint: skip-file +import functools +import inspect +import json +import os +from contextlib import contextmanager + +import gradio as gr +import langcodes +import yaml +from gradio.blocks import Block, BlockContext, Context, LocalContext + + +# Monkey patch to escape I18nString type being stripped in gradio.Markdown +def escape_caller(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if args and isinstance(args[0], I18nString): + add_values = args[0].add_values + radd_values = args[0].radd_values + result = I18nString(func(*args, **kwargs)) + result.add_values = add_values + result.radd_values = radd_values + + return result + return func(*args, **kwargs) + + return wrapper + + +inspect.cleandoc = escape_caller(inspect.cleandoc) + + +class TranslateContext: + available_languages = ["en"] + dictionary: dict = {} + lang_per_session = {} + + def get_available_languages(): + return TranslateContext.available_languages + + def set_available_languages(langs: list): + if not langs or not isinstance(langs, list): + raise ValueError("langs must be a list of languages") + TranslateContext.available_languages = langs + + def get_default_language(): + return TranslateContext.get_available_languages()[0] + + def add_translation(translation: dict): + for k, v in translation.items(): + if k not in TranslateContext.available_languages: + continue + if k not in TranslateContext.dictionary: + TranslateContext.dictionary[k] = {} + TranslateContext.dictionary[k].update(v) + + def get_current_language(request: gr.Request): + return TranslateContext.lang_per_session.get( + request.session_hash, TranslateContext.get_default_language() + ) + + def set_current_language(request: gr.Request, lang: str): + TranslateContext.lang_per_session[request.session_hash] = lang + + def get_lang_from_request(request: gr.Request): + if "Accept-Language" not in request.headers: + return TranslateContext.get_default_language() + + # Get the first language from the Accept-Language header + lang = request.headers["Accept-Language"].split(",")[0] + lang, _ = langcodes.closest_match( + lang, TranslateContext.get_available_languages() + ) + + if not lang or lang == "und": + return TranslateContext.get_default_language() + return lang + + +class I18nString(str): + __slots__ = ("_key", "add_values", "radd_values") + + def __new__(cls, value): + obj = super().__new__(cls, value) + obj._key = value + obj.add_values = [] + obj.radd_values = [] + return obj + + def __str__(self): + try: + request = LocalContext.request.get() + except LookupError: + request = None + if request is None: + return self._key + + lang = TranslateContext.get_current_language(request) + result = TranslateContext.dictionary.get(lang, {}).get(self._key, self._key) + + for v in self.radd_values: + result = str(v) + 
result + for v in self.add_values: + result = result + str(v) + + while len(result) >= 2 and result.startswith("'") and result.endswith("'"): + result = result[1:-1] + return result + + def __add__(self, other): + self.add_values.append(other) + return self + + def __radd__(self, other): + self.radd_values.append(other) + return self + + def __hash__(self) -> int: + return super().__hash__() + + def format(self, *args, **kwargs) -> str: + v = str(self) + if isinstance(v, I18nString): + return super().format(*args, **kwargs) + return v.format(*args, **kwargs) + + def unwrap(self): + return super().__str__() + + @staticmethod + def unwrap_strings(obj): + """Unwrap all keys in I18nStrings in the object""" + if isinstance(obj, I18nString): + yield obj.unwrap() + for v in obj.add_values: + yield from I18nString.unwrap_strings(v) + for v in obj.radd_values: + yield from I18nString.unwrap_strings(v) + return + yield obj + + +def gettext(key: str): + """Wrapper text string to return I18nString + :param key: The key of the I18nString + """ + return I18nString(key) + + +def iter_i18n_choices(choices): + """Iterate all I18nStrings in the choice, returns the indices of the I18nStrings""" + if not isinstance(choices, list) or len(choices) == 0: + return + + if isinstance(choices[0], tuple): + for i, (k, v) in enumerate(choices): + if isinstance(k, I18nString): + yield i + + else: + for i, v in enumerate(choices): + if isinstance(v, I18nString): + yield i + + +def iter_i18n_fields(component: gr.components.Component): + """Iterate all I18nStrings in the component""" + for name, value in inspect.getmembers(component): + if name == "value" and hasattr(component, "choices"): + # for those components with choices, the value will be kept as is + continue + if isinstance(value, I18nString): + yield name + elif name == "choices" and any(iter_i18n_choices(value)): + yield name + + +def iter_i18n_components(block: Block): + """Iterate all I18nStrings in the block""" + if isinstance(block, BlockContext): + for component in block.children: + for c in iter_i18n_components(component): + yield c + + if any(iter_i18n_fields(block)): + yield block + + +def has_new_i18n_fields(block: Block, existing_translation={}): + """Check if there are new I18nStrings in the block + :param block: The block to check + :param existing_translation: The existing translation dictionary + :return: True if there are new I18nStrings, False otherwise + """ + components = list(iter_i18n_components(block)) + for lang in TranslateContext.get_available_languages(): + for component in components: + for field in iter_i18n_fields(component): + if field == "choices": + for idx in iter_i18n_choices(component.choices): + if isinstance(component.choices[idx], tuple): + value = component.choices[idx][0] + else: + value = component.choices[idx] + if value not in existing_translation.get(lang, {}): + return True + else: + value = getattr(component, field) + if value not in existing_translation.get(lang, {}): + return True + + return False + + +def dump_blocks(block: Block, include_translations={}): + """Dump all I18nStrings in the block to a dictionary + :param block: The block to dump + :param include_translations: The existing translation dictionary + :return: The dumped dictionary + """ + components = list(iter_i18n_components(block)) + + def translate(lang, key): + return include_translations.get(lang, {}).get(key, key) + + ret = {} + + for lang in TranslateContext.get_available_languages(): + ret[lang] = {} + for component in components: + for field 
in iter_i18n_fields(component): + if field == "choices": + for idx in iter_i18n_choices(component.choices): + if isinstance(component.choices[idx], tuple): + value = component.choices[idx][0] + else: + value = component.choices[idx] + for key in I18nString.unwrap_strings(value): + ret[lang][key] = translate(lang, key) + else: + value = getattr(component, field) + for key in I18nString.unwrap_strings(value): + ret[lang][key] = translate(lang, key) + + return ret + + +def translate_blocks( + block: gr.Blocks = None, + translation={}, + lang: gr.components.Component = None, + persistant=False, +): + """Translate all I18nStrings in the block + :param block: The block to translate, default is the root block + :param translation: The translation dictionary + :param lang: The language component to change the language + :param persistant: Whether to persist the language + """ + if block is None: + block = Context.root_block + + """Translate all I18nStrings in the block""" + if not isinstance(block, gr.Blocks): + raise ValueError("block must be an instance of gradio.Blocks") + + components = list(iter_i18n_components(block)) + TranslateContext.add_translation(translation) + + hidden = gr.HTML( + value="""""" + ) + + if persistant: + try: + from gradio import BrowserState + except ImportError: + raise ValueError("gradio>=5.6.0 is required for persistant language") + + def on_lang_change(request: gr.Request, lang: str, saved_lang: str): + if not lang: + if saved_lang: + lang = saved_lang + else: + lang = TranslateContext.get_lang_from_request(request) + + outputs = [lang, lang, ""] + TranslateContext.set_current_language(request, lang) + + for component in components: + fields = list(iter_i18n_fields(component)) + if component == lang and "value" in fields: + raise ValueError("'lang' component can't has I18nStrings as value") + + modified = {} + + for field in fields: + if field == "choices": + choices = component.choices.copy() + for idx in iter_i18n_choices(choices): + if isinstance(choices[idx], tuple): + k, v = choices[idx] + # We don't need to translate the value + choices[idx] = (str(k), next(I18nString.unwrap_strings(v))) + else: + v = choices[idx] + choices[idx] = (str(v), next(I18nString.unwrap_strings(v))) + modified[field] = choices + else: + modified[field] = str(getattr(component, field)) + + new_comp = gr.update(**modified) + outputs.append(new_comp) + + if len(outputs) == 1: + return outputs[0] + + return outputs + + if lang is None: + lang = gr.State() + + if persistant: + saved_lang = gr.BrowserState(storage_key="lang") + else: + saved_lang = gr.State() + + gr.on( + [block.load, lang.change], + on_lang_change, + inputs=[lang, saved_lang], + outputs=[lang, saved_lang, hidden] + components, + ) + + +@contextmanager +def Translate( + translation, + lang: gr.components.Component = None, + placeholder_langs=[], + persistant=False, +): + """Translate all I18nStrings in the block + :param translation: The translation dictionary or file path + :param lang: The language component to change the language + :param placeholder_langs: The placeholder languages to create a new translation file if translation is a file path + :param persistant: Whether to persist the language + :return: The language component + """ + if lang is None: + lang = gr.State() + yield lang + + if isinstance(translation, dict): + # Static translation + translation_dict = translation + pass + elif isinstance(translation, str): + if os.path.exists(translation): + # Regard as a file path + with open(translation, "r", 
encoding="utf-8") as f: # Force utf-8 encoding + if translation.endswith(".json"): + translation_dict = json.load(f) + elif translation.endswith(".yaml"): + translation_dict = yaml.safe_load(f) + else: + raise ValueError("Unsupported file format") + else: + translation_dict = {} + else: + raise ValueError("Unsupported translation type") + + if placeholder_langs: + TranslateContext.set_available_languages(placeholder_langs) + + block = Context.block + translate_blocks( + block=block, translation=translation_dict, lang=lang, persistant=persistant + ) + + if isinstance(translation, str) and has_new_i18n_fields( + block, existing_translation=translation_dict + ): + merged = dump_blocks(block, include_translations=translation_dict) + + with open(translation, "w") as f: + if translation.endswith(".json"): + json.dump(merged, f, indent=2, ensure_ascii=False) + elif translation.endswith(".yaml"): + yaml.dump(merged, f, allow_unicode=True, sort_keys=False) diff --git a/webui/translation.json b/webui/translation.json index fef5d579..320482ad 100644 --- a/webui/translation.json +++ b/webui/translation.json @@ -1,36 +1,112 @@ { "en": { - "Title": "✨Easy-to-use LLM Training Data Generation Framework✨", - "Intro": "is a framework for synthetic data generation guided by knowledge graphs, designed to tackle challenges for knowledge-intensive QA generation. \n\nBy uploading your text chunks (such as knowledge in agriculture, healthcare, or marine science) and filling in the LLM API key, you can generate the training data required by **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)** and **[xtuner](https://github.com/InternLM/xtuner)** online. We will automatically delete user information after completion.", - "Use Trainee Model": "Use Trainee Model to identify knowledge blind spots, please keep disable for SiliconCloud", + "Title": "# ✨Easy-to-use LLM Training Data Generation Framework✨\n\n", + "Intro": "### [GraphGen](https://github.com/open-sciencelab/GraphGen) is a framework for synthetic data generation guided by knowledge graphs, designed to tackle challenges for knowledge-intensive QA generation. \n\nBy uploading your text chunks (such as knowledge in agriculture, healthcare, or marine science) and filling in the LLM API key, you can generate the training data required by **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)** and **[xtuner](https://github.com/InternLM/xtuner)** online. 
We will automatically delete user information after completion.", + "Use Trainee Model": "Use Trainee Model to identify knowledge blind spots, please keep disabled for SiliconCloud", "Synthesizer URL Info": "Base URL for the Synthesizer Model API, use SiliconFlow as default", - "Trainee URL Info": "Base URL for the Trainee Model API, use SiliconFlow as default", "Synthesizer Model Info": "Model for constructing KGs and generating QAs", + "Trainee URL Info": "Base URL for the Trainee Model API, use SiliconFlow as default", "Trainee Model Info": "Model for training", + "SiliconFlow Token for Trainee Model": "SiliconFlow API Key for Trainee Model", "Model Config": "Model Configuration", - "Generation Config": "Generation Config", - "SiliconCloud Token": "SiliconCloud API Key", - "SiliconCloud Token for Trainee Model": "SiliconCloud API Key for Trainee Model", - "Test Connection": "Test Connection", - "Run GraphGen": "Run GraphGen", + "SiliconFlow Token Info": "Get SiliconFlow API Key at \"https://cloud.siliconflow.cn/account/ak\", efficiently and stably use LLM interfaces", + "SiliconFlow Token": "SiliconFlow API Key", "Upload File": "Upload File", - "Example Files": "Example Files" + "Example Files": "Example Files", + "File Preview": "File Preview", + "Split Config Info": "If the input text is a long text without chunks, the system will split the text into appropriate paragraphs based on the following parameters.", + "Chunk Size Info": "Split the long text according to this value. Too short will lead to incomplete knowledge, and too long will lead to LLM input being too long", + "Chunk Size": "chunk_size(Chunk Size)", + "Chunk Overlap Info": "The overlapping part between two adjacent chunks, which can help maintain context continuity", + "Chunk Overlap": "chunk_overlap(Chunk Overlap)", + "Split Config": "Split Config", + "Quiz & Judge Config Info": "Synthesizer Model generates quiz questions based on each knowledge unit in the knowledge graph to assess the Trainee Model's understanding of the knowledge and obtain comprehension loss.", + "Quiz Samples Info": "Configure how many quiz questions to generate for each knowledge unit", + "Quiz Samples": "quiz_samples(Quiz Samples)", + "Quiz & Judge Config": "Quiz & Judge Config", + "Partition Config Info": "Partition the knowledge graph into multiple communities (subgraphs), each community is the smallest unit for generating QAs. Appropriate partitioning methods can improve relevance and diversity.", + "Which algorithm to use for graph partitioning.": "Which algorithm to use for graph partitioning.", + "Partition Method": "method(Partition Method)", + "DFS intro": "The DFS partitioning method uses a depth-first search algorithm to traverse the knowledge graph, starting from one unit and exploring as deeply as possible along connected units until a preset community size is reached or there are no more unvisited units. It then starts a new community from another unvisited unit, repeating this process until all units are assigned to communities.", + "Max Units Per Community Info": "The maximum number of knowledge units (nodes) allowed in each community. If a community exceeds this limit, it will be further partitioned. 
A unit refers to a node in the knowledge graph, which can be an entity or a relation.", + "Max Units Per Community": "max_units_per_community(Max Units Per Community)", + "BFS intro": "The BFS partitioning method uses a breadth-first search algorithm to traverse the knowledge graph, starting from one unit and exploring all its neighboring units before moving on to the neighbors' neighbors. This process continues until a preset community size is reached or there are no more unvisited units. It then starts a new community from another unvisited unit, repeating this process until all units are assigned to communities.", + "Leiden intro": "The Leiden partitioning method is a community detection algorithm based on modularity optimization, designed to identify tightly connected subgraphs within a graph. The algorithm iteratively optimizes the assignment of nodes to communities, maximizing the density of connections within communities while minimizing connections between communities. The Leiden algorithm can effectively handle large-scale graph data and typically produces higher-quality community partitions compared to other community detection algorithms, such as the Louvain algorithm.", + "Maximum Size of Communities Info": "The maximum number of nodes allowed in a community. If a community exceeds this limit, it will be further partitioned.", + "Maximum Size of Communities": "max_size(Maximum Size of Communities)", + "Use Largest Connected Component Info": "The largest connected component refers to the largest subset of nodes in a graph where there is a path connecting any two nodes. When this option is enabled, the partitioning algorithm will only consider the largest connected component of the knowledge graph for community partitioning, ignoring other smaller connected components. This helps ensure that the generated communities have higher connectivity and relevance.", + "Use Largest Connected Component": "use_lcc(Use Largest Connected Component)", + "Random Seed Info": "The random seed changes the initial state of the graph partitioning, thereby affecting the partitioning results. By setting different random seeds, different community partitioning schemes can be generated, which helps improve the diversity of generated QAs.", + "Random Seed": "random_seed(Random Seed)", + "ECE intro": "ECE is an original graph partitioning method based on the principle of model calibration. It evaluates the performance of each unit under the current model by computing its calibration error (referred to as the comprehension loss) and partitions the graph according to this comprehension error.", + "Min Units Per Community Info": "Limit the minimum number of nodes allowed in each community. If a community has fewer nodes than this limit, it will be discarded.", + "Min Units Per Community": "min_units_per_community(Min Units Per Community)", + "Max Tokens Per Community Info": "The maximum number of tokens allowed in each community. If a community exceeds this limit, it will be further partitioned.", + "Max Tokens Per Community": "max_tokens_per_community(Max Tokens Per Community)", + "Unit Sampling Strategy Info": "Unit sampling strategy determines how to select units from candidate units when constructing communities. Unit sampling strategies include random, max_loss, and min_loss. 
random means selecting units randomly, max_loss means prioritizing units with higher comprehension loss, and min_loss means prioritizing units with lower comprehension loss.\n\n(Note: Only when the Trainee Model is activated and evaluated will there be comprehension loss, allowing the use of max_loss and min_loss strategies; otherwise, only the random strategy can be used.)", + "Unit Sampling Strategy": "unit_sampling(Unit Sampling Strategy)", + "Partition Config": "Knowledge Graph Partition Config", + "Generation Config Info": "Generation configuration includes generation mode and output data format.", + "Mode Info": "Includes various generation modes such as atomic, aggregated, multi-hop, and chain-of-thought, suitable for tasks of different complexity.", + "Mode": "mode(Mode)", + "Output Data Format Info": "Includes various output formats such as Alpaca, Sharegpt, and ChatML.", + "Output Data Format": "data_format(Output Data Format)", + "Generation Config": "Generation Config", + "Output File": "Output File" }, "zh": { - "Title": "✨开箱即用的LLM训练数据生成框架✨", - "Intro": "是一个基于知识图谱的数据合成框架,旨在知识密集型任务中生成问答。\n\n 上传你的文本块(如农业、医疗、海洋知识),填写 LLM api key,即可在线生成 **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)**、**[xtuner](https://github.com/InternLM/xtuner)** 所需训练数据。结束后我们将自动删除用户信息。", + "Title": "# ✨开箱即用的LLM训练数据生成框架✨\n\n", + "Intro": "### [GraphGen](https://github.com/open-sciencelab/GraphGen) 是一个基于知识图谱的数据合成框架,旨在知识密集型任务中生成问答。\n\n 上传你的文本块(如农业、医疗、海洋知识),填写 LLM api key,即可在线生成 **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)**、**[xtuner](https://github.com/InternLM/xtuner)** 所需训练数据。结束后我们将自动删除用户信息。", "Use Trainee Model": "使用Trainee Model来识别知识盲区,使用硅基流动时请保持禁用", "Synthesizer URL Info": "调用合成模型API的URL,默认使用硅基流动", - "Trainee URL Info": "调用学生模型API的URL,默认使用硅基流动", "Synthesizer Model Info": "用于构建知识图谱和生成问答的模型", + "Trainee URL Info": "调用学生模型API的URL,默认使用硅基流动", "Trainee Model Info": "用于训练的模型", + "SiliconFlow Token for Trainee Model": "SiliconFlow Token for Trainee Model", "Model Config": "模型配置", - "Generation Config": "生成配置", - "SiliconCloud Token": "硅基流动 API Key", - "SiliconCloud Token for Trainee Model": "硅基流动 API Key (学生模型)", - "Test Connection": "测试接口", - "Run GraphGen": "运行GraphGen", + "SiliconFlow Token Info": "在 \"https://cloud.siliconflow.cn/account/ak\" 获取硅基流动 API 秘钥, 使用高效稳定的 LLM 接口", + "SiliconFlow Token": "硅基流动 API 秘钥", "Upload File": "上传文件", - "Example Files": "示例文件" + "Example Files": "示例文件", + "File Preview": "文件预览", + "Split Config Info": "如果输入文本是未分块的长文本,系统会根据以下参数将文本分成合适的段落。", + "Chunk Size Info": "按照该值将分割长文本,太短会导致知识不完整,太长会导致 LLM 输入过长", + "Chunk Size": "chunk_size(分割大小)", + "Chunk Overlap Info": "两个相邻块之间的重叠部分,有助于保持上下文的连续性", + "Chunk Overlap": "chunk_overlap(分割重叠大小)", + "Split Config": "文本分割配置", + "Quiz & Judge Config Info": "合成模型根据知识图谱中的每个知识单元,生成判断题,用于评估学生模型对知识的理解程度,得到理解误差。", + "Quiz Samples Info": "配置每个知识单元生成多少判断题", + "Quiz Samples": "quiz_samples(Quiz Samples)", + "Quiz & Judge Config": "测试与评判配置", + "Partition Config Info": "将知识图谱划分为多个社区(子图),每个社区是生成问答的最小单位。合适的分区方法可以提高关联性和多样性。", + "Which algorithm to use for graph partitioning.": "选择用于图划分的算法。", + "Partition Method": "method(划分方法)", + "DFS intro": "DFS划分方法使用深度优先搜索算法遍历知识图谱,从一个单元开始,沿着与之连接的单元深入探索,直到达到预设的社区大小或没有更多未访问的单元为止。然后,它会从另一个未访问的单元开始新的社区,重复这一过程,直到所有单元都被分配到社区中。", + "Max Units Per Community Info": "每个社区允许的知识单元(节点)的最大数量。如果一个社区超过这个限制,它将被进一步划分。一个单元指的是知识图谱中的一个节点,可以是实体或关系。", + "Max Units Per Community": "max_units_per_community(每个社区的最大单元数)", + "BFS intro": 
"BFS划分方法使用广度优先搜索算法遍历知识图谱,从一个单元开始,探索所有与之直接连接的单元,然后再从这些单元出发,继续探索它们的直接连接单元。这个过程会持续直到达到预设的社区大小或没有更多未访问的单元为止。然后,它会从另一个未访问的单元开始新的社区,重复这一过程,直到所有单元都被分配到社区中。", + "Leiden intro": "Leiden划分方法是一种基于模块度优化的社区检测算法,旨在识别图中的紧密连接子图。该算法通过迭代地优化节点的社区分配,最大化社区内的连接密度,同时最小化社区间的连接。Leiden算法能够有效处理大规模图数据,并且通常比其他社区检测算法(如Louvain算法)产生更高质量的社区划分结果。", + "Maximum Size of Communities Info": "一个社区中允许的最大节点数量。如果一个社区的节点数超过这个限制,它将被进一步划分。", + "Maximum Size of Communities": "max_size(社区的最大尺寸)", + "Use Largest Connected Component Info": "最大连通分量是指在图中节点之间存在路径连接的最大子集。启用此选项后,划分算法将仅考虑知识图谱中的最大连通分量进行社区划分,忽略其他较小的连通分量。这有助于确保生成的社区具有更高的连通性和相关性。", + "Use Largest Connected Component": "use_lcc(使用最大连通分量)", + "Random Seed Info": "随机种子改变图划分的初始状态,从而影响划分结果。通过设置不同的随机种子,可以生成不同的社区划分方案,有助于提高生成问答的多样性。", + "Random Seed": "random_seed(随机种子)", + "ECE intro": "ECE是一种基于模型校准原理的原创图划分方法。ECE通过计算单元的校准误差来评估其在当前模型下的表现(记为理解误差),并根据理解误差对图进行划分。", + "Min Units Per Community Info": "限制每个社区中允许的最小节点数量。如果一个社区的节点数少于这个限制,它将被舍弃。", + "Min Units Per Community": "min_units_per_community(每个社区的最小单元数)", + "Max Tokens Per Community Info": "每个社区允许的最大Token数量。如果一个社区的Token数超过这个限制,它将被进一步划分。", + "Max Tokens Per Community": "max_tokens_per_community(每个社区的最大Token数)", + "Unit Sampling Strategy Info": "单元采样策略决定在构建社区的时候如何从候选单元中选择单元。单元采样策略包括 random, max_loss, min_loss。 random表示随机选择单元,max_loss表示优先选择理解误差较大的单元,min_loss表示优先选择理解误差较小的单元。\n\n(注意:只有当学生模型启动时,经过评测后,才会有理解误差,才能使用 max_loss 和 min_loss 策略,否则只能使用 random 策略)", + "Unit Sampling Strategy": "unit_sampling(单元采样策略)", + "Partition Config": "知识图谱分区配置", + "Generation Config Info": "生成配置包括生成模式和输出数据格式。", + "Mode Info": "包括原子、聚合、多跳、思维链等多种生成模式,适用于不同复杂度的任务。", + "Mode": "mode(生成模式)", + "Output Data Format Info": "包括 Alpaca, Sharegpt, ChatML等多种输出格式。", + "Output Data Format": "data_format(输出数据格式)", + "Generation Config": "生成配置", + "Output File": "输出文件" } } \ No newline at end of file diff --git a/webui/utils/__init__.py b/webui/utils/__init__.py new file mode 100644 index 00000000..07e5d386 --- /dev/null +++ b/webui/utils/__init__.py @@ -0,0 +1,3 @@ +from .cache import cleanup_workspace, setup_workspace +from .count_tokens import count_tokens +from .preview_file import preview_file diff --git a/webui/cache_utils.py b/webui/utils/cache.py similarity index 99% rename from webui/cache_utils.py rename to webui/utils/cache.py index 96c7d4d9..0c9412f2 100644 --- a/webui/cache_utils.py +++ b/webui/utils/cache.py @@ -1,6 +1,7 @@ import os -import uuid import shutil +import uuid + def setup_workspace(folder): request_id = str(uuid.uuid4()) diff --git a/webui/count_tokens.py b/webui/utils/count_tokens.py similarity index 61% rename from webui/count_tokens.py rename to webui/utils/count_tokens.py index 53bed59a..82b5522c 100644 --- a/webui/count_tokens.py +++ b/webui/utils/count_tokens.py @@ -1,6 +1,7 @@ +import json import os import sys -import json + import pandas as pd # pylint: disable=wrong-import-position @@ -8,24 +9,29 @@ sys.path.append(root_dir) from graphgen.models import Tokenizer + def count_tokens(file, tokenizer_name, data_frame): if not file or not os.path.exists(file): return data_frame if file.endswith(".jsonl"): - with open(file, "r", encoding='utf-8') as f: + with open(file, "r", encoding="utf-8") as f: data = [json.loads(line) for line in f] elif file.endswith(".json"): - with open(file, "r", encoding='utf-8') as f: + with open(file, "r", encoding="utf-8") as f: data = json.load(f) data = [item for sublist in data for item in sublist] elif file.endswith(".txt"): - with open(file, "r", encoding='utf-8') as f: + with open(file, "r", encoding="utf-8") as f: 
data = f.read() - chunks = [ - data[i:i + 512] for i in range(0, len(data), 512) - ] + chunks = [data[i : i + 512] for i in range(0, len(data), 512)] data = [{"content": chunk} for chunk in chunks] + elif file.endswith(".csv"): + df = pd.read_csv(file) + if "content" in df.columns: + data = df["content"].tolist() + else: + data = df.iloc[:, 0].tolist() else: raise ValueError(f"Unsupported file type: {file}") @@ -39,22 +45,15 @@ def count_tokens(file, tokenizer_name, data_frame): content = item.get("content", "") else: content = item - token_count += len(tokenizer.encode_string(content)) + token_count += len(tokenizer.encode(content)) - _update_data = [[ - str(token_count), - str(token_count * 50), - "N/A" - ]] + _update_data = [[str(token_count), str(token_count * 50), "N/A"]] try: - new_df = pd.DataFrame( - _update_data, - columns=data_frame.columns - ) + new_df = pd.DataFrame(_update_data, columns=data_frame.columns) data_frame = new_df - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print("[ERROR] DataFrame操作异常:", str(e)) return data_frame diff --git a/webui/utils/preview_file.py b/webui/utils/preview_file.py new file mode 100644 index 00000000..aba531c2 --- /dev/null +++ b/webui/utils/preview_file.py @@ -0,0 +1,29 @@ +import codecs +import os + +import gradio as gr +import pandas as pd + + +def preview_file(file): + if file is None: + return gr.update(visible=False), gr.update(visible=False) + + path = file.name + ext = os.path.splitext(path)[1].lower() + + try: + if ext == ".csv": + df = pd.read_csv(path, nrows=10) + return gr.update(visible=False), gr.update(value=df, visible=True) + with codecs.open(path, "r", encoding="utf-8") as f: + text = f.read(5000) + if len(text) == 5000: + text += "\n\n... (truncated at 5000 chars)" + return gr.update( + value=text, visible=True, language="json" if ext != ".txt" else None + ), gr.update(visible=False) + except Exception as e: # pylint: disable=broad-except + return gr.update( + value=f"Preview failed: {e}", visible=True, language=None + ), gr.update(visible=False)
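
Note on webui/i18n.py: the module added above is consumed by app.py elsewhere in this change set. The sketch below is a minimal, hypothetical wiring of `Translate` and `gettext` into a Gradio Blocks app, assuming the en/zh `webui/translation.json` from this diff, an illustrative `gr.Dropdown` as the language selector, and a repo-root import path; none of these choices are taken from the diff itself.

```python
# Minimal usage sketch for webui/i18n.py (hypothetical; the real app.py may differ).
# Assumes it runs from the repo root so "webui" is importable and that
# webui/translation.json exists with "en"/"zh" sections as added in this diff.
import gradio as gr

from webui.i18n import Translate, gettext as _

with gr.Blocks() as demo:
    # The language selector is illustrative; Translate() falls back to a
    # gr.State() plus the browser's Accept-Language header when none is given.
    lang = gr.Dropdown(choices=["en", "zh"], value="en", label="Language")

    # Translate() must be entered directly inside the Blocks context: on exit it
    # walks the component tree, collects every I18nString, and registers a
    # block.load / lang.change handler that swaps the strings per session.
    with Translate("webui/translation.json", lang=lang, placeholder_langs=["en", "zh"]):
        gr.Markdown(_("Title"))
        gr.File(label=_("Upload File"))

if __name__ == "__main__":
    demo.launch()
```

Because `gettext` returns an `I18nString` keyed by the English label, any component property built from it (Markdown value, File label, dropdown choices) is re-rendered with the dictionary entry for the session's current language whenever the page loads or the selector changes.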
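
Note on the new partition help strings in webui/translation.json: the "BFS intro" entry describes the community-growing behaviour in prose only. For reference, here is a self-contained sketch of the BFS-style partitioning that text describes; it is purely illustrative and is not GraphGen's actual partitioner, and the function name, the networkx dependency, and the `max_units_per_community` argument are assumptions made for the example.

```python
# Illustrative sketch of the BFS partitioning described by the "BFS intro"
# help string: grow one community breadth-first from a seed node until
# max_units_per_community is reached, then seed the next community from a
# still-unassigned node. Not GraphGen's implementation.
from collections import deque

import networkx as nx


def bfs_partition(graph: nx.Graph, max_units_per_community: int) -> list[list]:
    unassigned = set(graph.nodes)
    communities = []
    while unassigned:
        seed = next(iter(unassigned))
        community, queue = [], deque([seed])
        while queue and len(community) < max_units_per_community:
            node = queue.popleft()
            if node not in unassigned:
                continue  # already placed via another path
            unassigned.remove(node)
            community.append(node)
            # Neighbors are queued level by level; anything left in the queue
            # when the size cap is hit stays unassigned and will seed or join
            # a later community.
            queue.extend(n for n in graph.neighbors(node) if n in unassigned)
        communities.append(community)
    return communities


if __name__ == "__main__":
    g = nx.karate_club_graph()
    parts = bfs_partition(g, max_units_per_community=10)
    print([len(p) for p in parts])
```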