Commit 52c23f0

Merge branch 'main' into feature/protein-qa
2 parents: cffaf5d + 02adac3

File tree: 12 files changed, +48 −45 lines


README.md

Lines changed: 11 additions & 6 deletions
@@ -62,13 +62,16 @@ After data generation, you can use [LLaMA-Factory](https://github.com/hiyouga/LL
 
 ## 📌 Latest Updates
 
-- **2025.12.1**: Added search support for the [NCBI](https://www.ncbi.nlm.nih.gov/) and [RNAcentral](https://rnacentral.org/) databases, enabling extraction of DNA and RNA data from these bioinformatics databases.
-- **2025.10.30**: We support several new LLM clients and inference backends, including [Ollama_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/ollama_client.py), [http_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/http_client.py), [HuggingFace Transformers](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/hf_wrapper.py) and [SGLang](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/sglang_wrapper.py).
-- **2025.10.23**: We now support VQA (Visual Question Answering) data generation. Run script: `bash scripts/generate/generate_vqa.sh`.
+- **2025.12.16**: Added [rocksdb](https://github.com/facebook/rocksdb) as a key-value storage backend and [kuzudb](https://github.com/kuzudb/kuzu) as a graph database backend.
+- **2025.12.16**: Added [vllm](https://github.com/vllm-project/vllm) as a local inference backend.
+- **2025.12.16**: Refactored the data generation pipeline with [ray](https://github.com/ray-project/ray) to improve the efficiency of distributed execution and resource management.
 
 <details>
 <summary>History</summary>
 
+- **2025.12.1**: Added search support for the [NCBI](https://www.ncbi.nlm.nih.gov/) and [RNAcentral](https://rnacentral.org/) databases, enabling extraction of DNA and RNA data from these bioinformatics databases.
+- **2025.10.30**: We support several new LLM clients and inference backends, including [Ollama_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/ollama_client.py), [http_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/http_client.py), [HuggingFace Transformers](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/hf_wrapper.py) and [SGLang](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/sglang_wrapper.py).
+- **2025.10.23**: We now support VQA (Visual Question Answering) data generation. Run script: `bash scripts/generate/generate_vqa.sh`.
 - **2025.10.21**: We now support PDF as an input format for data generation via [MinerU](https://github.com/opendatalab/MinerU).
 - **2025.09.29**: We auto-update the Gradio demo on [Hugging Face](https://huggingface.co/spaces/chenzihong/GraphGen) and [ModelScope](https://modelscope.cn/studios/chenzihong/GraphGen).
 - **2025.08.14**: We added support for community detection in knowledge graphs using the Leiden algorithm, enabling the synthesis of Chain-of-Thought (CoT) data.
@@ -84,13 +87,14 @@ We support various LLM inference servers, API servers, inference clients, input
 Users can flexibly configure according to the needs of synthetic data.
 
-| Inference Server | Api Server | Inference Client | Data Source | Data Modal | Data Type |
-|----------------------------------------------|--------------------------------------------------------------------------------|------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|-------------------------------------------------|
-| [![hf-icon]HF][hf]<br>[![sg-icon]SGLang][sg] | [![sif-icon]Silicon][sif]<br>[![oai-icon]OpenAI][oai]<br>[![az-icon]Azure][az] | HTTP<br>[![ol-icon]Ollama][ol]<br>[![oai-icon]OpenAI][oai] | Files(CSV, JSON, PDF, TXT, etc.)<br>Databases([![uniprot-icon]UniProt][uniprot], [![ncbi-icon]NCBI][ncbi], [![rnacentral-icon]RNAcentral][rnacentral])<br>Search Engines([![bing-icon]Bing][bing], [![google-icon]Google][google])<br>Knowledge Graphs([![wiki-icon]Wikipedia][wiki]) | TEXT<br>IMAGE | Aggregated<br>Atomic<br>CoT<br>Multi-hop<br>VQA |
+| Inference Server | Api Server | Inference Client | Data Source | Data Modal | Data Type |
+|--------------------------------------------------------------------------|--------------------------------------------------------------------------------|------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|-------------------------------------------------|
+| [![hf-icon]HF][hf]<br>[![sg-icon]SGLang][sg]<br>[![vllm-icon]vllm][vllm] | [![sif-icon]Silicon][sif]<br>[![oai-icon]OpenAI][oai]<br>[![az-icon]Azure][az] | HTTP<br>[![ol-icon]Ollama][ol]<br>[![oai-icon]OpenAI][oai] | Files(CSV, JSON, PDF, TXT, etc.)<br>Databases([![uniprot-icon]UniProt][uniprot], [![ncbi-icon]NCBI][ncbi], [![rnacentral-icon]RNAcentral][rnacentral])<br>Search Engines([![bing-icon]Bing][bing], [![google-icon]Google][google])<br>Knowledge Graphs([![wiki-icon]Wikipedia][wiki]) | TEXT<br>IMAGE | Aggregated<br>Atomic<br>CoT<br>Multi-hop<br>VQA |
 
 <!-- links -->
 [hf]: https://huggingface.co/docs/transformers/index
 [sg]: https://docs.sglang.ai
+[vllm]: https://github.com/vllm-project/vllm
 [sif]: https://siliconflow.cn
 [oai]: https://openai.com
 [az]: https://azure.microsoft.com/en-us/services/cognitive-services/openai-service/
@@ -106,6 +110,7 @@ Users can flexibly configure according to the needs of synthetic data.
 <!-- icons -->
 [hf-icon]: https://www.google.com/s2/favicons?domain=https://huggingface.co
 [sg-icon]: https://www.google.com/s2/favicons?domain=https://docs.sglang.ai
+[vllm-icon]: https://www.google.com/s2/favicons?domain=https://docs.vllm.ai
 [sif-icon]: https://www.google.com/s2/favicons?domain=siliconflow.com
 [oai-icon]: https://www.google.com/s2/favicons?domain=https://openai.com
 [az-icon]: https://www.google.com/s2/favicons?domain=https://azure.microsoft.com

README_zh.md

Lines changed: 11 additions & 6 deletions
@@ -62,13 +62,16 @@ GraphGen first builds a fine-grained knowledge graph from the source text, then uses the exp…
 After data generation, you can use [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) or [xtuner](https://github.com/InternLM/xtuner) to fine-tune large language models.
 
 ## 📌 Latest Updates
-- **2025.12.1**: Added search support for the [NCBI](https://www.ncbi.nlm.nih.gov/) and [RNAcentral](https://rnacentral.org/) databases; DNA and RNA data can now be extracted from these bioinformatics databases.
-- **2025.10.30**: We support several new LLM clients and inference backends, including [Ollama_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/ollama_client.py), [http_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/http_client.py), [HuggingFace Transformers](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/hf_wrapper.py) and [SGLang](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/sglang_wrapper.py).
-- **2025.10.23**: We now support VQA (Visual Question Answering) data generation. Run script: `bash scripts/generate/generate_vqa.sh`
+- **2025.12.16**: Added [rocksdb](https://github.com/facebook/rocksdb) as a key-value storage backend and [kuzudb](https://github.com/kuzudb/kuzu) as a graph database backend
+- **2025.12.16**: Added [vllm](https://github.com/vllm-project/vllm) as a local inference backend
+- **2025.12.16**: Refactored the data generation pipeline with [ray](https://github.com/ray-project/ray), improving the efficiency of distributed execution and resource management
 
 <details>
 <summary>History</summary>
 
+- **2025.12.1**: Added search support for the [NCBI](https://www.ncbi.nlm.nih.gov/) and [RNAcentral](https://rnacentral.org/) databases; DNA and RNA data can now be extracted from these bioinformatics databases.
+- **2025.10.30**: We support several new LLM clients and inference backends, including [Ollama_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/ollama_client.py), [http_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/http_client.py), [HuggingFace Transformers](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/hf_wrapper.py) and [SGLang](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/sglang_wrapper.py)
+- **2025.10.23**: We now support VQA (Visual Question Answering) data generation. Run script: `bash scripts/generate/generate_vqa.sh`
 - **2025.10.21**: We now support PDF as an input format for data generation via [MinerU](https://github.com/opendatalab/MinerU).
 - **2025.09.29**: We auto-update the Gradio app on [Hugging Face](https://huggingface.co/spaces/chenzihong/GraphGen) and [ModelScope](https://modelscope.cn/studios/chenzihong/GraphGen).
 - **2025.08.14**: Added support for community detection on knowledge graphs using the Leiden algorithm to synthesize CoT data.
@@ -82,13 +85,14 @@ GraphGen first builds a fine-grained knowledge graph from the source text, then uses the exp…
 We support a variety of LLM inference servers, API servers, inference clients, input file formats, data modalities, output data formats, and output data types.
 These can be flexibly configured according to the needs of the synthetic data.
 
-| Inference Server | API Server | Inference Client | Input File Format | Data Modality | Output Data Type |
-|----------------------------------------------|--------------------------------------------------------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|-------------------------------------------------|
-| [![hf-icon]HF][hf]<br>[![sg-icon]SGLang][sg] | [![sif-icon]Silicon][sif]<br>[![oai-icon]OpenAI][oai]<br>[![az-icon]Azure][az] | HTTP<br>[![ol-icon]Ollama][ol]<br>[![oai-icon]OpenAI][oai] | Files (CSV, JSON, JSONL, PDF, TXT, etc.)<br>Databases([![uniprot-icon]UniProt][uniprot], [![ncbi-icon]NCBI][ncbi], [![rnacentral-icon]RNAcentral][rnacentral])<br>Search Engines([![bing-icon]Bing][bing], [![google-icon]Google][google])<br>Knowledge Graphs([![wiki-icon]Wikipedia][wiki]) | TEXT<br>IMAGE | Aggregated<br>Atomic<br>CoT<br>Multi-hop<br>VQA |
+| Inference Server | API Server | Inference Client | Input File Format | Data Modality | Output Data Type |
+|--------------------------------------------------------------------------|--------------------------------------------------------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|-------------------------------------------------|
+| [![hf-icon]HF][hf]<br>[![sg-icon]SGLang][sg]<br>[![vllm-icon]vllm][vllm] | [![sif-icon]Silicon][sif]<br>[![oai-icon]OpenAI][oai]<br>[![az-icon]Azure][az] | HTTP<br>[![ol-icon]Ollama][ol]<br>[![oai-icon]OpenAI][oai] | Files (CSV, JSON, JSONL, PDF, TXT, etc.)<br>Databases([![uniprot-icon]UniProt][uniprot], [![ncbi-icon]NCBI][ncbi], [![rnacentral-icon]RNAcentral][rnacentral])<br>Search Engines([![bing-icon]Bing][bing], [![google-icon]Google][google])<br>Knowledge Graphs([![wiki-icon]Wikipedia][wiki]) | TEXT<br>IMAGE | Aggregated<br>Atomic<br>CoT<br>Multi-hop<br>VQA |
 
 <!-- links -->
 [hf]: https://huggingface.co/docs/transformers/index
 [sg]: https://docs.sglang.ai
+[vllm]: https://github.com/vllm-project/vllm
 [sif]: https://siliconflow.cn
 [oai]: https://openai.com
 [az]: https://azure.microsoft.com/en-us/services/cognitive-services/openai-service/
@@ -104,6 +108,7 @@ GraphGen first builds a fine-grained knowledge graph from the source text, then uses the exp…
 <!-- icons -->
 [hf-icon]: https://www.google.com/s2/favicons?domain=https://huggingface.co
 [sg-icon]: https://www.google.com/s2/favicons?domain=https://docs.sglang.ai
+[vllm-icon]: https://www.google.com/s2/favicons?domain=https://docs.vllm.ai
 [sif-icon]: https://www.google.com/s2/favicons?domain=siliconflow.com
 [oai-icon]: https://www.google.com/s2/favicons?domain=https://openai.com
 [az-icon]: https://www.google.com/s2/favicons?domain=https://azure.microsoft.com

examples/evaluate/evaluate.sh

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
 python3 -m graphgen.evaluate --folder cache/data \
-  --output cache/output \
   --reward "OpenAssistant/reward-model-deberta-v3-large-v2,BAAI/IndustryCorpus2_DataRater" \
   --uni MingZhong/unieval-sum \
Lines changed: 1 addition & 2 deletions
@@ -1,3 +1,2 @@
 python3 -m graphgen.run \
-  --config_file examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml \
-  --output_dir cache/
+  --config_file examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml
Lines changed: 1 addition & 2 deletions
@@ -1,3 +1,2 @@
 python3 -m graphgen.run \
-  --config_file examples/generate/generate_aggregated_qa/aggregated_config.yaml \
-  --output_dir cache/
+  --config_file examples/generate/generate_aggregated_qa/aggregated_config.yaml
Lines changed: 1 addition & 2 deletions
@@ -1,3 +1,2 @@
 python3 -m graphgen.run \
-  --config_file examples/generate/generate_atomic_qa/atomic_config.yaml \
-  --output_dir cache/
+  --config_file examples/generate/generate_atomic_qa/atomic_config.yaml
Lines changed: 1 addition & 2 deletions
@@ -1,3 +1,2 @@
 python3 -m graphgen.run \
-  --config_file examples/generate/generate_cot_qa/cot_config.yaml \
-  --output_dir cache/
+  --config_file examples/generate/generate_cot_qa/cot_config.yaml
Lines changed: 1 addition & 2 deletions
@@ -1,3 +1,2 @@
 python3 -m graphgen.run \
-  --config_file examples/generate/generate_multi_hop_qa/multi_hop_config.yaml \
-  --output_dir cache/
+  --config_file examples/generate/generate_multi_hop_qa/multi_hop_config.yaml
Lines changed: 1 addition & 2 deletions
@@ -1,3 +1,2 @@
 python3 -m graphgen.run \
-  --config_file examples/generate/generate_vqa/vqa_config.yaml \
-  --output_dir cache/
+  --config_file examples/generate/generate_vqa/vqa_config.yaml

graphgen/common/init_llm.py

Lines changed: 1 addition & 1 deletion
@@ -131,7 +131,7 @@ def create_llm(
         ray.get_actor(actor_name)
     except ValueError:
         print(f"Creating Ray actor for LLM {model_type} with backend {backend}.")
-        num_gpus = int(config.pop("num_gpus", 0))
+        num_gpus = float(config.pop("num_gpus", 0))
         actor = (
             ray.remote(LLMServiceActor)
             .options(
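The one-character cast change above matters because Ray's `.options(num_gpus=...)` accepts fractional GPU shares (for example `0.5` to pack two actors onto one GPU), and `int()` silently truncates such a request to `0`. A minimal sketch of the difference in plain Python (no Ray required; the `config` dict here is hypothetical, standing in for the `config` passed to `create_llm`):

```python
# Ray allows fractional GPU requests per actor, e.g. num_gpus=0.5.
# Casting with int() truncates the fraction, so a half-GPU request
# would silently become a request for no GPU at all.
config = {"num_gpus": 0.5}  # hypothetical config asking for half a GPU

truncated = int(config["num_gpus"])    # 0   -> actor scheduled with no GPU
preserved = float(config["num_gpus"])  # 0.5 -> half-GPU share passed to Ray

print(truncated, preserved)  # prints "0 0.5"
```

With the old `int()` cast, any fractional `num_gpus` below 1 in a user config left the actor with no GPU at all (and a string value like `"0.5"` would raise `ValueError`); `float()` forwards the share to Ray unchanged.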

0 commit comments
