Skip to content

Commit cffaf5d

Browse files
committed
add: allow search result as input for omics KG
1 parent 7d86271 commit cffaf5d

File tree

7 files changed

+226
-0
lines changed

7 files changed

+226
-0
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
python3 -m graphgen.run \
2+
--config_file examples/generate/generate_omics_qa/omics_qa_config_searched.yaml \
3+
--output_dir cache/
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
global_params:
2+
working_dir: cache
3+
graph_backend: kuzu # graph database backend, support: kuzu, networkx
4+
kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv
5+
6+
nodes:
7+
- id: read_files
8+
op_name: read
9+
type: source
10+
dependencies: []
11+
params:
12+
input_path:
13+
# Use pre-searched data files (skip search step)
14+
# The search_service will automatically detect and skip search if data already contains search results
15+
- examples/input_examples/searched_dna_demo.jsonl
16+
- examples/input_examples/searched_rna_demo.jsonl
17+
- examples/input_examples/searched_protein_demo.jsonl
18+
19+
- id: search_data
20+
op_name: search
21+
type: map_batch
22+
dependencies:
23+
- read_files
24+
execution_params:
25+
replicas: 1
26+
batch_size: 10
27+
# Note: search_service will automatically detect pre-searched data and skip search,
28+
# but it will still normalize the data format (ensure _doc_id, content, data_source fields exist)
29+
30+
- id: chunk_documents
31+
op_name: chunk
32+
type: map_batch
33+
dependencies:
34+
- search_data
35+
execution_params:
36+
replicas: 4
37+
params:
38+
chunk_size: 1024 # chunk size for text splitting
39+
chunk_overlap: 100 # chunk overlap for text splitting
40+
sequence_chunk_size: 1000 # For sequence chunks (bp for DNA/RNA, aa for protein)
41+
sequence_chunk_overlap: 100
42+
43+
- id: build_kg
44+
op_name: build_kg
45+
type: map_batch
46+
dependencies:
47+
- chunk_documents
48+
execution_params:
49+
replicas: 1
50+
batch_size: 128
51+
52+
- id: partition
53+
op_name: partition
54+
type: aggregate
55+
dependencies:
56+
- build_kg
57+
params:
58+
method: anchor_bfs # partition method
59+
method_params:
60+
anchor_type: [dna, rna, protein] # Multi-omics: support multiple anchor types (list or single string)
61+
max_units_per_community: 10 # max nodes and edges per community
62+
63+
- id: generate
64+
op_name: generate
65+
type: map_batch
66+
dependencies:
67+
- partition
68+
execution_params:
69+
replicas: 1
70+
batch_size: 128
71+
params:
72+
method: omics_qa # unified QA generation method for DNA/RNA/Protein
73+
data_format: ChatML # Alpaca, Sharegpt, ChatML
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{"_doc_id":"doc-NG_011079","type":"dna","content":"Title: Homo sapiens ribosomal protein L35a pseudogene 6 (RPL35AP6) on chromosome 1\nSequence: ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_011079","gene_name":"RPL35AP6","gene_description":"ribosomal protein L35a pseudogene 6","organism":"Homo sapiens","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_011079","gene_synonyms":["RPL35A_3_191"],"gene_type":"other","chromosome":"1","genomic_location":"1-522","function":null,"title":"Homo sapiens ribosomal protein L35a pseudogene 6 (RPL35AP6) on chromosome 1","sequence":"ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG","sequence_length":522,"gene_id":"100271312","molecule_type_detail":"genomic region","_search_query":"ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"}
2+
{"_doc_id":"doc-NG_033923","type":"dna","content":"Title: Callithrix jacchus immunity-related GTPase family, M, pseudogene (IRGMP) on chromosome 2\nSequence: GAACTCCTGACCTCAGGTGATCCACCTGCTTTGGCCTCCCAAAATGCCAGGATTACAGGTATGAGCCACCACGCCCAGCCAGCATTGGGGTATATCGAAGGCAGAGGTCATGAATGTTGAGAGAGCCTCAGCAGATGGGGACTTGCCAGAGGTGGTCTCTGCCATCAAGGAGAGTTTGAAGATAGTGTTCAGGACACCAGTCAACATCGCTATGGCAGGGGACTCTGGCAATAGCATATCCACCTTCATCAGTGCACTTCAAATCGCAGGGCATGAGGCGAAGGCCTCACCTCCTACTGGGCTGGTAAAAGCTACCCAAAGATGTGCCTCCTATTTCTCTTCCCGCTTTCCAAATGTGGTGCTGTGGGATCTGCCTGGAGCAGGGTCTGCCACCAAAACTCTGGAGAACTACCTGATGGAAATGTAGTTCAACCAATATGACTTCATCATGGTTGCATCTGCACAATTCAGCATGAATCATGTGATCCTTGCCAAAACCATTGAGGACATGGGAAAGAAGTTCTACATTGTCTGGACCAAGCTGGACATGGATCTCAGCACAGGTGCCCTCCCAGAAGTGCAGCTACTGTAAATCAGAGAAAATGTCCTGGAAAGTCTCCAGAGGGAGCAGGTATGTGAACTCCCCATATTTATGGCCTCCAGCCTTGAACCTTTATTGCATGACTTCCCAAAGCTTAGAGACACATTGCAAAAGACTCATCCAAATTAGGTGCCATGGCCCTCTTCAAAACCTGTCCCACACCTGTGAGATGATCACGAATGACAAAGCAATCTCCCTGCAGAAGAAAACAACCATACAGTCTTTCCAG","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_033923","gene_name":"IRGMP","gene_description":"immunity-related GTPase family, M, pseudogene","organism":"Callithrix jacchus","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_033923","gene_synonyms":null,"gene_type":"other","chromosome":"2","genomic_location":"1-830","function":null,"title":"Callithrix jacchus immunity-related GTPase family, M, pseudogene (IRGMP) on chromosome 2","sequence":"GAACTCCTGACCTCAGGTGATCCACCTGCTTTGGCCTCCCAAAATGCCAGGATTACAGGTATGAGCCACCACGCCCAGCCAGCATTGGGGTATATCGAAGGCAGAGGTCATGAATGTTGAGAGAGCCTCAGCAGATGGGGACTTGCCAGAGGTGGTCTCTGCCATCAAGGAGAGTTTGAAGATAGTGTTCAGGACACCAGTCAACATCGCTATGGCAGGGGACTCTGGCAATAGCATATCCACCTTCATCAGTGCACTTCAAATCGCAGGGCATGAGGCGAAGGCCTCACCTCCTACTGGGCTGGTAAAAGCTACCCAAAGATGTGCCTCCTATTTCTCTTCCCGCTTTCCAAATGTGGTGCTGTGGGATCTGCCTGGAGCAGGGTCTGCCACCAAAACTCTGGAGAACTACCTGATGGAAATGTAGTTCAACCAATATGACTTCATCATGGTTGCATCTGCACAATTCAGCATGAATCATGTGATCCTTGCCAAAACCATTGAGGACATGGGAAAGAAGTTCTACATTGTCTGGACCAAGCTGGACATGGATCTCAGCACAGGTGCCCTCCCAGAAGTGCAGCTACTGTAAATCAGAGAAAATGTCCTGGAAAGTCTCCAGAGGGAGCAGGTATGTGAACTCCCCATATTTATGGCCTCCAGCCTTGAACCTTTATTGCATGACTTCCCAAAGCTTAGAGACACATTGCAAAAGACTCATCCAAATTAGGTGCCATGGCCCTCTTCAAAACCTGTCCCACACCTGTGAGATGATCACGAATGACAAAGCAATCTCCCTGCAGAAGAAAACAACCATACAGTCTTTCCAG","sequence_length":830,"gene_id":"100409682","molecule_type_detail":"genomic region","_search_query":"NG_033923"}
3+
{"_doc_id":"doc-NG_056118","type":"dna","content":"Title: Homo sapiens major histocompatibility complex, class II, DR beta 8 (pseudogene) (HLA-DRB8) on chromosome 6\nSequence: GCCAGAGCCTAGGTTTACAGAGAAGCAGACAAACAAAACAGCCAAACAAGGAGACTTACTCTGTCTTCATGACTCATTCCCTCTACATTTTTTCTTCTAGTCCATCCTAAGGTGACTGTGTATCCTTTAAAGACCCAGCCCCTGCAGCACCACAACCTCCTGGTCTGCTCTGTGAGTGGTTTCTGTCCAGCCAGCATTGAAGTCAGGTGGTTCCGGAACGGCCAGGAAGAGAAGGCTGGGGTGGTGTCCACAGGCCTGATCCAGAATGGAGACTGGACCTTCCAGACACTGATGATGCTGGAAACAGTTCCTCAGAGTGGAGAGGTTTACACCTGCCAAGTGGAGCATCCAAGCATGATGAGCCCTCTCACGGTGCAATGGAGTTAGCAGCTTTCTGACTTCATAAATTTTTCACCCAGTAAGTACAGGACTGTGCTAATCCCTGAGTGTCAGGTTTCTCCTCTCCCACATCCTATTTTCATTTGCTCCATATTCTCATCTCCATCAGCACAGGTCACTGGGGATAGCCCTGTAATCATTTCTAAAAGCACCTGTACCCCATGGTAAAGCAGTCATGCCTGCCAGGCGGGAGAGGCTGTCTCTCTTTTGAACCTCCCCATGATGGCACAGGTCAGGGTCACCCACTCTCCCTGGCTCCAGGCCCTGCCTCTGGGTCTGAGATTGTATTTCTGCTGCTGTTGCTCTGGGTTGTTTGTTGTGATCTGAGAAGAGGAGAACTGTAGGGGTCTTCCTGGCATGAGGGGAGTCCAATCCCAGCTCTGCCTTTTATTAGCTCTGTCACTCTAGACAAACTACTAAACCTCTTTGAGTCTCAGGATTTCTGTGGATCAGATGTCAAAGTCATGCCTTACATCAAGGCTGTAATATTTGAATGAGTTTGAGGCCTAACCTTGTAACTGTTCAGTGTGATCTGAAAACCTTTTTTCCCCAGAAATAGCTAGTTATTTTAGTTCTTGCAGGGCAGCCTTCTTCCCCATTTTCAAAGCTCTGAATCTCAGTATCTCAATTACAGAGGTTCAATTTGGGATAAAAATCACTAAACCTGGCTTCCACTCTCAGGAGCATGGTCTGAATCTGCACAGAGCAAGATGCTGAGTGGAGTCGGGGGCTTTGTGCTGGGCCTGCTCTTCCTTGGGGCCGGGCTGTTTCTCTACTTCAGGAATCAGAAAGGTGAGGAACCTTTCGTAGCTGGCTCTCTCCATAGACTTTTCTGGAGGAGGAAATATGGCTTTGCAGAGGTTAGTTCTCAGTATATGAGTGGCCCTGGATAAAGCCTTTCTTTCCCAAAACGACCTCCAATGTCCCGCTAATCCAGAAATCATCAGTGCATGGTTACTATGTCAAAGCATAATAGCTTATGGCCTGCAGAGAGAAAAGAAAGGCTAACAAGTAGGGATCCTTTGGTTGGAGATCCTGGAGCAAATTAAGGAAGAGCCACTAAGGTTAATACAATTACACTGGATCCTATGACAGACACTTCACGCTTCAGGGGTCACGTGGTGAGTTTCTGCTCCTCTCTGCCCTGGTTCATGTAAGTTGTGGTGTTAGAGAAATCTCAGGTGGGAGATCTGGGGCTGGGATATTGTGTTGGAGGACAGATTTGCTTCCATATCTTTTTTCTTTTTTCTTTTTTTTGAGACGGAGTCTCGCTCTGTCCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAACCTCCTTCTCCCGGATTCAAGTGATTCTCCTGCCTCAACCTCCCGAGTAGCTGGGACTATAGGCACCTGCCACCACGCCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAAGATGGTCTCGATCTCTTGACCTTGTGATCCACCCAACTTGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCACCCGGCCTGCTTCCATATCTTTTAAATGTGTATCTTTTCCCCTTTTTCCCAGGACACTCTGGACTTCAGCCAACAGGTAATACCTTTTCATTCTCTTTTAGAAACAGATTCGCTTTCCTAGAATGATGGTAGAGGTGATAAGGGATGAGACAGAAATAATAGGAAAGACTTTGGATCCAAATTTCTGATCAGGCAATTTACGCCAAAACTCCTCTCTACTTAGAAAAGGCCTGTGCTTGGCCAGGCGCAGTAGCTCATGCCTGTAATCTCAGCACTTTGGGAGGCTGAGGCGGGTGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACAAGGAGAAACCTTGTCTCTACTAAAAATACAAAAAAAATTAGCCATGCGTGGTGGCGCATGCCTGTAATTCCAGCTACTGAGGAGGCTGAGGTAGGAGAATGGTTTGAAGCTGGGAGGCAGAGGTTGTGGTAAGCGCACCACTGCACTCCAGCCTGGGCAACAAGAGTGAAACTCCATCTGAAAAAATGAATAAATAAAAAATAAAAGGCCAGTGCTCTGCAGTAGTATTGGCTCAGGGAGACTTAGCAACTTGTTTTTCTTCTTCCTGTACTGCTTTCATCTGAGTCCCTGAAAGAGGGGGAAAGAAGCTGTTAGTAGAGCCATGTCTGAAAACAACACTCTCCTGTGTCTTCTGCAGGACTCCTGAACTGAAGTGAAGATGACCACATTCAAGGAGGAAACTTCTGCCCCAGCTTTGCAGGAGGAAAAGCTTTTCCGCTTGGCTCTTTTTTTTTTTTTTAGTTTTATTTAT","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_056118","gene_name":"HLA-DRB8","gene_description":"major histocompatibility complex, class II, DR beta 8 (pseudogene)","organism":"Homo sapiens","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_056118","gene_synonyms":null,"gene_type":"other","chromosome":"6","genomic_location":"1-2737","function":null,"title":"Homo sapiens major histocompatibility complex, class II, DR beta 8 (pseudogene) (HLA-DRB8) on chromosome 6","sequence":"GCCAGAGCCTAGGTTTACAGAGAAGCAGACAAACAAAACAGCCAAACAAGGAGACTTACTCTGTCTTCATGACTCATTCCCTCTACATTTTTTCTTCTAGTCCATCCTAAGGTGACTGTGTATCCTTTAAAGACCCAGCCCCTGCAGCACCACAACCTCCTGGTCTGCTCTGTGAGTGGTTTCTGTCCAGCCAGCATTGAAGTCAGGTGGTTCCGGAACGGCCAGGAAGAGAAGGCTGGGGTGGTGTCCACAGGCCTGATCCAGAATGGAGACTGGACCTTCCAGACACTGATGATGCTGGAAACAGTTCCTCAGAGTGGAGAGGTTTACACCTGCCAAGTGGAGCATCCAAGCATGATGAGCCCTCTCACGGTGCAATGGAGTTAGCAGCTTTCTGACTTCATAAATTTTTCACCCAGTAAGTACAGGACTGTGCTAATCCCTGAGTGTCAGGTTTCTCCTCTCCCACATCCTATTTTCATTTGCTCCATATTCTCATCTCCATCAGCACAGGTCACTGGGGATAGCCCTGTAATCATTTCTAAAAGCACCTGTACCCCATGGTAAAGCAGTCATGCCTGCCAGGCGGGAGAGGCTGTCTCTCTTTTGAACCTCCCCATGATGGCACAGGTCAGGGTCACCCACTCTCCCTGGCTCCAGGCCCTGCCTCTGGGTCTGAGATTGTATTTCTGCTGCTGTTGCTCTGGGTTGTTTGTTGTGATCTGAGAAGAGGAGAACTGTAGGGGTCTTCCTGGCATGAGGGGAGTCCAATCCCAGCTCTGCCTTTTATTAGCTCTGTCACTCTAGACAAACTACTAAACCTCTTTGAGTCTCAGGATTTCTGTGGATCAGATGTCAAAGTCATGCCTTACATCAAGGCTGTAATATTTGAATGAGTTTGAGGCCTAACCTTGTAACTGTTCAGTGTGATCTGAAAACCTTTTTTCCCCAGAAATAGCTAGTTATTTTAGTTCTTGCAGGGCAGCCTTCTTCCCCATTTTCAAAGCTCTGAATCTCAGTATCTCAATTACAGAGGTTCAATTTGGGATAAAAATCACTAAACCTGGCTTCCACTCTCAGGAGCATGGTCTGAATCTGCACAGAGCAAGATGCTGAGTGGAGTCGGGGGCTTTGTGCTGGGCCTGCTCTTCCTTGGGGCCGGGCTGTTTCTCTACTTCAGGAATCAGAAAGGTGAGGAACCTTTCGTAGCTGGCTCTCTCCATAGACTTTTCTGGAGGAGGAAATATGGCTTTGCAGAGGTTAGTTCTCAGTATATGAGTGGCCCTGGATAAAGCCTTTCTTTCCCAAAACGACCTCCAATGTCCCGCTAATCCAGAAATCATCAGTGCATGGTTACTATGTCAAAGCATAATAGCTTATGGCCTGCAGAGAGAAAAGAAAGGCTAACAAGTAGGGATCCTTTGGTTGGAGATCCTGGAGCAAATTAAGGAAGAGCCACTAAGGTTAATACAATTACACTGGATCCTATGACAGACACTTCACGCTTCAGGGGTCACGTGGTGAGTTTCTGCTCCTCTCTGCCCTGGTTCATGTAAGTTGTGGTGTTAGAGAAATCTCAGGTGGGAGATCTGGGGCTGGGATATTGTGTTGGAGGACAGATTTGCTTCCATATCTTTTTTCTTTTTTCTTTTTTTTGAGACGGAGTCTCGCTCTGTCCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAACCTCCTTCTCCCGGATTCAAGTGATTCTCCTGCCTCAACCTCCCGAGTAGCTGGGACTATAGGCACCTGCCACCACGCCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAAGATGGTCTCGATCTCTTGACCTTGTGATCCACCCAACTTGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCACCCGGCCTGCTTCCATATCTTTTAAATGTGTATCTTTTCCCCTTTTTCCCAGGACACTCTGGACTTCAGCCAACAGGTAATACCTTTTCATTCTCTTTTAGAAACAGATTCGCTTTCCTAGAATGATGGTAGAGGTGATAAGGGATGAGACAGAAATAATAGGAAAGACTTTGGATCCAAATTTCTGATCAGGCAATTTACGCCAAAACTCCTCTCTACTTAGAAAAGGCCTGTGCTTGGCCAGGCGCAGTAGCTCATGCCTGTAATCTCAGCACTTTGGGAGGCTGAGGCGGGTGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACAAGGAGAAACCTTGTCTCTACTAAAAATACAAAAAAAATTAGCCATGCGTGGTGGCGCATGCCTGTAATTCCAGCTACTGAGGAGGCTGAGGTAGGAGAATGGTTTGAAGCTGGGAGGCAGAGGTTGTGGTAAGCGCACCACTGCACTCCAGCCTGGGCAACAAGAGTGAAACTCCATCTGAAAAAATGAATAAATAAAAAATAAAAGGCCAGTGCTCTGCAGTAGTATTGGCTCAGGGAGACTTAGCAACTTGTTTTTCTTCTTCCTGTACTGCTTTCATCTGAGTCCCTGAAAGAGGGGGAAAGAAGCTGTTAGTAGAGCCATGTCTGAAAACAACACTCTCCTGTGTCTTCTGCAGGACTCCTGAACTGAAGTGAAGATGACCACATTCAAGGAGGAAACTTCTGCCCCAGCTTTGCAGGAGGAAAAGCTTTTCCGCTTGGCTCTTTTTTTTTTTTTTAGTTTTATTTAT","sequence_length":2737,"gene_id":"3130","molecule_type_detail":"genomic region","_search_query":"NG_056118"}

0 commit comments

Comments
 (0)