-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathrun.py
More file actions
89 lines (64 loc) · 2.81 KB
/
run.py
File metadata and controls
89 lines (64 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import pickle
from nalaf.utils.download import DownloadArticle
from nalaf.utils.readers import StringReader, PMIDReader
from loctext.learning.annotators import StringTagger
from loctext.util import PRO_ID, LOC_ID, ORG_ID, REL_PRO_LOC_ID, UNIPROT_NORM_ID, GO_NORM_ID, TAXONOMY_NORM_ID, repo_path
from loctext.learning.annotators import LocTextDXModelRelationExtractor
# Paths of the relation-extraction model and of its selected-features file,
# both built with `repo_path` from path components.
RE_MODEL_PATH = repo_path(
    "resources",
    "models",
    "D0_9606,3702,4932_1497520729.163767.bin",
)
RE_FEATURES_PATH = repo_path(
    "resources",
    "features",
    "selected",
    "0_True_LinearSVC_C=2.0-1487943476.673364-NAMES.py",
)

# The model is unpickled once, while this module is being executed, and the
# resulting object is shared through RE_MODEL_BIN.
# NOTE: unpickling can run arbitrary code, so RE_MODEL_PATH must only ever
# point at a trusted file.
with open(RE_MODEL_PATH, "rb") as model_file:
    RE_MODEL_BIN = pickle.load(model_file)
def parse_arguments(argv=None):
    """Parse the command-line options of the LocText runner.

    Exactly one input source must be given: ``--text`` or ``--pmid``.
    ``--entity_tagger_url`` is an independent option that can accompany
    either of them; it is deliberately registered outside the mutually
    exclusive input group, because inside it the option could not be combined
    with an input and, given alone, satisfied the "required" check without
    providing any input.

    Args:
        argv: Sequence of argument strings to parse (without the program
            name). When it is ``None`` or empty, ``sys.argv[1:]`` is parsed
            instead; an empty list could never satisfy the required input
            group anyway.

    Returns:
        The ``argparse.Namespace`` with the attributes ``text``, ``pmid`` and
        ``entity_tagger_url``.

    Raises:
        SystemExit: Raised by ``argparse`` when the arguments are invalid,
            e.g. when no input source or both input sources are given.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Run LocText on some text to extract Protein<-->Cell Compartments relations',
    )

    # The two input sources exclude each other and one of them is mandatory.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--text",
        help="Run against given text/string",
    )
    input_group.add_argument(
        "--pmid",
        help="Run against the abstract of the given PubMed ID (PMID), downloaded from NCBI",
    )

    parser.add_argument(
        "--entity_tagger_url",
        default="http://127.0.0.1:5000",
        help="URL (include host and port) of the dockerized STRING Tagger server",
    )

    # Passing None makes argparse read sys.argv[1:] itself.
    return parser.parse_args(argv if argv else None)
def read_models(args):
    """Instantiate the entity tagger and the relation extractor.

    The ``*_ID`` constants that label entities and relations are arbitrary
    values. Callers nonetheless need to know them afterwards to pick out the
    entity or relation types they are interested in.

    Args:
        args: Parsed command-line namespace; only ``entity_tagger_url`` is
            read here.

    Returns:
        A ``(ner, re)`` tuple: the ``StringTagger`` followed by the
        ``LocTextDXModelRelationExtractor``.
    """
    entity_tagger = StringTagger(
        PRO_ID,
        LOC_ID,
        ORG_ID,
        UNIPROT_NORM_ID,
        GO_NORM_ID,
        TAXONOMY_NORM_ID,
        host=args.entity_tagger_url,
    )

    extractor_options = dict(
        sentence_distance=0,
        selected_features_file=RE_FEATURES_PATH,
        use_predicted_entities=True,
        # The module-level model object is handed over as is.
        model=RE_MODEL_BIN,
        preprocess=True,
        class_weight=None,
        kernel='linear',
        C=1,
    )
    relation_extractor = LocTextDXModelRelationExtractor(
        PRO_ID, LOC_ID, REL_PRO_LOC_ID, **extractor_options
    )

    return entity_tagger, relation_extractor
def run_with_argv(argv=None):
    """Run the full LocText pipeline for the given command-line arguments.

    The arguments are parsed, the entity tagger and the relation extractor
    are created, the input is read into a corpus, and both annotators are
    applied to that corpus in place (entities first, then relations).

    Args:
        argv: Sequence of command-line argument strings, forwarded to
            ``parse_arguments``.

    Returns:
        The corpus after both annotators have been applied to it.

    Raises:
        ValueError: If the parsed arguments carry neither a non-empty
            ``text`` nor a non-empty ``pmid``. Previously this case surfaced
            as an ``UnboundLocalError`` on ``corpus``.
    """
    args = parse_arguments(argv)
    ner, relation_extractor = read_models(args)

    # See more possible readers including some NCBI XML files in `nalaf.utils.readers`
    if args.text:
        corpus = StringReader(args.text).read()
    elif args.pmid:
        corpus = PMIDReader(args.pmid).read()
    else:
        raise ValueError("No input to process: provide either --text or --pmid")

    ner.annotate(corpus)
    relation_extractor.annotate(corpus)

    return corpus
if __name__ == "__main__":
    import sys

    # Script entry point: annotate the requested input and print the results.
    annotated_corpus = run_with_argv(sys.argv[1:])

    # Each section is a heading plus the corpus method yielding its items.
    # The method is only called after the heading has been printed.
    report_sections = (
        ("# Predicted entities:", annotated_corpus.predicted_entities),
        ("# Predicted relations:", annotated_corpus.predicted_relations),
    )
    for heading, fetch_items in report_sections:
        print()
        print(heading)
        for item in fetch_items():
            print(item)