Skip to content

Commit d922c6a

Browse files
committed
python script to create cdb
1 parent ea02073 commit d922c6a

File tree

1 file changed

+67
-0
lines changed

1 file changed

+67
-0
lines changed
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import os
2+
import pandas as pd
3+
from medcat.config import Config
4+
from medcat.cdb_maker import CDBMaker
5+
6+
# Semantic tags whose terms must not be collapsed to an acronym. This is the
# regex alternation handed to ``Series.str.contains``; note that the bare
# "metadata" alternative already matches the more specific metadata tags.
ACRONYM_EXCLUDED_TAGS = (
    "assessment scale|"
    "core metadata concept|"
    "metadata|"
    "foundation metadata concept"
    "|OWL metadata concept"
)

# 2-6 capital letters followed by " - ", e.g. "COPD - chronic ...". The
# pattern is not anchored, so the caller still checks that the name really
# starts with the captured acronym.
ACRONYM_PATTERN = "([A-Z]{2,6}) - "


def clean_snomed_names(df):
    """Return a cleaned copy of a SNOMED pre-CDB table.

    The input frame is not modified. Three steps are applied:

    1. Missing ``name`` values are replaced with the string ``"N/A"``.
    2. For rows whose ``description_type_ids`` does not match
       ``ACRONYM_EXCLUDED_TAGS`` and whose ``name_status`` is ``'A'``, a name
       that starts with an acronym followed by " - " is shortened to the
       acronym alone.
    3. Exact duplicate rows are dropped (first occurrence kept) and the index
       is reset.

    Args:
        df: DataFrame with at least the columns ``name``, ``name_status`` and
            ``description_type_ids``.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    df = df.copy()

    # Remove null values. Assign the column directly instead of writing
    # through ``df['name'].iloc[...]``, which is chained assignment and is not
    # guaranteed to reach the frame.
    df['name'] = df['name'].fillna("N/A")

    # Only look for acronyms under the allowed semantic tags. ``na=False``
    # treats a missing tag as "not excluded" instead of failing on ``~NaN``.
    excluded = df['description_type_ids'].str.contains(ACRONYM_EXCLUDED_TAGS, na=False)
    df['acronym'] = df.loc[~excluded, 'name'].str.extract(ACRONYM_PATTERN, expand=False)

    # A row is collapsed when it has an acronym, is an 'A'-status name, and
    # the name begins with that acronym. Missing acronyms are not ``str``
    # instances, so they are skipped by the isinstance check.
    collapse = pd.Series(
        [
            isinstance(acronym, str) and status == 'A' and name.startswith(acronym)
            for name, acronym, status in zip(df['name'], df['acronym'], df['name_status'])
        ],
        index=df.index,
        dtype=bool,
    )
    df['name'] = df['name'].mask(collapse, df['acronym'])

    # NOTE(review): duplicates are judged with the helper 'acronym' column
    # still present, exactly as the original script did.
    df = df.drop_duplicates(keep='first').reset_index(drop=True)
    df.pop('acronym')
    return df


def main():
    """Prompt for a SNOMED pre-CDB csv, clean it and build/save a MedCAT CDB.

    The release tag is taken from the 8 characters preceding the 4-character
    file extension of the entered path, and the model is written to
    ``./models/<release>_SNOMED_cdb.dat``.
    """
    # Kept from the original script: silence pandas' chained-assignment
    # warning for the rest of the run.
    pd.options.mode.chained_assignment = None

    csv_path = input("Enter specific SNOMED pre-cdb csv found in the path data/snomed: ")
    release = csv_path[-12:-4]

    if not os.path.exists('models'):
        os.makedirs('models')
        print("Creating a 'models' folder to store model")

    model_dir = './models/'
    output_cdb = model_dir + f"{release}_SNOMED_cdb.dat"

    print("Cleaning acronyms...")
    csv = clean_snomed_names(pd.read_csv(csv_path))
    print("acronyms complete")

    # Setup config
    config = Config()
    config.general['spacy_model'] = 'en_core_web_md'
    config.cdb_maker['remove_parenthesis'] = 1
    config.general['cdb_source_name'] = f'SNOMED_{release}'

    maker = CDBMaker(config)

    # The maker reads csv files from disk, so the cleaned table has to be
    # written out; passing the original path would discard all cleaning above.
    cleaned_csv_path = os.path.join(model_dir, f"{release}_SNOMED_cleaned_tmp.csv")
    csv.to_csv(cleaned_csv_path, index=False)
    try:
        # Create your CDB
        # Add more cdbs to the list
        csv_paths = [cleaned_csv_path]
        cdb = maker.prepare_csvs(csv_paths, full_build=True)
    finally:
        # The intermediate file is only needed while the CDB is being built.
        if os.path.exists(cleaned_csv_path):
            os.remove(cleaned_csv_path)

    # Add type_id pretty names to cdb
    cdb.addl_info['type_id2name'] = pd.Series(
        csv.description_type_ids.values, index=csv.type_ids.astype(str)
    ).to_dict()
    # Add all cuis to filter out legacy terms.
    # NOTE(review): assumes the CDB object exposes `linking` directly --
    # confirm against the installed MedCAT version.
    cdb.linking['filters']['cuis'] = set(csv['cui'].tolist())

    # save model
    cdb.save(output_cdb)
    print(f"CDB Model saved successfully as: {output_cdb}")


if __name__ == "__main__":
    main()
64+
65+
66+
67+

0 commit comments

Comments
 (0)