Skip to content

Commit b955211

Browse files
authored
Merge pull request #7 from snakemake-workflows/fix/profile_and_localizing
fix: profile and localizing
2 parents 5443dad + 0c46ed2 commit b955211

File tree

10 files changed

+593
-310
lines changed

10 files changed

+593
-310
lines changed

config/config.yaml

Lines changed: 54 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,64 @@
1-
# NOTE: All paths should be fully qualified paths
1+
#Name your experiment here
2+
EXPERIMENT_NAME: "test"
23

3-
# Path to raw ligand data | DATABASE/DATASET/file
4-
INPUT_DIR: "/lustre/project/m2_jgu-smitt/data/raw"
4+
# NOTE: All paths are relative to the workflow directory (or --directory if specified)
55

6+
# Path to raw ligand data | DATABASE/DATASET/file
67
# If you want to provide target PDB files manually, place them in a subfolder of the input dir called "/PDB/receptor"
78

8-
9-
# Path to output prepared target proteins
10-
PREPARED_DATA_DIR: "/lustre/project/m2_jgu-smitt/data/prepared"
11-
12-
# Path to energy minimized ligand files
13-
PREPARED_LIGAND_DIR: "/lustre/project/m2_jgu-smitt/data/minimized"
14-
15-
# Path to scratch directory
16-
TEMP_DATA_DIR: "/lustre/scratch/m2_jgu-smitt"
17-
18-
# Path where docking results are stored
19-
OUTPUT_DIR: "/lustre/project/m2_jgu-smitt/<FOLDER>"
20-
21-
# Number of best results to be displayed (0<value<=1: percentage )
9+
# Percentage of best results to be considered for display and re-screening (0<value<=1: percentage )
10+
#TODO: change the wording
2211
RESULT_NUMBER: "10"
2312

24-
# Specify cutoff value for rescreening
13+
# Specify cutoff value for rescreening (in kcal/mol)
2514
CUTOFF_VALUE: "-8"
2615

2716
#Specify name for local uploaded data
28-
# note: this will be ignored, if a 'DATABASE' (see below) is specified
17+
# NOTE: this will be ignored, if a 'DATABASE' (see below) is specified
2918
LOC_DATA: ["DATASET"]
3019

31-
#Path to folder which contains compounds
32-
# Here, a full qualified path should be indicated.
33-
# note: this will be ignored, if a 'DATABASE' (see below) is specified
34-
LOCAL_INPUT_DIR: "<LOCAL_INPUT_DIR>"
35-
36-
#Specify database to use ZINC usees and downloads compounds from ZINC database, others read local input from LOCAL_INPUT_DIR
37-
38-
DATABASE: ["ZINC"]
39-
40-
# First letter is the molecular weight bin - a measure of size - horizontal axis, left to right, online. A: 200 D, B: 250, C:300, D: 325, E:350, F: 375
41-
# Second letter is the logP bin - a measure of polarity - vertical axis, top to bottom, online.
42-
# The third letter is reactivity : A=anodyne. B=Bother (e.g. chromophores) C=clean (but pains ok), E=mild reactivity ok, G=reactive ok, I = hot chemistry ok
43-
# The fourth letter is purchasability: A and B = in stock, C = in stock via agent, D = make on demand, E = boutique (expensive), F=annotated (not for sale)
44-
# The fifth letter is pH range: R = ref (7.4), M = mid (near 7.4), L = low (around 6.4), H=high (around 8.4).
45-
# The sixth and last dimension is net molecular charge. Here we follow the convention of InChIkeys.
46-
# Thus. N = neutral, M = minus 1, L = minus 2 (or greater). O = plus 1, P = plus 2 (or greater).
20+
# Path to folder which contains compounds
21+
# Here, a fully qualified path should be indicated.
22+
# NOTE: this will be ignored if a 'DATABASE' (see below) is specified
23+
LOCAL_INPUT_DIR: ""
24+
25+
# Specify "ZINC" to obtain compounds from the ZINC database.
26+
# Otherwise read local input from the LOCAL_INPUT_DIR, above.
27+
#TODO: unlist DATABASE
28+
DATABASE: "ZINC"
29+
30+
# Specify a ZINC mirror site. Options are:
31+
# - files.docking.org
32+
# - ftp.uni-mainz.de/mirror/zink20/
33+
#ZINC_MIRROR: "ftp.uni-mainz.de/mirror/zink20/"
34+
ZINC_MIRROR: "files.docking.org"
35+
36+
# Select the part of the ZINC database for screening. This section follows the ZINC notation and is
37+
# outlined, here:
38+
# - the 1st letter is the molecular weight bin - a measure of size - horizontal axis,
39+
# left to right, as shown on the ZINC webpage. A: 200 D, B: 250, C:300, D: 325, E:350, F: 375
40+
# - the 2nd letter is the logP bin - a measure of polarity - vertical axis, top to bottom,
41+
# as shown on the ZINC webpage.
42+
# - the 3rd letter defines reactivity : A=anodyne, B=Bother (e.g. chromophores),
43+
# C=clean (but pains ok), E=mild reactivity ok, G=reactive ok, I = hot chemistry ok
44+
# - the 4th letter notes purchasability: A and B = in stock, C = in stock via agent,
45+
# D = make on demand, E = boutique (expensive), F=annotated (not for sale)
46+
# - the 5th letter defines pH range: R = ref (7.4), M = mid (near 7.4), L = low (around 6.4),
47+
# H=high (around 8.4).
48+
# - the 6th and last dimension is net molecular charge. Here we follow the convention of InChIkeys.
49+
# Thus: N = neutral, M = minus 1, L = minus 2 (or greater), O = plus 1, P = plus 2 (or greater).
4750

4851
ZINC_INPUT:
49-
WEIGHT: ["A", "B"] #["C","D","E","F","G"]
50-
LOGP: ["A"] # ,"D","E","F","G", "H","I","J"]
52+
WEIGHT: ["B", "C"] #["C","D","E","F","G"]
53+
LOGP: ["B", "C"] # ,"D","E","F","G", "H","I","J"]
5154
REACT: ["A"] #,"B"] # ,"C", "E", "G"]
52-
PURCHASE: ["A"] #, "B"] #, "C", "D", "E"]
53-
PH: ["M"]
54-
CHARGE: ["N"] # ,"M","O","L","P"]
55+
PURCHASE: ["B"] #, "B"] #, "C", "D", "E"]
56+
PH: ["M", "R"]
57+
CHARGE: ["P"] # ,"M","O","L","P"]
5558

56-
#In case you don't want to download tranches from ZINC based on the paramters given above, a ZINC subset can be choosen. Otherwise set subset as TRANCHES
57-
# ex.
58-
SUBSET: "<SUBSET_NAME>"
59+
# In case you don't want to download tranches from ZINC based on the parameters given above,
60+
# a ZINC subset can be chosen. Otherwise set subset as TRANCHES.
61+
SUBSET: "TRANCHES"
5962

6063
#Specify ENAMINE collection
6164
ENAMINE_INPUT:
@@ -66,23 +69,21 @@ ENAMINE_INPUT:
6669

6770
ENAMINE_URL: http://www.enamine.net/files/Stock_Screening_Collections/
6871

72+
# Specify whether rescreening is desired ("TRUE" or "FALSE")
73+
# Rescreening will be performed on the top results as specified by 'RESULT_NUMBER' and 'CUTOFF_VALUE'
74+
# for the targets specified in 'RESCREENING_TARGETS', below.
6975
RESCREENING: "FALSE"
7076

71-
# Specify target enzyme ID and chains format: ["PDB_ID, <CHAIN_1> <CHAIN_2]
72-
TARGETS: ["TARGET,A B C"]
77+
# Specify target PDB ID and chains in this format: ["PDB_ID, <CHAIN_1> <CHAIN_2>"], e.g.:
78+
TARGETS: ["7CWM, A B C"]
7379

7480
# to be specified, if 'RESCREENING' is desired (RESCREENING: "TRUE")
75-
RESCREENING_TARGETS: ["TARGET1,A B C", "TARGET2,A B C", "TARGET3, A B C"]
76-
81+
RESCREENING_TARGETS: ["6ACD, A B C", "6NB3, A B C", "7BNN, A B C"]
7782

7883
TARGET_URL: https://files.rcsb.org/download
79-
GRID_DIR: "/<GRID_DIRECTORY>"
80-
81-
#Name your experiment here or change it in the final json file
82-
83-
EXPERIMENT_NAME: "<Name>"
84+
GRID_DIR: "GRID"
8485

85-
#parameters for energy minimization
86+
# parameters for energy minimization
8687
ENERGY_MIN_ALGORITHM: 'cg'
8788
CONVERGENCE_CRITERIA: '1e-6'
8889
STEPS: '2500'

profiles/Mogon-NHR/config.yaml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
default-resources:
2-
slurm_partition: smallcpu
2+
executor: "slurm"
3+
slurm_account: "nhr-zdvhpc"
4+
slurm_partition: "smallcpu"
35
mem_mb_per_cpu: 1800
46
runtime: "30m"
57
clusters: "mogonnhr"
68

79
set-resources:
810
docking:
911
mem_mb_per_cpu: 3000
10-
slurm_partition: parallel
11-
ntasks: 512
12+
slurm_partition: "parallel"
13+
tasks: 512
14+
runtime: 500
1215
energyMin:
1316
mem_mb: 350
1417
runtime: 90

workflow/Snakefile

Lines changed: 9 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -5,31 +5,18 @@ from snakemake.utils import min_version
55

66
min_version("7.19.1") # this is where SLURM support was introduced
77

8-
INPUT_DIR = config["INPUT_DIR"]
9-
10-
MIN_DIR = config["PREPARED_LIGAND_DIR"]
11-
12-
PREPARED_DIR = config["PREPARED_DATA_DIR"]
13-
14-
OUTPUT_DIR = config["OUTPUT_DIR"]
15-
16-
TMP_DIR = config["TEMP_DATA_DIR"]
178

189
LOCAL_INPUT = config["LOCAL_INPUT_DIR"]
19-
2010
DATABASE = config["DATABASE"]
21-
2211
SUBSET = config["SUBSET"]
23-
2412
RESCREENING_TARGETS = config["RESCREENING_TARGETS"]
2513

2614

2715
def generateOutput(wildcards):
28-
irods = path.join(OUTPUT_DIR, "results", "irods.json")
16+
irods = path.join("results", "irods.json")
2917
if config["RESCREENING"] == "TRUE":
3018
out = expand(
3119
path.join(
32-
OUTPUT_DIR,
3320
"results",
3421
"rescreening_{percentage}",
3522
"{receptorID}",
@@ -40,54 +27,22 @@ def generateOutput(wildcards):
4027
combAll=combAll,
4128
)
4229
hist = expand(
43-
path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"),
30+
path.join("results", "{receptorID}_hist.png"),
4431
receptorID=config["TARGETS"][0].split(",")[0],
4532
)
46-
4733
return hist + out + [irods]
48-
4934
else:
5035
out = expand(
51-
path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"),
36+
path.join("results", "{receptorID}_{percentage}.csv"),
5237
receptorID=config["TARGETS"][0].split(",")[0],
5338
percentage=config["RESULT_NUMBER"],
5439
)
5540
hist = expand(
56-
path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"),
41+
path.join("results", "{receptorID}_hist.png"),
5742
receptorID=config["TARGETS"][0].split(",")[0],
5843
)
5944
return hist + out + [irods]
6045

61-
62-
localrules:
63-
all,
64-
generateIRODS,
65-
dockingResultsTxt,
66-
removeDuplicateLigands,
67-
makeVenn,
68-
prepareLigands2,
69-
mergeDocking2,
70-
bestLigands,
71-
prepareSecondDocking,
72-
convertMol2,
73-
makeReceptorPDBQT,
74-
mergeDocking,
75-
mergeLocalInput,
76-
split,
77-
split2,
78-
targetProtein,
79-
getZINCdata,
80-
getZINCSubsets,
81-
gunzip,
82-
ENAMINEdownload,
83-
prepareReceptor,
84-
prepareDocking,
85-
prepareLibrary,
86-
prepareGeometry,
87-
makeHistogram,
88-
cleanLigands,
89-
90-
9146
targetList = [] # get ProteinIDs from configfile for rescreening
9247
for i in config["RESCREENING_TARGETS"]:
9348
targetList.append(i.split(",")[0])
@@ -102,7 +57,6 @@ combAll = "_".join(targetList) # combine all rescreening targets
10257

10358
def getAllVenn(wildcards):
10459
path.join(
105-
OUTPUT_DIR,
10660
"output",
10761
"rescreening",
10862
"{receptorID}",
@@ -114,7 +68,6 @@ def IRODSinput(wildcards):
11468
if config["RESCREENING"] == "TRUE":
11569
out = expand(
11670
path.join(
117-
OUTPUT_DIR,
11871
"results",
11972
"rescreening_{percentage}",
12073
"{receptorID}",
@@ -126,7 +79,7 @@ def IRODSinput(wildcards):
12679
)
12780
else:
12881
out = expand(
129-
path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"),
82+
path.join("results", "{receptorID}_{percentage}.csv"),
13083
receptorID=config["TARGETS"][0].split(",")[0],
13184
percentage=config["RESULT_NUMBER"],
13285
)
@@ -150,13 +103,14 @@ rule generateIRODS:
150103
input:
151104
IRODSinput,
152105
output:
153-
path.join(OUTPUT_DIR, "results", "irods.json"),
106+
path.join("results", "irods.json"),
154107
log:
155108
"logs/generateIRODS.log",
156109
script:
157110
"scripts/generateIRODS.py"
158111

159112

160-
include: "rules/analyse.smk"
161-
include: "rules/docking.smk"
162113
include: "rules/preparation.smk"
114+
include: "rules/docking.smk"
115+
include: "rules/analyse.smk"
116+

0 commit comments

Comments
 (0)