From eea4e8285407ec5324777064f8a0648fd58d95d3 Mon Sep 17 00:00:00 2001 From: ASLeonard Date: Thu, 31 Mar 2022 17:12:45 +0200 Subject: [PATCH 1/2] update pipeline for pruning --- pipeline/GoodPanGenomeGraph.snakefile | 41 ++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/pipeline/GoodPanGenomeGraph.snakefile b/pipeline/GoodPanGenomeGraph.snakefile index dba138f..8a38f57 100644 --- a/pipeline/GoodPanGenomeGraph.snakefile +++ b/pipeline/GoodPanGenomeGraph.snakefile @@ -284,11 +284,9 @@ done rule GenRawGenomeGraph: input: TRfa = expand(outdir + "{{genome}}.{hap}.tr.fasta", hap=haps), - ILbam = lambda wildcards: [bams[wildcards.genome]] if prune else [], mapping = outdir + "OrthoMap.v2.tsv", output: rawPBkmers = expand(outdir + "{{genome}}.rawPB.{kmerType}.kmers", kmerType=kmerTypes), - rawILkmers = [outdir + "{genome}.rawIL.tr.kmers"] if prune else [] resources: cores = 24 if prune else 1, mem = lambda wildcards, attempt: 55 + 20*(attempt-1) @@ -312,13 +310,36 @@ cd {params.od} module load gcc {params.sd}/bin/vntr2kmers_thread -g -m <(cut -f $(({params.hi}+1)),$(({params.hi}+2)) {input.mapping}) -k {params.ksize} -fs {params.FS} -ntr {params.FS} -o {wildcards.genome}.rawPB -fa 2 {input.TRfa} +""" -if [ {params.prune} == "1" ]; then +rule GenPrune: + input: + ILbam = lambda wildcards: [bams[wildcards.genome]] if prune else [], + serials = lambda wildcards: expand(outdir + "{genome}.rawPB.{binKmerType}", genome=wildcards.genome,binKmerType=binKmerTypes) + output: + rawILkmers = outdir + "{genome}.rawIL.tr.kmers" + resources: + cores=1 + params: + name = "GenRawGenomeGraph", + copts = copts, + sd = srcdir, + od = outdir, + ksize = ksize, + FS = FS, + cth = cth, + rth = rth, + rstring = rstring, + thcth = thcth, + hi = lambda wildcards: 2*genomes.index(wildcards.genome), + cores=1, + prune = int(prune) + shell: + ''' samtools fasta -@2 -n {input.ILbam} | {params.sd}/bin/bam2pe -fai /dev/stdin | {params.sd}/bin/danbing-tk -g {params.thcth} -k {params.ksize} -qs {params.od}/{wildcards.genome}.rawPB -fai /dev/stdin -o {wildcards.genome}.rawIL -p {resources.cores} -cth {params.cth} -rth {params.rth} -fi -""" + ''' rule EvalRawGenomeGraph: @@ -398,9 +419,15 @@ module load gcc {params.sd}/bin/genPanKmers -o pan -m - -k {params.kmerpref} """ +def get_filename(genome): + if genome == 'pan': + return 'pan' + else: + return f'{genome}.rawPB' + rule GenSerializedGraphAndIndex: input: - panKmers = expand(outdir + "pan.{kmerType}.kmers", kmerType=kmerTypes) + txtKmers = expand(outdir + "{{genome}}.{kmerType}.kmers", kmerType=kmerTypes) output: binKmers = expand(outdir + "pan.{binKmerType}", binKmerType=binKmerTypes) resources: @@ -411,7 +438,7 @@ rule GenSerializedGraphAndIndex: copts = copts, sd = srcdir, od = outdir, - pref = f"{outdir}/pan" + pref = f"{outdir}/{{wildcards.genome}}" shell:""" cd {params.od} ulimit -c 20000 From 4a3026b1a4886eb9d50e9e155a09c13ea904eb1a Mon Sep 17 00:00:00 2001 From: Alex Leonard Date: Fri, 1 Apr 2022 10:17:48 +0200 Subject: [PATCH 2/2] Fixing missed pan references --- pipeline/GoodPanGenomeGraph.snakefile | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pipeline/GoodPanGenomeGraph.snakefile b/pipeline/GoodPanGenomeGraph.snakefile index 8a38f57..09270b3 100644 --- a/pipeline/GoodPanGenomeGraph.snakefile +++ b/pipeline/GoodPanGenomeGraph.snakefile @@ -312,7 +312,7 @@ module load gcc {params.sd}/bin/vntr2kmers_thread -g -m <(cut -f $(({params.hi}+1)),$(({params.hi}+2)) {input.mapping}) -k {params.ksize} -fs {params.FS} -ntr {params.FS} -o {wildcards.genome}.rawPB -fa 2 {input.TRfa} """ -rule GenPrune: +rule GeneratePruningAlignments: input: ILbam = lambda wildcards: [bams[wildcards.genome]] if prune else [], serials = lambda wildcards: expand(outdir + "{genome}.rawPB.{binKmerType}", genome=wildcards.genome,binKmerType=binKmerTypes) @@ -331,11 +331,10 @@ rule GenPrune: rth = rth, rstring = rstring, thcth = thcth, - hi = lambda wildcards: 2*genomes.index(wildcards.genome), - cores=1, - prune = int(prune) + hi = lambda wildcards: 2*genomes.index(wildcards.genome) shell: ''' + cd {params.od} samtools fasta -@2 -n {input.ILbam} | {params.sd}/bin/bam2pe -fai /dev/stdin | {params.sd}/bin/danbing-tk -g {params.thcth} -k {params.ksize} -qs {params.od}/{wildcards.genome}.rawPB -fai /dev/stdin -o {wildcards.genome}.rawIL -p {resources.cores} -cth {params.cth} -rth {params.rth} @@ -429,7 +428,7 @@ rule GenSerializedGraphAndIndex: input: txtKmers = expand(outdir + "{{genome}}.{kmerType}.kmers", kmerType=kmerTypes) output: - binKmers = expand(outdir + "pan.{binKmerType}", binKmerType=binKmerTypes) + binKmers = expand(outdir + "{{genome}}.{binKmerType}", binKmerType=binKmerTypes) resources: cores = 2, mem = lambda wildcards, attempt: 90+20*(attempt-1) @@ -445,6 +444,6 @@ ulimit -c 20000 module load gcc {params.sd}/bin/ktools serialize {params.pref} -{params.sd}/bin/ktools ksi pan.tr.kmers >{params.pref}.tr.ksi +{params.sd}/bin/ktools ksi {wildcards.genome}.tr.kmers >{params.pref}.tr.ksi """