From afacd9ce7b45bf89bd568410d0e8b207cffa585b Mon Sep 17 00:00:00 2001 From: Barbara Novak <19824106+bnovak32@users.noreply.github.com> Date: Fri, 13 Feb 2026 13:50:36 -0800 Subject: [PATCH 1/9] Updates to Nextflow code - added generate_summary script and corresponding compile_summary process in order to correctly sort summary file in same order as input sample ID file - removed sample tmp files as a published output - migrated from publishDir to workflow.output for most processes - updated kraken2 DB build to include other options based on provided input - added nextflow.schema for input validation and help generation - update kraken2-build to use k2 wrapper throughout - started updating protocol text generation to allow passing protocol text from existing kraken2 DB folder - fixed kraken2 output file gzip and rename procedure --- .../workflow_code/bin/generate_protocol.sh | 0 .../workflow_code/bin/generate_summary.sh | 15 +++ .../workflow_code/envs/kraken2.yaml | 1 - .../workflow_code/main.nf | 102 +++++++++++------- .../modules/generate_protocol.nf | 11 +- .../workflow_code/modules/kraken2.nf | 52 +++++---- .../workflow_code/modules/kraken2_db.nf | 70 +++++++++--- .../workflow_code/modules/summary.nf | 24 ++++- .../workflow_code/modules/utils.nf | 1 - .../workflow_code/nextflow.config | 44 ++++++-- .../workflow_code/nextflow_schema.json | 75 +++++++++++++ 11 files changed, 303 insertions(+), 92 deletions(-) mode change 100644 => 100755 Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_protocol.sh create mode 100755 Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_summary.sh create mode 100644 Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_protocol.sh 
b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_protocol.sh old mode 100644 new mode 100755 diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_summary.sh b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_summary.sh new file mode 100755 index 000000000..96cae5086 --- /dev/null +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_summary.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +sample_IDs_file=${1} +host=${2} +input_dir=${3} +output_file=${4} + +# starting output file +printf "Sample_ID\tTotal_fragments_before\tTotal_fragments_after\tPercent_${host}_reads_removed\n" > ${output_file} + +# looping through all input files and generating columns for final table +for sample in $(cat ${sample_IDs_file}) +do + cat ${input_dir}/${sample}-removal-info.tmp >> ${output_file} +done \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/envs/kraken2.yaml b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/envs/kraken2.yaml index 433122920..2d04cb8f5 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/envs/kraken2.yaml +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/envs/kraken2.yaml @@ -1,6 +1,5 @@ channels: - conda-forge - bioconda - - defaults dependencies: - kraken2=2.1.6 diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf index da11473ff..85501a16f 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf +++ 
b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf @@ -1,31 +1,29 @@ // main.nf nextflow.enable.dsl=2 -// Terminal text color definitions -c_back_bright_red = "\u001b[41;1m"; -c_reset = "\033[0m"; +include { paramsHelp } from 'plugin/nf-schema' +include { validateParameters } from 'plugin/nf-schema' -include { KRAKEN2_DB } from './modules/kraken2_db.nf' -include { KRAKEN_2 } from './modules/kraken2.nf' -include { SUMMARY } from './modules/summary.nf' +include { KRAKEN2_DB } from './modules/kraken2_db.nf' +include { KRAKEN_2 } from './modules/kraken2.nf' +include { SUMMARY } from './modules/summary.nf' +include { COMPILE_SUMMARY } from './modules/summary.nf' -include { SOFTWARE_VERSIONS } from './modules/utils.nf' -include { GENERATE_PROTOCOL } from './modules/generate_protocol.nf' +include { SOFTWARE_VERSIONS } from './modules/utils.nf' +include { GENERATE_PROTOCOL } from './modules/generate_protocol.nf' workflow { - if (!params.ref_dbs_Dir){ - error("""${c_back_bright_red}INPUT ERROR! - Please supply the path to the directory storing kraken2 reference databases - by passing --ref_dbs_Dir. 
- ${c_reset}""") - } + main: + + // check input parameters + validateParameters() // Capture software versions - software_versions_ch = Channel.empty() + software_versions_ch = channel.empty() // Get host info - host_info = Channel + host_info = channel .fromPath(params.hosts_table) .splitCsv(header:true) .filter { row -> row.name.toLowerCase() == params.host.toLowerCase() } // match host @@ -38,19 +36,18 @@ workflow { // Check if kraken2 database already exists or needs to be built def host_id = params.host.replaceAll(' ', '_').toLowerCase() - def host_db = file("${params.ref_dbs_Dir}/kraken2-${host_id}-db") + def host_db = file("${params.ref_dbs_dir}/kraken2-${host_id}-db") def db_exists = host_db.exists() - if (db_exists) { - database_ch = Channel.value(host_db) - } + if (db_exists) + database_ch = channel.value(host_db) else { - build_ch = host_info.map { name, hostID, species, refseq, genome, fasta -> tuple(name, hostID, fasta) } - KRAKEN2_DB(build_ch) + build_ch = host_info.map { name, hostID, species, refseq, genome, fasta -> tuple(name, host_id, fasta) } + KRAKEN2_DB(build_ch, params.ref_dbs_dir) database_ch = KRAKEN2_DB.out.first() } - Channel + channel .fromPath(params.sample_id_list) .splitText() .map { it.trim() } @@ -66,31 +63,19 @@ workflow { } .set {generated_reads_ch} - KRAKEN_2(database_ch, generated_reads_ch) + KRAKEN_2(database_ch, generated_reads_ch, params.out_suffix) KRAKEN_2.out.version | mix(software_versions_ch) | set{software_versions_ch} - // Generate summary and compile one file + // Generate summary and compile into one file SUMMARY(KRAKEN_2.out.output, KRAKEN_2.out.report) - SUMMARY.out - .collect() - .subscribe { summary_files -> - def outfile = file("${params.outdir}/results/Host-read-removal-summary.tsv") - def header = "Sample_ID\tTotal_fragments_before\tTotal_fragments_after\tPercent_host_reads_removed\n" - outfile.text = header + summary_files.collect { it.text }.join() - - // summary.tmp cleanup - summary_files.each { f -> - def 
tmpFile = f.toFile() - tmpFile.delete() - } - } + COMPILE_SUMMARY(SUMMARY.out.collect(), channel.fromPath(params.sample_id_list), params.host) // Software Version Capturing - combining all captured software versions nf_version = "Nextflow Version ".concat("${nextflow.version}") - nextflow_version_ch = Channel.value(nf_version) + nextflow_version_ch = channel.value(nf_version) // Write software versions to file - software_versions_ch | map { it.text.strip() } + software_versions_ch | map { it -> it.text.strip() } | unique | mix(nextflow_version_ch) | collectFile({it -> it}, newLine: true, cache: false) @@ -99,6 +84,41 @@ workflow { // Protocol always needs name, refseq ID, and genome build protocol_ch = host_info.map { name, hostID, species, refseq, genome, fasta -> tuple(name, refseq, genome) } - GENERATE_PROTOCOL(protocol_ch, SOFTWARE_VERSIONS.out) + def protocol = host_db.resolve('read-removal-protocol-text.txt') + protocol_out = GENERATE_PROTOCOL(protocol_ch, SOFTWARE_VERSIONS.out, channel.value(protocol)) + + publish: + protocol_out = protocol_out + software_versions = SOFTWARE_VERSIONS.out + fastq_out = KRAKEN_2.out.host_removed + kraken2_out = KRAKEN_2.out.output + kraken2_report = KRAKEN_2.out.report + summary_stats = COMPILE_SUMMARY.out.summary_file + +} + +output { + protocol_out { + path "processing_info" + } + software_versions { + path "processing_info" + } + + fastq_out { + path "${params.reads_outdir}" + } + + kraken2_out { + path "results/kraken2-output" + } + + kraken2_report { + path "results/kraken2-output" + } + + summary_stats { + path "results" + } } \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf index 090043e92..23273bf00 100644 --- 
a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf @@ -1,17 +1,22 @@ process GENERATE_PROTOCOL { beforeScript "chmod +x ${projectDir}/bin/*" - tag "Generating your analysis protocol..." - publishDir "${params.outdir}/processing_info" + tag "Generating analysis protocol text..." input: tuple val(host), val(refSeq_ID), val(genome) - path(software_versions) + path software_versions + path protocol output: path("protocol.txt") script: + if (protocol.exists()) + """ + cp ${protocol} protocol.txt + """ + else """ generate_protocol.sh ${software_versions} ${host} "${refSeq_ID}" ${genome} > protocol.txt """ diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2.nf index eb13355dc..1585c6d5f 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2.nf @@ -1,38 +1,50 @@ process KRAKEN_2 { tag "${meta.id}" - publishDir "${params.reads_outdir}", pattern: '*.fastq.gz' - publishDir "${params.outdir}/results/kraken2-output", pattern: '*.{txt,tsv}' input: path database tuple val(meta), path(reads) + val out_suffix output: - path "${meta.id}*_HRremoved_*.gz", emit: host_removed - path "${meta.id}-kraken2-output.txt", emit: output - path "${meta.id}-kraken2-report.tsv", emit: report + path "${meta.id}*${out_suffix}.fastq.gz", emit: host_removed + path("${meta.id}-kraken2-output.txt"), emit: output + path("${meta.id}-kraken2-report.tsv"), emit: report path("versions.txt"), emit: version script: - def pe_flag = meta.paired_end ? 
"--paired" : "" - def input = meta.paired_end ? "${reads[0]} ${reads[1]}" : "${reads}" - def output = meta.paired_end ? "${meta.id}_R#.fastq" : "${meta.id}.fastq" - """ - kraken2 --db $database --gzip-compressed \ - --threads ${task.cpus} --use-names ${pe_flag} \ + if (meta.paired_end) + """ + kraken2 --db $database --gzip-compressed \ + --threads ${task.cpus} --use-names --paired \ --output ${meta.id}-kraken2-output.txt \ --report ${meta.id}-kraken2-report.tsv \ - --unclassified-out ${output} \ - ${input} + --unclassified-out ${meta.id}_R#.fastq \ + ${reads[0]} ${reads[1]} - # Compress intermediate FASTQ files - gzip ${input} + # Rename and compress files to final output names + mv "${meta.id}_R_1.fastq" "${meta.id}_R1${out_suffix}.fastq" && \ + gzip ${meta.id}_R1${out_suffix}.fastq + + mv "${meta.id}_R_2.fastq" "${meta.id}_R2${out_suffix}.fastq" && \ + gzip ${meta.id}_R2${out_suffix}.fastq - # Rename compressed files to final output names - mv ${meta.id}_R_1.fastq ${meta.id}${params.R1_out_suffix} - mv ${meta.id}_R_2.fastq ${meta.id}${params.R2_out_suffix} + echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt + """ + else + """ + kraken2 --db $database --gzip-compressed \ + --threads ${task.cpus} --use-names \ + --output ${meta.id}-kraken2-output.txt \ + --report ${meta.id}-kraken2-report.tsv \ + --unclassified-out ${meta.id}.fastq \ + ${reads} - echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt - """ + # Rename and compress files to final output names + test -f "${meta.id}.fastq" && mv "${meta.id}.fastq" "${meta.id}${out_suffix}.fastq" && \ + gzip ${meta.id}${out_suffix}.fastq + + echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt + """ } \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2_db.nf 
b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2_db.nf index d97e3f4fa..6033893d8 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2_db.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2_db.nf @@ -1,26 +1,70 @@ process KRAKEN2_DB { - tag "Downloading host reads database to ${params.ref_dbs_Dir}" - publishDir "${params.ref_dbs_Dir}", mode: 'copy' + tag "Creating host reads database in ${ref_dbs_dir}" + publishDir "${ref_dbs_dir}", mode: 'copy' input: - tuple val(host), val(host_id), val(fasta_url) + tuple val(host_name), val(host_url) , path(host_fasta) + path ref_dbs_dir output: - path "kraken2-${host_id}-db/" + path("kraken2-${host_name}-db/"), emit: krakendb_dir + path("versions.txt"), emit: version script: - """ - k2 download-taxonomy --db kraken2-${host_id}-db/ + if (host_url != null) + """ + echo "Downloading and unpacking database from ${host_url} + wget -O ${host_name}.tar.gz --timeout=3600 --tries=0 --continue ${host_url} - # Download FASTA file and uncompress it - wget -q ${fasta_url} -O host_assembly.fasta.gz - gunzip -c host_assembly.fasta.gz > host_assembly.fasta + mkdir kraken2-${host_name}-db/ && tar -zxvf -C kraken2-${host_name}-db/ - kraken2-build --add-to-library host_assembly.fasta --db kraken2-${host_id}-db/ --threads ${task.cpus} --no-masking + # Cleaning up + [ -f ${host_name}.tar.gz ] && rm -rf ${host_name}.tar.gz - kraken2-build --build --db kraken2-${host_id}-db/ --threads ${task.cpus} + echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt + """ + else if (host_fasta != null) + """ + echo "Attempting to build a custom ${host_name} reference database from ${host_fasta}" - kraken2-build --clean --db kraken2-${host_id}-db/ + # install taxonomy + k2 download-taxonomy --db kraken2-${host_name}-db/ + + # add sequence to database's 
genomic library + k2 add-to-library --db kraken2-${host_name}-db/ --threads ${task.cpus} \ + --files ${host_fasta} --no-masking + + # build the kraken2 database + k2 build --db kraken2-${host_name}-db/ --threads ${task.cpus} \ + --kmer-len 35 --minimizer-len 31 + + # remove intermediate files + k2 clean --db kraken2-${host_name}-db/ + + echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt + """ + else if (host_name != null) + """ + echo "Download and build kraken reference for named host: ${host_name}" + + # download genomic sequences + k2 download-library --db kraken2-${host_name}-db/ --threads ${task.cpus} \ + --library ${host_name} --no-masking + + # install taxonomy + k2 download-taxonomy --db kraken2-${host_name}-db/ + + # build the kraken2 database + k2 build --db kraken2-${host_name}-db/ --threads ${task.cpus} \ + --kmer-len 35 --minimizer-len 31 + + # remove intermediate files + k2 clean --db kraken2-${host_name}-db/ - """ + echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt + """ + else + error "Input error, host_name, host_url, and host_fasta are all set to null. 
Please supply at least one valid parameter for database creation" + + } \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/summary.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/summary.nf index 749c273eb..53f7c2540 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/summary.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/summary.nf @@ -1,14 +1,13 @@ process SUMMARY { tag "${kraken_output.simpleName.replaceFirst(/-kraken2-output$/, '')}" - publishDir "${params.outdir}/results/kraken2-output" input: path kraken_output path kraken_report output: - path "*-removal-info.tmp" + path "*-removal-info.tmp", emit: sample_stats script: """ @@ -18,6 +17,25 @@ process SUMMARY { fragments_retained=\$(grep -w -m 1 "unclassified" $kraken_report | cut -f 2) perc_removed=\$(printf "%.2f\\n" \$(echo "scale=4; 100 - \$fragments_retained / \$total_fragments * 100" | bc -l)) - echo -e "\$meta_id\\t\$total_fragments\\t\$fragments_retained\\t\$perc_removed\\n" > \$meta_id-removal-info.tmp + echo -e "\$meta_id\\t\$total_fragments\\t\$fragments_retained\\t\$perc_removed" > \$meta_id-removal-info.tmp + """ +} + +process COMPILE_SUMMARY { + + tag "Generating summary statistics..." 
+ beforeScript "chmod +x ${projectDir}/bin/*" + + input: + path summary_tmp_files + path sample_IDs_file + val host + + output: + path "${host}-read-removal-summary.tsv", emit: summary_file + + script: + """ + generate_summary.sh ${sample_IDs_file} ${host} ./ ${host}-read-removal-summary.tsv """ } \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/utils.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/utils.nf index 1265c05e5..816825d72 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/utils.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/utils.nf @@ -1,7 +1,6 @@ process SOFTWARE_VERSIONS { tag "Writing out software versions..." - publishDir "${params.outdir}/processing_info" input: path(software_versions) diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow.config b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow.config index 4ffca71d8..3ffd21694 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow.config +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow.config @@ -1,3 +1,16 @@ +/* +---------------------------------------------------------------------------------------- + GeneLab Data Processing Remove Host Reads Workflow Nextflow config file +---------------------------------------------------------------------------------------- + Default config options for all compute environments +---------------------------------------------------------------------------------------- +*/ + +// Plugins +plugins { + id 'nf-schema@2.6.1' +} + params { is_single = false // Boolean to set if the reads are 
single-end sample_id_list = null // Path to Sample_ID list @@ -9,13 +22,13 @@ params { host = "human" hosts_table = "$projectDir/assets/hosts.csv" - ref_dbs_Dir = null // Path to kraken2 database (or where it will be downloaded to if it's not set up yet), required + ref_dbs_dir = null // Path to kraken2 database (or where it will be downloaded to if it's not set up yet), required - outdir = "${launchDir}" - reads_outdir = "${params.outdir}/${params.host}-removed-reads" // Output directory to hold -removed reads - R1_out_suffix = "_R1_HRremoved_raw.fastq.gz" - R2_out_suffix = "_R2_HRremoved_raw.fastq.gz" - single_out_suffix = "_HRremoved_raw.fastq.gz" + reads_outdir = "${params.host}-removed-reads" // Output directory to hold -removed reads + out_suffix = "_HRrm" + + publish_dir_mode = 'link' // Published outputs may be symlinks if using containerized environments + trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') } profiles { @@ -88,16 +101,27 @@ process { } } -def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +workflow { + output { + mode = params.publish_dir_mode + } +} + timeline { enabled = true - file = "${params.outdir}/processing_info/execution_timeline_${trace_timestamp}.html" + file = "nextflow_info/execution_timeline_${params.trace_timestamp}.html" } report { enabled = true - file = "${params.outdir}/processing_info/execution_report_${trace_timestamp}.html" + file = "nextflow_info/execution_report_${params.trace_timestamp}.html" } trace { enabled = true - file = "${params.outdir}/processing_info/execution_trace_${trace_timestamp}.txt" + file = "nextflow_info/execution_trace_${params.trace_timestamp}.txt" +} + +validation { + help { + enabled = true + } } \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json 
b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json new file mode 100644 index 000000000..190627e5f --- /dev/null +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json @@ -0,0 +1,75 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com///nextflow_schema.json", + "title": "NASA GeneLab Data Processing Host Read Removal workflow parameters", + "description": "GeneLab workflow for removing human reads in metagenomics sequencing data, as described in GL-DPPD-7105-B", + "type": "object", + "properties": { + "is_single": { + "type": "boolean", + "default": false, + "description": "Are the reads single-ended?" + }, + "sample_id_list": { + "type": "string", + "description": "list of unique sample IDs that identifies the files to process" + }, + "reads_dir": { + "type": "string", + "description": "Path to input reads" + }, + "R1_suffix": { + "type": "string", + "default": "_R1_raw.fastq.gz", + "description": "file suffix for mate1 reads" + }, + "R2_suffix": { + "type": "string", + "default": "_R2_raw.fastq.gz", + "description": "file suffix for mate2 reads" + }, + "single_suffix": { + "type": "string", + "default": "_raw.fastq.gz", + "description": "file suffix for single-ended reads" + }, + "host": { + "type": "string", + "default": "human", + "description": "simple name of host organism" + }, + "hosts_table": { + "type": "string", + "default": "${projectDir}/assets/hosts.csv", + "description": "comma-separated table of host information", + "hidden": true + }, + "ref_dbs_dir": { + "type": "string", + "description": "path to kraken2 databases" + }, + "reads_outdir": { + "type": "string", + "default": "${host}-removed-reads", + "description": "output folder for host-removed reads" + }, + "out_suffix": { + "type": "string", + "default": "_HRrm", + "description": "file suffix for all output read 
files" + }, + "publish_dir_mode": { + "type": "string", + "default": "link", + "description": "Nextflow workflow publish mode", + "enum": ["copy", "copyNoFollow", "link", "move", "relink", "symlink"], + "hidden": true + }, + "trace_timestamp": { + "type": "string", + "default": "2026-02-13_09-22-42", + "hidden": true + } + }, + "required": ["ref_dbs_dir", "sample_id_list", "reads_dir"] +} From 7d041bbf60fe939b2f9d955163f7f012305fde3d Mon Sep 17 00:00:00 2001 From: Jihan Yehia Date: Wed, 4 Mar 2026 11:29:53 -0800 Subject: [PATCH 2/9] Implement protocol generation and update host reference handling in Nextflow workflow --- .../workflow_code/assets/hosts.csv | 2 +- .../workflow_code/bin/generate_protocol.py | 165 ++++++++++++++++++ .../workflow_code/bin/generate_protocol.sh | 11 -- .../workflow_code/envs/dp_tools.yaml | 10 ++ .../workflow_code/main.nf | 76 ++++---- .../modules/generate_protocol.nf | 31 +++- .../workflow_code/modules/kraken2_db.nf | 58 +++--- .../modules/parse_hosts_table.nf | 51 ++++++ .../{utils.nf => software_versions.nf} | 0 .../workflow_code/nextflow.config | 20 ++- .../workflow_code/nextflow_schema.json | 16 +- 11 files changed, 368 insertions(+), 72 deletions(-) create mode 100644 Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_protocol.py delete mode 100755 Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_protocol.sh create mode 100644 Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/envs/dp_tools.yaml create mode 100644 Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/parse_hosts_table.nf rename Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/{utils.nf => software_versions.nf} (100%) diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/assets/hosts.csv 
b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/assets/hosts.csv index b0bf24d35..979befee2 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/assets/hosts.csv +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/assets/hosts.csv @@ -1,5 +1,5 @@ name,species,refseq,genome,fasta -human,Homo sapiens,GCF_000001405.39,GRCh38.p13,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.fna.gz +human,Homo sapiens mouse,Mus musculus,GCF_000001635.27,GRCm39,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.27_GRCm39/GCF_000001635.27_GRCm39_genomic.fna.gz arabidopsis,Arabidopsis thaliana,GCF_000001735.4,TAIR10,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.4_TAIR10.1/GCF_000001735.4_TAIR10.1_genomic.fna.gz lettuce,Lactuca sativa,GCF_002870075.4,Lsat_Salinas_v11,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/870/075/GCF_002870075.4_Lsat_Salinas_v11/GCF_002870075.4_Lsat_Salinas_v11_genomic.fna.gz diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_protocol.py b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_protocol.py new file mode 100644 index 000000000..f47003b9e --- /dev/null +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_protocol.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python +""" +This script generates a protocol text file for GeneLab Remove-Host-Reads workflow. 
+It reads software versions from a text file generated upstream, +and assembly info from a manifest.txt file generated by k2 download-library (if available in the database directory), +or from assembly name and accession parameters passed in from Nextflow (resolved upstream from workflow params or hosts.csv). +If neither source is available, assembly info defaults to 'unknown'. +""" + +import argparse +import os +import re + + +# --------------------------------------------------------------------------- +# Hardcoded Kraken2 build parameters +# --------------------------------------------------------------------------- +KMER_LEN = 35 +MINIMIZER_LEN = 31 +NO_MASKING = True + + +# --------------------------------------------------------------------------- +# Kraken2 version from software_versions.txt +# --------------------------------------------------------------------------- + +def parse_kraken2_version(software_versions_path): + """ + Parse Kraken2 version from software_versions.txt. + Expects a line like: Kraken2 2.1.6 + """ + if not software_versions_path or not os.path.exists(software_versions_path): + print(f"[WARNING] software_versions.txt not found at '{software_versions_path}'. " + "Version will be reported as 'unknown'.") + return "unknown" + + with open(software_versions_path, "r") as f: + for line in f: + match = re.search(r"[Kk]raken2?\s+v?([\d.]+)", line) + if match: + return match.group(1) + + print("[WARNING] Could not find Kraken2 version in software_versions.txt.") + return "unknown" + + +# --------------------------------------------------------------------------- +# Assembly from manifest.txt +# --------------------------------------------------------------------------- + +def assemblies_from_manifest(manifest_path): + """ + Parse manifest.txt to extract accession(s) and assembly name(s). 
+ + Each line looks like: + genomes/all/GCF/009/914/755/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna.gz + """ + assemblies = [] + with open(manifest_path, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + filename = os.path.basename(line) + match = re.match(r"(GCF_\d+\.\d+)_(.+?)_genomic\.fna\.gz", filename) + if match: + assemblies.append({ + "accession": match.group(1), + "assembly_name": match.group(2) + }) + else: + print(f"[WARNING] Could not parse manifest line: {line}") + + return assemblies if assemblies else None + + +# --------------------------------------------------------------------------- +# Protocol generation +# --------------------------------------------------------------------------- + +def format_assembly_list(assemblies): + """ + Format assembly list for protocol text. + e.g. GRCh38.p14 (GCF_000001405.40) and T2T-CHM13v2.0 (GCF_009914755.1) + """ + parts = [f"{a['assembly_name']} ({a['accession']})" for a in assemblies] + if len(parts) == 1: + return parts[0] + elif len(parts) == 2: + return f"{parts[0]} and {parts[1]}" + else: + return ", ".join(parts[:-1]) + f", and {parts[-1]}" + + +def generate_protocol(assemblies, host, kraken2_version): + """Generate the protocol statement with hardcoded build parameters.""" + assembly_str = format_assembly_list(assemblies) + masking_str = ", with --no-masking" if NO_MASKING else "" + + return ( + f"FASTQ files were screened using Kraken2 v{kraken2_version} against a {host} genome reference database " + f"that was constructed from NCBI's RefSeq {assembly_str}{masking_str}, " + f"and defaults of kmer-length {KMER_LEN} and minimizer-length of {MINIMIZER_LEN}, " + f"and is fully compatible with Kraken2 v{kraken2_version}. " + f"Reads classified as host were removed after being quantified and expressed as a percentage of the total." 
+ ) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser( + description="Generate a Kraken2 host depletion protocol statement." + ) + parser.add_argument("--db-dir", required=True, + help="Path to kraken2--db/ directory") + parser.add_argument("--software-versions", required=True, + help="Path to software_versions.txt") + parser.add_argument("--host", required=True, + help="Host organism of interest") + parser.add_argument("--assembly-name", default="", + help="Assembly name resolved upstream in Nextflow (e.g. GRCm39)") + parser.add_argument("--assembly-acc", default="", + help="Assembly accession resolved upstream in Nextflow (e.g. GCF_000001635.27)") + parser.add_argument("--output", default="protocol.txt", + help="Output file path (default: protocol.txt)") + + args = parser.parse_args() + + # Parse Kraken2 version + kraken2_version = parse_kraken2_version(args.software_versions) + print(f"[INFO] Kraken2 version: {kraken2_version}") + + # Resolve assembly info + assemblies = None + + manifest_path = os.path.join(args.db_dir, "manifest.txt") + + if os.path.exists(manifest_path): + # Priority 1: manifest.txt in db_dir + assemblies = assemblies_from_manifest(manifest_path) + elif args.assembly_name and args.assembly_acc: + # Priority 2: assembly name + accession passed in from Nextflow + assemblies = [{"accession": args.assembly_acc, "assembly_name": args.assembly_name}] + else: + # Priority 3: fallback to unknown + print("[WARNING] Could not resolve assembly info from manifest.txt or params. 
" + "Defaulting to 'unknown'.") + assemblies = [{"accession": "unknown", "assembly_name": "unknown"}] + + + # Generate and save protocol + protocol = generate_protocol(assemblies, args.host, kraken2_version) + + with open(args.output, "w") as f: + f.write(protocol + "\n") + + print(f"\n[PROTOCOL]\n{protocol}") + print(f"\n[INFO] Protocol saved to: {args.output}") + + +if __name__ == "__main__": + main() diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_protocol.sh b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_protocol.sh deleted file mode 100755 index e11cdc071..000000000 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_protocol.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash - -KRAKEN2=$(grep -i 'kraken2' $1 | sort -u |awk '{print $2}' |sed -E 's/v//') - -HOST=$2 # ex: mouse -REFSEQ_ID=$3 -GENOME=$4 - -PROTOCOL="FASTQ files were screened using Kraken2 v"$KRAKEN2" against a "$HOST" genome reference database that was constructed from NCBI's RefSeq ("$REFSEQ_ID") "$GENOME", with --no-masking, and defaults of kmer-length 35 and minimizer-length of 31, and is fully compatible with Kraken2 v"$KRAKEN2". Reads classified as host were removed after being quantified and expressed as a percentage of total reads." 
- -echo ${PROTOCOL} \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/envs/dp_tools.yaml b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/envs/dp_tools.yaml new file mode 100644 index 000000000..4062f6e83 --- /dev/null +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/envs/dp_tools.yaml @@ -0,0 +1,10 @@ +name: dp_tools +channels: + - conda-forge +dependencies: + - setuptools + - zip=3.0 + - unzip=6.0 + - git=2.45.2 + - pip: + - git+https://github.com/torres-alexis/dp_tools.git@1.3.8 \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf index 85501a16f..0fa36d463 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf @@ -4,12 +4,14 @@ nextflow.enable.dsl=2 include { paramsHelp } from 'plugin/nf-schema' include { validateParameters } from 'plugin/nf-schema' +include { PARSE_HOSTS_TABLE } from './modules/parse_hosts_table.nf' + include { KRAKEN2_DB } from './modules/kraken2_db.nf' include { KRAKEN_2 } from './modules/kraken2.nf' include { SUMMARY } from './modules/summary.nf' include { COMPILE_SUMMARY } from './modules/summary.nf' -include { SOFTWARE_VERSIONS } from './modules/utils.nf' +include { SOFTWARE_VERSIONS } from './modules/software_versions.nf' include { GENERATE_PROTOCOL } from './modules/generate_protocol.nf' workflow { @@ -22,31 +24,40 @@ workflow { // Capture software versions software_versions_ch = channel.empty() - // Get host info - host_info = channel - .fromPath(params.hosts_table) - .splitCsv(header:true) - .filter { row -> row.name.toLowerCase() == 
params.host.toLowerCase() } // match host - .map { row -> - def host_id = row.name.replaceAll(' ', '_').toLowerCase() - tuple(row.name, host_id, row.species, row.refseq, row.genome, row.fasta) } - - host_info - .ifEmpty { error("INPUT ERROR: Host '${params.host}' not found in hosts table '${params.hosts_table}'") } - - // Check if kraken2 database already exists or needs to be built def host_id = params.host.replaceAll(' ', '_').toLowerCase() def host_db = file("${params.ref_dbs_dir}/kraken2-${host_id}-db") - def db_exists = host_db.exists() - - if (db_exists) - database_ch = channel.value(host_db) - else { - build_ch = host_info.map { name, hostID, species, refseq, genome, fasta -> tuple(name, host_id, fasta) } - KRAKEN2_DB(build_ch, params.ref_dbs_dir) - database_ch = KRAKEN2_DB.out.first() + + if (host_db.exists()) { + // Database already exists in storeDir - assembly name and accession might be used by GENERATE_PROTOCOL (if manifest.txt doesn't exist in db folder) + reference_fasta = channel.value("") + reference_genome = params.assembly_name ? channel.value(params.assembly_name) : channel.value("") + reference_accession = params.assembly_acc ? channel.value(params.assembly_acc) : channel.value("") + + } else if ( params.db_url ) { + // Option 1: pre-built DB - only assembly_name and assembly_acc needed + reference_fasta = channel.value([]) + reference_genome = params.assembly_name ? channel.value(params.assembly_name) : channel.value("unknown") + reference_accession = params.assembly_acc ? channel.value(params.assembly_acc) : channel.value("unknown") + + } else if ( params.ref_fasta ) { + // Option 2: custom FASTA - assembly_name and assembly_acc should also be set + reference_fasta = channel.value(params.ref_fasta) + reference_genome = params.assembly_name ? channel.value(params.assembly_name) : channel.value("unknown") + reference_accession = params.assembly_acc ?
channel.value(params.assembly_acc) : channel.value("unknown") + + } else { + // Option 3: parse hosts.csv + PARSE_HOSTS_TABLE(params.hosts_table, host_id) + reference_fasta = PARSE_HOSTS_TABLE.out.reference_fasta_url + reference_genome = PARSE_HOSTS_TABLE.out.reference_genome + reference_accession = PARSE_HOSTS_TABLE.out.reference_accession + // Note: for hosts like human where genome/accession are not in hosts.csv, + // PARSE_HOSTS_TABLE will emit null - protocol will fall back to manifest.txt } - + + KRAKEN2_DB(host_id, reference_fasta) + database_ch = KRAKEN2_DB.out.build.first() + channel .fromPath(params.sample_id_list) .splitText() @@ -64,28 +75,33 @@ workflow { .set {generated_reads_ch} KRAKEN_2(database_ch, generated_reads_ch, params.out_suffix) - KRAKEN_2.out.version | mix(software_versions_ch) | set{software_versions_ch} // Generate summary and compile into one file SUMMARY(KRAKEN_2.out.output, KRAKEN_2.out.report) COMPILE_SUMMARY(SUMMARY.out.collect(), channel.fromPath(params.sample_id_list), params.host) // Software Version Capturing - combining all captured software versions + KRAKEN_2.out.version | mix(KRAKEN2_DB.out.version.ifEmpty("")) + | mix(software_versions_ch) + | set{software_versions_ch} + nf_version = "Nextflow Version ".concat("${nextflow.version}") nextflow_version_ch = channel.value(nf_version) // Write software versions to file - software_versions_ch | map { it -> it.text.strip() } + software_versions_ch | filter { it } + | map { it -> it.text.strip() } | unique | mix(nextflow_version_ch) | collectFile({it -> it}, newLine: true, cache: false) | SOFTWARE_VERSIONS - - // Protocol always needs name, refseq ID, and genome build - protocol_ch = host_info.map { name, hostID, species, refseq, genome, fasta -> tuple(name, refseq, genome) } - def protocol = host_db.resolve('read-removal-protocol-text.txt') - protocol_out = GENERATE_PROTOCOL(protocol_ch, SOFTWARE_VERSIONS.out, channel.value(protocol)) + protocol_out = GENERATE_PROTOCOL( + 
params.host, + reference_genome, + reference_accession, + SOFTWARE_VERSIONS.out, + database_ch) publish: protocol_out = protocol_out diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf index 23273bf00..abeb97b48 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf @@ -1,10 +1,10 @@ -process GENERATE_PROTOCOL { +/*process GENERATE_PROTOCOL { beforeScript "chmod +x ${projectDir}/bin/*" tag "Generating analysis protocol text..." input: - tuple val(host), val(refSeq_ID), val(genome) + tuple val(host), val(genome), val(assembly_acc) path software_versions path protocol @@ -18,6 +18,31 @@ process GENERATE_PROTOCOL { """ else """ - generate_protocol.sh ${software_versions} ${host} "${refSeq_ID}" ${genome} > protocol.txt + generate_protocol.sh ${software_versions} ${host} "${assembly_acc}" ${genome} > protocol.txt """ +}*/ + +process GENERATE_PROTOCOL { + tag "Generating analysis protocol text for ${host}" + label 'python' + + input: + val(host) + val(genome) + val(assembly_acc) + path(software_versions) + path(db_dir) + + output: + path("protocol.txt"), emit: protocol + + script: + """ + generate_protocol.py \\ + --host ${host} \\ + --db-dir ${db_dir} \\ + --assembly-name "${genome}" \\ + --assembly-acc "${assembly_acc}" \\ + --software-versions ${software_versions} + """ } \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2_db.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2_db.nf index 6033893d8..fe0a47fcf 100644 --- 
a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2_db.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2_db.nf @@ -1,49 +1,64 @@ process KRAKEN2_DB { - tag "Creating host reads database in ${ref_dbs_dir}" - publishDir "${ref_dbs_dir}", mode: 'copy' + tag "Creating host reads database in ${params.ref_dbs_dir}" + storeDir "${params.ref_dbs_dir}" input: - tuple val(host_name), val(host_url) , path(host_fasta) - path ref_dbs_dir + val(host_name) + val(host_fasta) output: - path("kraken2-${host_name}-db/"), emit: krakendb_dir - path("versions.txt"), emit: version + path("kraken2-${host_name}-db/"), emit: build + path("versions.txt"), emit: version, optional: true script: - if (host_url != null) + if (params.db_url) """ - echo "Downloading and unpacking database from ${host_url} - wget -O ${host_name}.tar.gz --timeout=3600 --tries=0 --continue ${host_url} + echo "Downloading and unpacking database from ${params.db_url}" + wget -O kraken2-${host_name}-db.tar.gz --timeout=3600 --tries=0 --continue ${params.db_url} - mkdir kraken2-${host_name}-db/ && tar -zxvf -C kraken2-${host_name}-db/ + tar -zxvf kraken2-${host_name}-db.tar.gz # Cleaning up - [ -f ${host_name}.tar.gz ] && rm -rf ${host_name}.tar.gz - - echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt + [ -f kraken2-${host_name}-db.tar.gz ] && rm -rf kraken2-${host_name}-db.tar.gz """ - else if (host_fasta != null) + else if (host_fasta) """ echo "Attempting to build a custom ${host_name} reference database from ${host_fasta}" # install taxonomy k2 download-taxonomy --db kraken2-${host_name}-db/ + # handle fasta file if it is a URL or local file + if [[ "${host_fasta}" == http* ]]; then + # extract filename from URL, strip .gz suffix + fasta_filename=\$(basename "${host_fasta}" .gz) + wget -q ${host_fasta} -O \${fasta_filename}.gz + gunzip \${fasta_filename}.gz + else + 
# check if local file is gzipped + if [[ "${host_fasta}" == *.gz ]]; then + fasta_filename=\$(basename "${host_fasta}" .gz) + gunzip -c ${host_fasta} > \${fasta_filename} + else + # local uncompressed can be used directly + fasta_filename=\$(basename "${host_fasta}") + fi + fi + # add sequence to database's genomic library k2 add-to-library --db kraken2-${host_name}-db/ --threads ${task.cpus} \ - --files ${host_fasta} --no-masking + --files \${fasta_filename} --no-masking # build the kraken2 database k2 build --db kraken2-${host_name}-db/ --threads ${task.cpus} \ --kmer-len 35 --minimizer-len 31 # remove intermediate files - k2 clean --db kraken2-${host_name}-db/ + k2 clean --db kraken2-${host_name}-db/ --log clean.log echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt """ - else if (host_name != null) + else if (host_name) """ echo "Download and build kraken reference for named host: ${host_name}" @@ -58,13 +73,16 @@ process KRAKEN2_DB { k2 build --db kraken2-${host_name}-db/ --threads ${task.cpus} \ --kmer-len 35 --minimizer-len 31 + # copy log files needed to extract relevant genomic info + cp kraken2-${host_name}-db/library/human/assembly_summary.txt kraken2-${host_name}-db/ + cp kraken2-${host_name}-db/library/human/manifest.txt kraken2-${host_name}-db/ + # remove intermediate files - k2 clean --db kraken2-${host_name}-db/ + k2 clean --db kraken2-${host_name}-db/ --log clean.log echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt """ else - error "Input error, host_name, host_url, and host_fasta are all set to null. Please supply at least one valid parameter for database creation" - + error "Input error, host_name, host_fasta, and db_url are all missing. 
Please supply at least one valid parameter for database creation" } \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/parse_hosts_table.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/parse_hosts_table.nf new file mode 100644 index 000000000..09811c18d --- /dev/null +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/parse_hosts_table.nf @@ -0,0 +1,51 @@ +process PARSE_HOSTS_TABLE { + + input: + val(hosts_table) + val(organism_key) + + output: + val(fasta_url), emit: reference_fasta_url + val(genome), emit: reference_genome + val(accession), emit: reference_accession + + exec: + def colorCodes = [ + c_line: "┅" * 70, + c_back_bright_red: "\u001b[41;1m", + c_bright_green: "\u001b[32;1m", + c_blue: "\033[0;34m", + c_yellow: "\u001b[33;1m", + c_reset: "\033[0m" + ] + + def organisms = [:] + println "${colorCodes.c_yellow}Fetching table from ${hosts_table}${colorCodes.c_reset}" + + new File(hosts_table).splitEachLine(",") {fields -> + organisms[fields[0]] = fields + } + + // Check if the organism exists in the table + if (organisms.containsKey(organism_key)) { + fasta_url = organisms[organism_key].size() > 4 ? organisms[organism_key][4] : null + genome = organisms[organism_key].size() > 3 ? organisms[organism_key][3] : null + accession = organisms[organism_key].size() > 2 ? 
organisms[organism_key][2] : null + + println "${colorCodes.c_blue}Hosts table values parsed for '${organism_key}':${colorCodes.c_bright_green}" + println "--------------------------------------------------" + println "- fasta_url: ${fasta_url}" + println "- genome assembly: ${genome}" + println "- assembly accession: ${accession}${colorCodes.c_reset}" + println "--------------------------------------------------" + } + else { + fasta_url = null + genome = null + accession = null + + println "${colorCodes.c_back_bright_red}WARNING: Organism '${organism_key}' not found in hosts table.${colorCodes.c_reset}" + println "${colorCodes.c_yellow}Returning null values for all outputs.${colorCodes.c_reset}" + + } +} \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/utils.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/software_versions.nf similarity index 100% rename from Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/utils.nf rename to Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/software_versions.nf diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow.config b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow.config index 3ffd21694..8782bc71f 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow.config +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow.config @@ -22,7 +22,14 @@ params { host = "human" hosts_table = "$projectDir/assets/hosts.csv" - ref_dbs_dir = null // Path to kraken2 database (or where it will be downloaded to if it's not set up yet), required + ref_dbs_dir = "./kraken2DBs" // Path to kraken2 database 
(or where it will be stored if it's not set up yet), required + + ref_fasta = "" // path or URL of custom reference genome fasta + assembly_name = "" // Assembly name for host reference genome (e.g. GRCh38.p14) + assembly_acc = "" // Assembly accession number for host reference genome (e.g. GCF_000001405.40) + + + db_url = "" // URL to download existing kraken2 database from reads_outdir = "${params.host}-removed-reads" // Output directory to hold -removed reads out_suffix = "_HRrm" @@ -99,14 +106,17 @@ process { container = "quay.io/biocontainers/kraken2:2.1.6--pl5321h077b44d_0" conda = "${projectDir}/envs/kraken2.yaml" } -} -workflow { - output { - mode = params.publish_dir_mode + withLabel: 'python' { + container = "quay.io/nasa_genelab/dp_tools:1.3.8" + conda = "${projectDir}/envs/dp_tools.yaml" } } +// Workflow output settings +outputDir = "." +workflow.output.mode = params.publish_dir_mode + timeline { enabled = true file = "nextflow_info/execution_timeline_${params.trace_timestamp}.html" diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json index 190627e5f..370e1eb85 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json @@ -46,7 +46,8 @@ }, "ref_dbs_dir": { "type": "string", - "description": "path to kraken2 databases" + "description": "path to kraken2 databases", + "default": "./kraken2DBs" }, "reads_outdir": { "type": "string", @@ -67,8 +68,19 @@ }, "trace_timestamp": { "type": "string", - "default": "2026-02-13_09-22-42", "hidden": true + }, + "ref_fasta": { + "type": "string" + }, + "assembly_name": { + "type": "string" + }, + "assembly_acc": { + "type": "string" + }, + "db_url": { + "type": 
"string" } }, "required": ["ref_dbs_dir", "sample_id_list", "reads_dir"] From c97eab24f994ff9e32a36d2c87b3ac0914263fd3 Mon Sep 17 00:00:00 2001 From: Jihan Yehia Date: Wed, 4 Mar 2026 11:48:39 -0800 Subject: [PATCH 3/9] Update workflow documentation to reflect changes --- .../NF_MGRemoveHostReads/README.md | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/README.md b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/README.md index e786dae71..aebd8ef98 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/README.md +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/README.md @@ -109,11 +109,16 @@ export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity This workflow can be run by providing the path to a text file containing a single-column list of unique sample identifiers, an example of which is shown [here](workflow_code/unique-sample-IDs.txt), along with the path to input data (raw reads of samples). -It also requires setting the root directory for where kraken2 reference databases are (or will be) stored. The workflow assumes databases follow the naming convention `kraken2--db`. If a database for a specified host is not found in the provided root directory, the workflow automatically builds one from scratch and saves it in the same directory using that name convention. +It also requires setting the root directory for where kraken2 reference databases are (or will be) stored. The workflow assumes databases follow the naming convention `kraken2--db` and determines how to obtain the database in the following order: -In cases where the workflow is to build kraken2 database from scratch, it is important to ensure that the host organism's details are present in hosts.csv table [here](workflow_code/assets/hosts.csv). 
If not, they should be appended to the table in the following format: `name,species,refseq_ID,genome_build,FASTA_URL`. +1. **Database already exists** in the provided root directory: the workflow reuses it directly, skipping the build step entirely. +2. **Pre-built database URL**: the workflow downloads and unpacks the database into the root directory. An example is available in the [reference database info page](https://github.com/nasa/GeneLab_Data_Processing/blob/master/Metagenomics/Remove_host_reads/Workflow_Documentation/SW_MGRemoveHumanReads-A/reference-database-info.md), which describes how the human database was generated for a previous version of this workflow and how to obtain it for reuse. +3. **Custom FASTA file**: the workflow builds a database from scratch using the provided FASTA file (local path or URL). +4. **hosts.csv lookup**: the workflow parses the hosts table [here](workflow_code/assets/hosts.csv) to retrieve the FASTA URL and build the database from scratch. -Alternatively, a pre-built database can be manually downloaded and unpacked into the root directory, provided it follows the same naming convention. An example of which is available in the [reference database info page](https://github.com/nasa/GeneLab_Data_Processing/blob/master/Metagenomics/Remove_host_reads/Workflow_Documentation/SW_MGRemoveHumanReads-A/reference-database-info.md), which describes how the human database was generated for a previous version of this workflow and how to obtain it for reuse. +For options 2 and 3, it is recommended that the user provide host genome assembly name and accession number as parameters for protocol documentation purposes. For option 4, if the host entry in hosts.csv has no FASTA URL specified, the workflow uses Kraken's `k2 download-library` to download sequences directly from NCBI, automatically capturing assembly details, and, therefore, no additional parameters are needed. 
If a FASTA URL is present in hosts.csv, the assembly name and accession are also picked up from the same table entry for protocol documentation. + +In all cases where the database is built from scratch, it is saved in the root directory using the `kraken2--db` naming convention for reuse in subsequent runs. > Note: Nextflow commands use both single hyphen arguments (e.g. -profile) that denote general Nextflow arguments and double hyphen arguments (e.g. --osd) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument. @@ -125,7 +130,7 @@ Alternatively, a pre-built database can be manually downloaded and unpacked into nextflow run main.nf \ -resume \ -profile singularity \ - --ref_dbs_Dir \ + --ref_dbs_dir \ --sample_id_list unique_sample_ids.txt \ --reads_dir ``` @@ -145,7 +150,7 @@ nextflow run main.nf \ * `mamba` - instructs Nextflow to use conda environments via the mamba package manager * Other option (can be combined with the software environment option above using a comma, e.g. 
`-profile slurm,singularity`): * `slurm` - instructs Nextflow to use the [Slurm cluster management and job scheduling system](https://slurm.schedmd.com/overview.html) to schedule and run the jobs on a Slurm HPC cluster -* `--ref_dbs_Dir` - Specifies the path to the directory where kraken2 databases are or will be stored +* `--ref_dbs_dir` - Specifies the path to the directory where kraken2 databases are or will be stored (type: string, default: "./kraken2DBs") * `--sample_id_list` - path to a single-column file with unique sample identifiers (type: string, default: null) > *Note: An example of this file is provided in the [workflow_code](workflow_code) directory [here](workflow_code/unique-sample-IDs.txt).* * `--reads_dir` - path to raw reads directory (type: string, default: null) @@ -159,10 +164,22 @@ nextflow run main.nf \ * `--single_suffix` - raw reads suffix that follows the unique part of sample names (type: string, default: "_raw.fastq.gz") * `--R1_suffix` - raw forward reads suffix that follows the unique part of sample names (type: string, default: "_R1_raw.fastq.gz") * `--R2_suffix` - raw reverse reads suffix that follows the unique part of sample names (type: string, default: "_R2_raw.fastq.gz") -* `--single_out_suffix` - host-removed reads suffix that follows the unique part of sample names (type: string, default: "_HRremoved_raw.fastq.gz") -* `--R1_out_suffix` - host-removed forward reads suffix that follows the unique part of sample names (type: string, default: "_R1_HRremoved_raw.fastq.gz") -* `--R2_out_suffix` - host-removed reverse reads suffix that follows the unique part of sample names (type: string, default: "_R2_HRremoved_raw.fastq.gz") +* `--out_suffix` - host-removed reads suffix that follows the unique part of sample names (type: string, default: "_HRrm") * `--host` - host organism, should match the entry provided under `name` column in [hosts.csv](workflow_code/assets/hosts.csv) (type: string, default: "human") +* `--db_url` - URL to 
download a pre-built Kraken2 database (type: string, default: ""). If provided, `--ref_fasta` is not required. +* `--ref_fasta` - local path or URL of a custom reference genome FASTA file to build a Kraken2 database from (type: string, default: ""). If provided, `--assembly_name` and `--assembly_acc` are also recommended. +* `--assembly_name` - assembly name for the host reference genome, e.g. `GRCh38.p14` (type: string, default: ""). Required when using `--db_url` or `--ref_fasta` for protocol documentation. +* `--assembly_acc` - assembly accession number for the host reference genome, e.g. `GCF_000001405.40` (type: string, default: ""). Required when using `--db_url` or `--ref_fasta` for protocol documentation. + + +These parameters and additional optional arguments for the NF_MGRemoveHostReads workflow, including options that may not be immediately useful for most users, can be viewed by running the following command: + +```bash +nextflow run NF_MGRemoveHostReads_1.0.0/main.nf --help +``` + +See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details common to all nextflow workflows. +
From b9aab8effe7746ee59abe979e4ec00b7422684f7 Mon Sep 17 00:00:00 2001 From: Jihan Yehia Date: Wed, 4 Mar 2026 12:00:07 -0800 Subject: [PATCH 4/9] Remove legacy code from GENERATE_PROTOCOL process --- .../modules/generate_protocol.nf | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf index abeb97b48..d33bbc4c7 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf @@ -1,27 +1,3 @@ -/*process GENERATE_PROTOCOL { - - beforeScript "chmod +x ${projectDir}/bin/*" - tag "Generating analysis protocol text..." - - input: - tuple val(host), val(genome), val(assembly_acc) - path software_versions - path protocol - - output: - path("protocol.txt") - - script: - if (protocol.exists()) - """ - cp ${protocol} protocol.txt - """ - else - """ - generate_protocol.sh ${software_versions} ${host} "${assembly_acc}" ${genome} > protocol.txt - """ -}*/ - process GENERATE_PROTOCOL { tag "Generating analysis protocol text for ${host}" label 'python' From 238d2f5ef0347a520bfc883b15412e9b352d749f Mon Sep 17 00:00:00 2001 From: Jihan Yehia Date: Wed, 4 Mar 2026 18:58:34 -0800 Subject: [PATCH 5/9] Add description to added params in nextflow_schema.json --- .../workflow_code/nextflow_schema.json | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json index 370e1eb85..73973d081 100644 --- 
a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json @@ -71,16 +71,20 @@ "hidden": true }, "ref_fasta": { - "type": "string" + "type": "string", + "description": "Specifies a custom reference FASTA file to use by providing a local path or a URL" }, "assembly_name": { - "type": "string" + "type": "string", + "description": "Specifies the assembly name for the host reference genome" }, "assembly_acc": { - "type": "string" + "type": "string", + "description": "Specifies the assembly accession number for the host reference genome" }, "db_url": { - "type": "string" + "type": "string", + "description": "Specifies a URL to download a pre-built Kraken2 database from" } }, "required": ["ref_dbs_dir", "sample_id_list", "reads_dir"] From 65461a182ec94a516af5b3746a1cbc7c53488538 Mon Sep 17 00:00:00 2001 From: Jihan Yehia Date: Fri, 13 Mar 2026 12:29:30 -0700 Subject: [PATCH 6/9] Update README links and add reference database documentation; modify hosts.csv and main.nf for improved workflow handling --- .../NF_MGRemoveHostReads/README.md | 2 +- .../reference-database-info.md | 45 +++++++++++++++++++ .../workflow_code/assets/hosts.csv | 6 +-- .../workflow_code/main.nf | 8 ++-- .../workflow_code/unique-sample-IDs.txt | 1 + 5 files changed, 54 insertions(+), 8 deletions(-) create mode 100644 Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/reference-database-info.md create mode 100644 Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/unique-sample-IDs.txt diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/README.md b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/README.md index aebd8ef98..d39276a1d 100644 --- 
a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/README.md +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/README.md @@ -112,7 +112,7 @@ This workflow can be run by providing the path to a text file containing a singl It also requires setting the root directory for where kraken2 reference databases are (or will be) stored. The workflow assumes databases follow the naming convention `kraken2--db` and determines how to obtain the database in the following order: 1. **Database already exists** in the provided root directory: the workflow reuses it directly, skipping the build step entirely. -2. **Pre-built database URL**: the workflow downloads and unpacks the database into the root directory. An example is available in the [reference database info page](https://github.com/nasa/GeneLab_Data_Processing/blob/master/Metagenomics/Remove_host_reads/Workflow_Documentation/SW_MGRemoveHumanReads-A/reference-database-info.md), which describes how the human database was generated for a previous version of this workflow and how to obtain it for reuse. +2. **Pre-built database URL**: the workflow downloads and unpacks the database into the root directory. An example is available in the [reference database info page](https://github.com/nasa/GeneLab_Data_Processing/blob/master/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/reference-database-info.md), which describes how the human database was generated for a previous version of this workflow and how to obtain it for reuse. 3. **Custom FASTA file**: the workflow builds a database from scratch using the provided FASTA file (local path or URL). 4. **hosts.csv lookup**: the workflow parses the hosts table [here](workflow_code/assets/hosts.csv) to retrieve the FASTA URL and build the database from scratch. 
diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/reference-database-info.md b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/reference-database-info.md new file mode 100644 index 000000000..c78bfbded --- /dev/null +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/reference-database-info.md @@ -0,0 +1,45 @@ +# Reference database info +The database we use was built with kraken2 v2.1.1 as detailed below, and by default will be downloaded automatically the first time the workflow is run (it's ~4.3 GB uncompressed). + +--- + +## Kraken2 human database build + +> The following was performed on 29-Nov-2020 with kraken2 v2.1.1, which at that time pulled RefSeq human genome reference GCF_000001405.39, GRCh38.p13. + +**Download human reference (takes ~2 minutes as run here):** + +```bash +kraken2-build --download-library human --db kraken2-human-db --threads 30 --no-masking +``` + +**Download NCBI taxonomy info needed (takes ~10 minutes):** + +```bash +kraken2-build --download-taxonomy --db kraken2-human-db/ +``` + +**Build the database (takes ~20 minutes as run here):** + +```bash +kraken2-build --build --db kraken2-human-db/ --threads 30 +``` + +**Remove intermediate files:** + +```bash +kraken2-build --clean --db kraken2-human-db/ +``` + +--- + +## Download database as built on 29-Nov-2020 +The reference database is 3GB compressed and ~4.3GB uncompressed. If using the accompanying Nextflow workflow, it will download and use this reference database.
It can also be downloaded and unpacked independently by running the following commands: + +```bash +curl -L -o kraken2-human-db.tar.gz https://ndownloader.figshare.com/files/25627058 + +tar -xzvf kraken2-human-db.tar.gz +``` + +--- diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/assets/hosts.csv b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/assets/hosts.csv index 979befee2..c867e1791 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/assets/hosts.csv +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/assets/hosts.csv @@ -1,6 +1,6 @@ name,species,refseq,genome,fasta human,Homo sapiens mouse,Mus musculus,GCF_000001635.27,GRCm39,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.27_GRCm39/GCF_000001635.27_GRCm39_genomic.fna.gz -arabidopsis,Arabidopsis thaliana,GCF_000001735.4,TAIR10,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.4_TAIR10.1/GCF_000001735.4_TAIR10.1_genomic.fna.gz -lettuce,Lactuca sativa,GCF_002870075.4,Lsat_Salinas_v11,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/870/075/GCF_002870075.4_Lsat_Salinas_v11/GCF_002870075.4_Lsat_Salinas_v11_genomic.fna.gz -tomato,Solanum lycopersicum,GCF_036512215.1,SLM_r2.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/036/512/215/GCF_036512215.1_SLM_r2.1/GCF_036512215.1_SLM_r2.1_cds_from_genomic.fna.gz \ No newline at end of file +arabidopsis,Arabidopsis thaliana,GCF_000001735.4,TAIR10.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.4_TAIR10.1/GCF_000001735.4_TAIR10.1_genomic.fna.gz +garden_lettuce,Lactuca sativa,GCF_002870075.5,Lsat_Salinas_v15,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/870/075/GCF_002870075.5_Lsat_Salinas_v15/GCF_002870075.5_Lsat_Salinas_v15_genomic.fna.gz +tomato,Solanum 
lycopersicum,GCF_036512215.1,SLM_r2.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/036/512/215/GCF_036512215.1_SLM_r2.1/GCF_036512215.1_SLM_r2.1_genomic.fna.gz \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf index 0fa36d463..f4eaaa3fa 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf @@ -35,7 +35,7 @@ workflow { } else if ( params.db_url ) { // Option 1: pre-built DB - only assembly_name and assembly_acc needed - reference_fasta = channel.value([]) + reference_fasta = channel.value("") reference_genome = params.assembly_name ? channel.value(params.assembly_name) : channel.value("unknown") reference_accession = params.assembly_acc ? channel.value(params.assembly_acc) : channel.value("unknown") @@ -78,7 +78,7 @@ workflow { // Generate summary and compile into one file SUMMARY(KRAKEN_2.out.output, KRAKEN_2.out.report) - COMPILE_SUMMARY(SUMMARY.out.collect(), channel.fromPath(params.sample_id_list), params.host) + COMPILE_SUMMARY(SUMMARY.out.collect(), channel.fromPath(params.sample_id_list), host_id) // Software Version Capturing - combining all captured software versions KRAKEN_2.out.version | mix(KRAKEN2_DB.out.version.ifEmpty("")) @@ -115,11 +115,11 @@ workflow { output { protocol_out { - path "processing_info" + path "GeneLab" } software_versions { - path "processing_info" + path "GeneLab" } fastq_out { diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/unique-sample-IDs.txt b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/unique-sample-IDs.txt new file mode 100644 index 000000000..2e7ba703d --- /dev/null +++ 
b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/unique-sample-IDs.txt @@ -0,0 +1 @@ +Sample-1 From 85ca41e45a2be2006d7b09577f3d02d32c12c789 Mon Sep 17 00:00:00 2001 From: Barbara Novak <19824106+bnovak32@users.noreply.github.com> Date: Mon, 16 Mar 2026 17:37:03 -0700 Subject: [PATCH 7/9] Updated documentation - updated pipeline doc and READMEs to specifically talk more about human read removal - updated approver list and date - updated READMes - switched to k2 download-taxonomy - updated kraken2 build db definitions --- Metagenomics/README.md | 2 +- .../GL-DPPD-7105-B.md | 90 +++++++++++-------- .../NF_MGRemoveHostReads/README.md | 4 +- 3 files changed, 58 insertions(+), 38 deletions(-) diff --git a/Metagenomics/README.md b/Metagenomics/README.md index e5f6d41f4..d50b54ecd 100644 --- a/Metagenomics/README.md +++ b/Metagenomics/README.md @@ -5,7 +5,7 @@ ## Select a specific pipeline for more info: * [Estimating host reads](Estimate_host_reads_in_raw_data) -* [Removing host reads](Remove_host_reads) +* [Removing human or host reads](Remove_host_reads) * [Illumina](Illumina)
diff --git a/Metagenomics/Remove_host_reads/Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-B.md b/Metagenomics/Remove_host_reads/Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-B.md index e58e8c05b..e02d71bf9 100644 --- a/Metagenomics/Remove_host_reads/Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-B.md +++ b/Metagenomics/Remove_host_reads/Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-B.md @@ -4,7 +4,7 @@ --- -**Date:** November X, 2025 + **Date:** March X, 2026 **Revision:** B **Document Number:** GL-DPPD-7105-B @@ -12,10 +12,9 @@ Jihan Yehia (GeneLab Data Processing Team) **Approved by:** -Samrawit Gebre (OSDR Project Manager) +Jonathan Galazka (OSDR Project Manager) Danielle Lopez (OSDR Deputy Project Manager) -Jonathan Galazka (OSDR Project Scientist) -Amanda Saravia-Butler (GeneLab Science Lead) +Amanda Saravia-Butler (OSDR Subject Matter Expert) Barbara Novak (GeneLab Data Processing Lead) ## Updates from previous revision @@ -40,42 +39,58 @@ Barbara Novak (GeneLab Data Processing Lead) |Program|Version*|Relevant Links| |:------|:-----:|------:| -|kraken2|`kraken2 -v`|[https://github.com/DerrickWood/kraken2/wiki](https://github.com/DerrickWood/kraken2/wiki)| +|kraken2| 2.1.6 |[https://github.com/DerrickWood/kraken2/wiki](https://github.com/DerrickWood/kraken2/wiki)| -> \* Exact versions utilized for a given dataset are available along with the processing commands for each specific dataset (this is due to how the system may need to be updated regularly). +> \* Exact versions utilized for a given dataset are available along with the processing commands for each specific dataset (this is due to how the system may need to be updated regularly). --- # General processing overview with example commands -> Output files listed in **bold** below are included with each Metagenomics dataset in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).
+> Output files listed in **bold** below are included with each Metagenomics dataset in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/). +> **Note:** the examples provided are for human read removal. For the more generic host read removal, please refer to the Workflow documentation. -## 1. Build kraken2 database -This depends on the appropriate host genome. This example is done with the human genome ([GRCh38.p13 | GCF_000001405.39](https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.39)). -> **Note:** It is recommended to use NCBI with kraken2 because sequences not downloaded from NCBI may require explicit assignment of taxonomy information before they can be used to build the database, as mentioned [here](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown). -```bash -# downloading and decompressing reference genome -wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.fna.gz -gunzip GCF_000001405.39_GRCh38.p13_genomic.fna.gz +## 1. Build kraken2 database +For human read removal, building the database relies on kraken2 downloading the sequences from NCBI, +which may include multiple different versions of the genome (for example, both [GRCh38.p14](https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.40) +and [T2T-CHM13v2.0](https://www.ncbi.nlm.nih.gov/assembly/GCF_009914755.1)).
To use this pipeline for +read removal of hosts other than human, download the host fasta file and add it to the kraken2 database +using the [`kraken2-build --add-to-library` function as described in the [kraken2 documentation](https://github.com/DerrickWood/kraken2/wiki/Manual#custom-databases) or +in the NASA GeneLab [Estimate Host Reads pipeline](../../Estimate_host_reads_in_raw_data/Pipeline_GL-DPPD-7109_Versions/) + +> **Note:** It is recommended to use NCBI genome files with kraken2 because sequences not downloaded from +NCBI may require explicit assignment of taxonomy information before they can be used to build the +database, as mentioned in the [Kraken2 Documentation](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown). +```bash +# download human fasta sequences +kraken2-build --download-library human --db kraken2-human-db/ --threads 30 --no-masking -# building kraken2 database +# Download NCBI taxonomic information k2 download-taxonomy --db kraken2-human-db/ -kraken2-build --add-to-library GCF_000001405.39_GRCh38.p13_genomic.fna --no-masking --db kraken2-human-db/ -kraken2-build --build --db kraken2-human-db/ --threads 30 --no-masking + +# Build the database +kraken2-build --build --db kraken2-human-db/ --kmer-len 35 --minimizer-len 31 --threads 30 + +# Clean up intermediate files kraken2-build --clean --db kraken2-human-db/ ``` **Parameter Definitions:** - -* `download-taxonomy` - downloads taxonomic mapping information via [k2 wrapper script](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#introducing-k2) -* `--add-to-library` - adds the fasta file to the library of sequences being included -* `--db` - specifies the directory we are putting the database in -* `--threads` - specifies the number of threads to use -* `--no-masking` - prevents [masking](https://github.com/DerrickWood/kraken2/wiki/Manual#masking-of-low-complexity-sequences) of low-complexity sequences -* `--build` - specifies to construct 
kraken2-formatted database -* `--clean` - specifies to remove unnecessarily intermediate files +*kraken2-build* +- `--download-library` - specifies the references to download (here the human reference genome) + - `--no-masking` - Disables masking of low-complexity sequences. For additional + information see the [kraken documentation for masking](https://github.com/DerrickWood/kraken2/wiki/Manual#masking-of-low-complexity-sequences). +- `--db` - Specifies the name of the directory for the kraken2 database +- `--build` - Instructs kraken2-build to build the kraken2 DB from the library files + - `--kmer-len` - K-mer length in bp (default: 35). + - `--minimizer-len` - Minimizer length in bp (default: 31) +- `--clean` - Instructs kraken2-build to remove unneeded intermediate files. + +*k2* +- `download-taxonomy` - downloads taxonomic mapping information via [k2 wrapper script](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#introducing-k2) +- `--db` - Specifies the name of the directory for the kraken2 database **Input data:** @@ -83,12 +98,14 @@ kraken2-build --clean --db kraken2-human-db/ **Output data:** -* kraken2 database files (hash.k2d, opts.k2d, and taxo.k2d) +- kraken2-human-db/ (Kraken2 human database directory, containing hash.k2d, opts.k2d, and taxo.k2d files) --- ## 2. Filter out human-classified reads +>**Note:** HRrm is the file suffix NASA GeneLab uses for all files with human reads removed. If host reads are removed, the suffix is updated to "HostRm" instead.
+ ### Example if paired-end reads ```bash @@ -97,8 +114,8 @@ kraken2 --db kraken2-human-db --gzip-compressed --threads 4 --use-names --paired --unclassified-out sample-1_R#.fastq sample-1-R1.fq.gz sample-1-R2.fq.gz # renaming and gzipping output files -mv sample-1_R_1.fastq sample-1_R1_HRremoved_raw.fastq && gzip sample-1_R1_HRremoved_raw.fastq -mv sample-1_R_2.fastq sample-1_R2_HRremoved_raw.fastq && gzip sample-1_R2_HRremoved_raw.fastq +mv sample-1_R_1.fastq sample-1_R1_HRrm.fastq && gzip sample-1_R1_HRrm.fastq +mv sample-1_R_2.fastq sample-1_R2_HRrm.fastq && gzip sample-1_R2_HRrm.fastq ``` **Parameter Definitions:** @@ -115,6 +132,7 @@ mv sample-1_R_2.fastq sample-1_R2_HRremoved_raw.fastq && gzip sample-1_R2_HRremo **Input data:** +* kraken2-human-db (Kraken2 human database directory, from [Step 1](#1-build-kraken2-database)) * sample-1-R1.fq.gz (gzipped forward-reads fastq file) * sample-1-R2.fq.gz (gzipped reverse-reads fastq file) @@ -122,18 +140,18 @@ mv sample-1_R_2.fastq sample-1_R2_HRremoved_raw.fastq && gzip sample-1_R2_HRremo * sample-1-kraken2-output.txt (kraken2 read-based output file (one line per read)) * sample-1-kraken2-report.tsv (kraken2 report output file (one line per taxa, with number of reads assigned to it)) -* **sample-1_R1_HRremoved_raw.fastq.gz** (host-read removed, gzipped forward-reads fastq file) -* **sample-1_R2_HRremoved_raw.fastq.gz** (host-read removed, gzipped reverse-reads fastq file) +* **sample-1_R1_HRrm.fastq.gz** (human-read removed, gzipped forward-reads fastq file) +* **sample-1_R2_HRrm.fastq.gz** (human-read removed, gzipped reverse-reads fastq file) ### Example if single-end reads ```bash kraken2 --db kraken2-human-db --gzip-compressed --threads 4 --use-names \ --output sample-1-kraken2-output.txt --report sample-1-kraken2-report.tsv \ - --unclassified-out sample-1_HRremoved_raw.fastq sample-1.fq.gz + --unclassified-out sample-1_HRrm.fastq sample-1.fq.gz # gzipping output file -gzip sample-1_HRremoved_raw.fastq +gzip 
sample-1_HRrm.fastq ``` **Parameter Definitions:** @@ -155,13 +173,15 @@ gzip sample-1_HRremoved_raw.fastq * sample-1-kraken2-output.txt (kraken2 read-based output file (one line per read)) * sample-1-kraken2-report.tsv (kraken2 report output file (one line per taxa, with number of reads assigned to it)) -* **sample-1_HRremoved_raw.fastq.gz** (host-read removed, gzipped reads fastq file) +* **sample-1_HRrm.fastq.gz** (human-read removed, gzipped reads fastq file) --- ## 3. Generate a kraken2 summary report Utilizes a Unix-like command-line. +>**Note:** In the case of host removal rather than human removal, update the script below to change "human" to "host". + ```bash total_fragments=$(wc -l sample-1-kraken2-output.txt | sed 's/^ *//' | cut -f 1 -d " ") @@ -169,7 +189,7 @@ fragments_retained=$(grep -w -m 1 "unclassified" sample-1-kraken2-report.tsv | c perc_removed=$(printf "%.2f\n" $(echo "scale=4; 100 - ${fragments_retained} / ${total_fragments} * 100" | bc -l)) -cat <( printf "Sample_ID\tTotal_fragments_before\tTotal_fragments_after\tPercent_host_reads_removed\n" ) \ +cat <( printf "Sample_ID\tTotal_fragments_before\tTotal_fragments_after\tPercent_human_reads_removed\n" ) \ <( printf "Sample-1\t${total_fragments}\t${fragments_retained}\t${perc_removed}\n" ) > Human-read-removal-summary.tsv ``` @@ -180,7 +200,7 @@ cat <( printf "Sample_ID\tTotal_fragments_before\tTotal_fragments_after\tPercent **Output data:** -* -read-removal-summary.tsv (a tab-separated file with 4 columns: "Sample_ID", "Total_fragments_before", "Total_fragments_after", "Percent_host_reads_removed") +* Human-read-removal-summary.tsv (a tab-separated file with 4 columns: "Sample_ID", "Total_fragments_before", "Total_fragments_after", "Percent_human_reads_removed") * *Note: The percent human reads removed from each sample is provided in the assay table on the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).* --- diff --git
a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/README.md b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/README.md index d39276a1d..58cae0bfc 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/README.md +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/README.md @@ -65,7 +65,7 @@ We recommend installing Singularity on a system wide level as per the associated ### 2. Download the Workflow Files -All files required for utilizing the NF_MGRemoveHostReads GeneLab workflow for removing host reads from metagenomics sequencing data are in the [workflow_code](workflow_code) directory. To get a copy of the latest *NF_MGRemoveHostReads* version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands: +All files required for utilizing the NF_MGRemoveHostReads GeneLab workflow for removing human or host reads from metagenomics sequencing data are in the [workflow_code](workflow_code) directory. 
To get a copy of the latest *NF_MGRemoveHostReads* version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands: ```bash wget @@ -164,7 +164,7 @@ nextflow run main.nf \ * `--single_suffix` - raw reads suffix that follows the unique part of sample names (type: string, default: "_raw.fastq.gz") * `--R1_suffix` - raw forward reads suffix that follows the unique part of sample names (type: string, default: "_R1_raw.fastq.gz") * `--R2_suffix` - raw reverse reads suffix that follows the unique part of sample names (type: string, default: "_R2_raw.fastq.gz") -* `--out_suffix` - host-removed reads suffix that follows the unique part of sample names (type: string, default: "_HRrm") +* `--out_suffix` - human- or host-removed reads suffix that follows the unique part of sample names (type: string, default: "_HRrm" for human-removed, or "_HostRm" for host-removed) * `--host` - host organism, should match the entry provided under `name` column in [hosts.csv](workflow_code/assets/hosts.csv) (type: string, default: "human") * `--db_url` - URL to download a pre-built Kraken2 database (type: string, default: ""). If provided, `--ref_fasta` is not required. * `--ref_fasta` - local path or URL of a custom reference genome FASTA file to build a Kraken2 database from (type: string, default: ""). If provided, `--assembly_name` and `--assembly_acc` are also recommended. 
From c492bf68878c86ab36a67c423bdbbbd16e1ca2c1 Mon Sep 17 00:00:00 2001 From: Jihan Yehia Date: Mon, 16 Mar 2026 18:05:55 -0700 Subject: [PATCH 8/9] Update documentation to use k2 instead of kraken2-build in building-database commands --- .../GL-DPPD-7105-B.md | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/Metagenomics/Remove_host_reads/Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-B.md b/Metagenomics/Remove_host_reads/Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-B.md index e02d71bf9..b74490697 100644 --- a/Metagenomics/Remove_host_reads/Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-B.md +++ b/Metagenomics/Remove_host_reads/Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-B.md @@ -19,7 +19,7 @@ Barbara Novak (GeneLab Data Processing Lead) ## Updates from previous revision * Updated kraken2 from version 2.1.1 to 2.1.6 -* In [Step 1](#1-build-kraken2-database), used kraken2's `k2` wrapper script for `download-taxonomy` because the script supports HTTPS download as mentioned [here](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#introducing-k2) +* In [Step 1](#1-build-kraken2-database), replaced direct `kraken2-build` calls with kraken2's `k2` wrapper script, which provides a higher-level interface for database construction including HTTPS download support, as described [here](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#introducing-k2) --- @@ -56,7 +56,7 @@ For human read removal, building the database relies on kraken2 downloading the which may include multiple different versions of the genome (for example, both [GRCh38.p14](https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.40) and [T2T-CHM13v2.0](https://www.ncbi.nlm.nih.gov/assembly/GCF_009914755.1)). 
To use this pipeline for read removal of hosts other than human, download the host fasta file and add it to the kraken2 database -using the [`kraken2-build --add-to-library` function as described in the [kraken2 documentation](https://github.com/DerrickWood/kraken2/wiki/Manual#custom-databases) or +using the `k2 add-to-library` function as described in the [kraken2 documentation](https://github.com/DerrickWood/kraken2/wiki/Manual#add-to-library) or in the NASA GeneLab [Estimate Host Reads pipeline](../../Estimate_host_reads_in_raw_data/Pipeline_GL-DPPD-7109_Versions/) > **Note:** It is recommended to use NCBI genome files with kraken2 because sequences not downloaded from @@ -65,32 +65,31 @@ database, as mentioned in the [Kraken2 Documentation](https://github.com/Derrick ```bash # download human fasta sequences -kraken2-build --download-library human --db kraken2-human-db/ --threads 30 --no-masking +k2 download-library --db kraken2-human-db/ --library human --threads 30 --no-masking # Download NCBI taxonomic information k2 download-taxonomy --db kraken2-human-db/ # Build the database -kraken2-build --build --db kraken2-human-db/ --kmer-len 35 --minimizer-len 31 --threads 30 +k2 build --db kraken2-human-db/ --kmer-len 35 --minimizer-len 31 --threads 30 # Clean up intermediate files -kraken2-build --clean --db kraken2-human-db/ +k2 clean --db kraken2-human-db/ ``` **Parameter Definitions:** -*kraken2-build* -- `--download-library` - specifies the references to download (here the human reference genome) - - `--no-masking` - Disables masking of low-complexity sequences. For additional + +- `download-library` - downloads the references-containing library + - `--library` - specifies the library to download (here the human reference genome) + - `--no-masking` - disables masking of low-complexity sequences. For additional information see the [kraken documentation for masking](https://github.com/DerrickWood/kraken2/wiki/Manual#masking-of-low-complexity-sequences).
-- `--db` - Specifies the name of the directory for the kraken2 database -- `--build` - Instructs kraken2-build to build the kraken2 DB from the library files - - `--kmer-len` - K-mer length in bp (default: 35). - - `--minimizer-len` - Minimizer length in bp (default: 31) -- `--clean` - Instructs kraken2-build to remove unneeded intermediate files. - -*k2* -- `download-taxonomy` - downloads taxonomic mapping information via [k2 wrapper script](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#introducing-k2) -- `--db` - Specifies the name of the directory for the kraken2 database +- `--db` - specifies the name of the directory for the kraken2 database +- `--threads` - specifies the number of threads to use +- `download-taxonomy` - downloads taxonomic mapping information +- `build` - builds a kraken2-formatted database from the library files + - `--kmer-len` - k-mer length in bp (default: 35). + - `--minimizer-len` - minimizer length in bp (default: 31) +- `clean` - removes unneeded intermediate files **Input data:** From 4c087dfdc26fd974627483b527186053e41693e2 Mon Sep 17 00:00:00 2001 From: Jihan Yehia Date: Mon, 16 Mar 2026 18:13:59 -0700 Subject: [PATCH 9/9] Fix formatting --- .../Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-B.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Metagenomics/Remove_host_reads/Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-B.md b/Metagenomics/Remove_host_reads/Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-B.md index b74490697..a1779c646 100644 --- a/Metagenomics/Remove_host_reads/Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-B.md +++ b/Metagenomics/Remove_host_reads/Pipeline_GL-DPPD-7105_Versions/GL-DPPD-7105-B.md @@ -5,7 +5,7 @@ --- **Date:** March X, 2026 -**Revision:** B +**Revision:** B **Document Number:** GL-DPPD-7105-B **Submitted by:**