From 22bff2c5cc5a5ab3c35d0d96ad96ad6f29d20e05 Mon Sep 17 00:00:00 2001 From: eead-csic-compbio Date: Wed, 9 Sep 2020 09:03:36 +0100 Subject: [PATCH 1/9] added FRMT_ONE which stands for one-based inclusive --- src_2.0/nonltr/Scanner.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src_2.0/nonltr/Scanner.h b/src_2.0/nonltr/Scanner.h index e5ecb60..60c3ce4 100644 --- a/src_2.0/nonltr/Scanner.h +++ b/src_2.0/nonltr/Scanner.h @@ -54,6 +54,7 @@ class Scanner { public: static const int FRMT_POS = 1; static const int FRMT_BED = 2; + static const int FRMT_ONE = 3; Scanner(HMM *, int, ChromosomeOneDigit *, string); Scanner(HMM *, int, ChromosomeOneDigit *, ITableView *); From c5f653fe9d1846ef3eb29dc63c0283ddbf56edf2 Mon Sep 17 00:00:00 2001 From: eead-csic-compbio Date: Wed, 9 Sep 2020 09:13:49 +0100 Subject: [PATCH 2/9] added -frm 3 (FRM_ONE), which is one-based inclusive --- src_2.0/RepeatsDetector.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src_2.0/RepeatsDetector.cpp b/src_2.0/RepeatsDetector.cpp index 15c1937..d34d94a 100644 --- a/src_2.0/RepeatsDetector.cpp +++ b/src_2.0/RepeatsDetector.cpp @@ -230,6 +230,8 @@ void drive(map * const param) { string ext(".rpt"); if (atoi(param->at(FRM_PRM).c_str()) == 2) { ext = string(".bed"); + } else if (atoi(param->at(FRM_PRM).c_str()) == 3) { + ext = string(".tsv") } string rptFile = param->at(RPT_PRM) + Util::fileSeparator + nickName + ext; @@ -354,9 +356,11 @@ int main(int argc, char * argv[]) { message.append("\t\tMasked sequences files have the \".msk\" extension.\n"); message.append( - "\t-frm the format of the output: 1 (chrName:start-end) or 2 (chrName\tstart\tend).\n"); + "\t-frm the format of the output: 1 (chrName:start-end), 2 (chrName\tstart\tend) or 3 (chrName\tstart\tend).\n"); message.append( - "\t\tThe output format are zero based and the end is exclusive.\n"); + "\t\tOutput formats 1 & 2 are zero-based, end exclusive.\n"); + message.append( + "\t\tOutput format 3 is 
one-based, end inclusive (Ensembl).\n"); message.append("\t-hmo file where the HMM is saved, optional.\n"); message.append("\t-cor integer of the number of threads, optional.\n"); message.append("\t\tThe more threads, the higher the memory requirement.\n"); @@ -487,10 +491,12 @@ int main(int argc, char * argv[]) { Util::int2string(Scanner::FRMT_POS))); } else { if (atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_POS - && atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_BED) { + && atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_BED + && atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_ONE) { cerr << "The output format must be " << Scanner::FRMT_POS << " or "; - cerr << Scanner::FRMT_BED << ". The format received is "; + cerr << Scanner::FRMT_BED << " or "; + cerr << Scanner::FRMT_ONE << ". The format received is "; cerr << param->at(FRM_PRM) << "." << endl; return 1; } From 10f54c1c7d743fe990401a016c6df38ec2e41ff3 Mon Sep 17 00:00:00 2001 From: eead-csic-compbio Date: Wed, 9 Sep 2020 09:17:08 +0100 Subject: [PATCH 3/9] added FRMT_ONE which stands for one-based inclusive --- src_2.0/nonltr/Scanner.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src_2.0/nonltr/Scanner.cpp b/src_2.0/nonltr/Scanner.cpp index b235914..d73450d 100644 --- a/src_2.0/nonltr/Scanner.cpp +++ b/src_2.0/nonltr/Scanner.cpp @@ -299,13 +299,15 @@ void Scanner::printScores(string outputFile, bool canAppend) { void Scanner::printIndex(string outputFile, bool canAppend, int frmt) { - if(frmt != FRMT_POS && frmt != FRMT_BED){ + if(frmt != FRMT_POS && frmt != FRMT_BED && frmt != FRMT_ONE){ string msg("Unknown output format: "); msg.append(Util::int2string(frmt)); msg.append(". 
The known formats are: "); msg.append(Util::int2string(FRMT_POS)); msg.append(" and "); msg.append(Util::int2string(FRMT_BED)); + msg.append(" and "); + msg.append(Util::int2string(FRMT_ONE)); msg.append("."); throw InvalidInputException(msg); } @@ -334,7 +336,14 @@ void Scanner::printIndex(string outputFile, bool canAppend, int frmt) { outIndex << ((int) (regionList->at(j)->getEnd() + 1)); outIndex << endl; } - } + } else if(frmt == FRMT_ONE){ + for (unsigned int j = 0; j < regionList->size(); j++) { + outIndex << header.substr(1) << "\t"; + outIndex << ((int) (regionList->at(j)->getStart() +1)) << "\t"; + outIndex << ((int) (regionList->at(j)->getEnd() + 1)); + outIndex << endl; + } + } outIndex.close(); } From ac90a059cb04fb0d8af019e996e914e9b5720616 Mon Sep 17 00:00:00 2001 From: eead-csic-compbio Date: Wed, 9 Sep 2020 09:21:46 +0100 Subject: [PATCH 4/9] added -frm 3 (FRM_ONE), which is one-based inclusive --- src_2.0/RepeatsDetector.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src_2.0/RepeatsDetector.cpp b/src_2.0/RepeatsDetector.cpp index d34d94a..7cd12b0 100644 --- a/src_2.0/RepeatsDetector.cpp +++ b/src_2.0/RepeatsDetector.cpp @@ -231,7 +231,7 @@ void drive(map * const param) { if (atoi(param->at(FRM_PRM).c_str()) == 2) { ext = string(".bed"); } else if (atoi(param->at(FRM_PRM).c_str()) == 3) { - ext = string(".tsv") + ext = string(".tsv"); } string rptFile = param->at(RPT_PRM) + Util::fileSeparator + nickName + ext; From 8814d14f2a5aa12bfe65d4961ec6892b4fc387ed Mon Sep 17 00:00:00 2001 From: eead-csic-compbio Date: Wed, 9 Sep 2020 09:33:47 +0100 Subject: [PATCH 5/9] improved command-line format description --- src_2.0/RepeatsDetector.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src_2.0/RepeatsDetector.cpp b/src_2.0/RepeatsDetector.cpp index 7cd12b0..ce1c784 100644 --- a/src_2.0/RepeatsDetector.cpp +++ b/src_2.0/RepeatsDetector.cpp @@ -356,7 +356,9 @@ int main(int argc, char * argv[]) { 
message.append("\t\tMasked sequences files have the \".msk\" extension.\n"); message.append( - "\t-frm the format of the output: 1 (chrName:start-end), 2 (chrName\tstart\tend) or 3 (chrName\tstart\tend).\n"); + "\t-frm the format of the output: 1 (chrName:start-end), \n"); + message.append( + "\t 2 (chrName\tstart\tend) or 3 (chrName\tstart\tend).\n"); message.append( "\t\tOutput formats 1 & 2 are zero-based, end exclusive.\n"); message.append( From dbf8fa44e384264f67b591e85cab353b7bbbcd07 Mon Sep 17 00:00:00 2001 From: eead-csic-compbio Date: Wed, 9 Sep 2020 09:55:55 +0100 Subject: [PATCH 6/9] fixed indentation --- src_2.0/RepeatsDetector.cpp | 4 ++-- src_2.0/nonltr/Scanner.cpp | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src_2.0/RepeatsDetector.cpp b/src_2.0/RepeatsDetector.cpp index ce1c784..f61972e 100644 --- a/src_2.0/RepeatsDetector.cpp +++ b/src_2.0/RepeatsDetector.cpp @@ -358,11 +358,11 @@ int main(int argc, char * argv[]) { message.append( "\t-frm the format of the output: 1 (chrName:start-end), \n"); message.append( - "\t 2 (chrName\tstart\tend) or 3 (chrName\tstart\tend).\n"); + "\t 2 (chrName\tstart\tend) or 3 (chrName\tstart\tend).\n"); message.append( "\t\tOutput formats 1 & 2 are zero-based, end exclusive.\n"); message.append( - "\t\tOutput format 3 is one-based, end inclusive (Ensembl).\n"); + "\t\tOutput format 3 is one-based, end inclusive (Ensembl).\n"); message.append("\t-hmo file where the HMM is saved, optional.\n"); message.append("\t-cor integer of the number of threads, optional.\n"); message.append("\t\tThe more threads, the higher the memory requirement.\n"); diff --git a/src_2.0/nonltr/Scanner.cpp b/src_2.0/nonltr/Scanner.cpp index d73450d..f55bdf6 100644 --- a/src_2.0/nonltr/Scanner.cpp +++ b/src_2.0/nonltr/Scanner.cpp @@ -337,13 +337,13 @@ void Scanner::printIndex(string outputFile, bool canAppend, int frmt) { outIndex << endl; } } else if(frmt == FRMT_ONE){ - for (unsigned int j = 0; j < 
regionList->size(); j++) { - outIndex << header.substr(1) << "\t"; - outIndex << ((int) (regionList->at(j)->getStart() +1)) << "\t"; - outIndex << ((int) (regionList->at(j)->getEnd() + 1)); - outIndex << endl; - } - } + for (unsigned int j = 0; j < regionList->size(); j++) { + outIndex << header.substr(1) << "\t"; + outIndex << ((int) (regionList->at(j)->getStart() +1)) << "\t"; + outIndex << ((int) (regionList->at(j)->getEnd() + 1)); + outIndex << endl; + } + } outIndex.close(); } From 19264e696a8dd1bfc28763c7743d166c61c43243 Mon Sep 17 00:00:00 2001 From: Colin Davenport Date: Fri, 12 Apr 2024 10:57:30 +0200 Subject: [PATCH 7/9] add compile and usage docs, use g++9 --- README.md | 95 ++++++++++++++++++++++++++++++++++++++++++++---- src_2.0/Makefile | 3 +- 2 files changed, 89 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 20b1feb..1969d24 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,100 @@ # Red Red: an intelligent, rapid, accurate tool for detecting repeats de-novo on the genomic scale. +This fork of the original at https://github.com/BioinformaticsToolsmith/Red adds usability information and allows compilation with g++9 (e.g. on Ubuntu 20.04). + + +## Compiling the source code + Requirement: GNU gcc8.2 or higher. Please change the name (CXX) of the compiler in the Makefile.
- -Compiling the source code -The following command makes the required directories: -> make bin +``` +# get the code +git clone https://github.com/brunocontrerasmoreira/Red + +cd Red && cd src_2.0 + +# The following command makes the required directories: +make bin + +# The following command makes the binary that is located under the ``bin'' directory: +make -j 4 + +# To find the binary: +cd ../bin + +# Run the binary +./Red +``` + +## Example command + +# only genomes in .fa format are detected properly +ln -s genome.fasta input.fa + +# out dirs must all exist else core dump +mkdir -p out_mask output -The following command makes the binary that is located under the ``bin'' directory: -> make +# Run Red with an input.fa in the current dir . +Red -gnm . -msk out_mask -rpt output + + + +## Full usage + +``` +This is Red (REpeat Detector) designed and developed by Hani Zakaria Girgis, PhD. + +Version: 2.0 + +Argument pairs of the form: -flag value are required. +Valid argument pairs: + -gnm input genome directory, required. + Files with ".fa" extension in this directory are used for completing the table of the adjusted counts. + These Files are scanned for repeats. + -dir directory including additional input sequences, optional. + Files with ".fa" extension in this directory are NOT used for completing the table. + These Files MUST have different names from those in the genome directory. + These Files are scanned for repeats. + -len word length equals k defining the k-mer. The default is floor(log_4(genome size)). + -ord order of the background Markov chain. The default is floor(k/2)-1. + -gau half width of the mask. The default is based on the GC content. + 20 if the GC content > 33% and < 67%, 40 otherwise. + -thr the threshold score of the low adjusted scores of non-repeats. The default is 2. + -min the minimum number of the observed k-mers. The default is 3. + -tbl file where the table of the adjusted counts is written, optional. 
+ -sco directory where scores are saved, optional. + Score files have the ".scr" extension. + -cnd directory where candidate regions are saved, optional. + Candidates files have the ".cnd" extension. + -rpt directory where repeats locations are saved, optional. + Repeats files have the ".rpt" extension. + -msk directory where masked sequences are saved, optional. + Masked sequences files have the ".msk" extension. + -frm the format of the output: 1 (chrName:start-end), + 2 (chrName start end) or 3 (chrName start end). + Output formats 1 & 2 are zero-based, end exclusive. + Output format 3 is one-based, end inclusive (Ensembl). + -hmo file where the HMM is saved, optional. + -cor integer of the number of threads, optional. + The more threads, the higher the memory requirement. + The defaul is the number of cores - 1, or 1 if single core is found. + +Examples: + The following command runs Red with the defaults and generates the masked sequences. + Red -gnm genome_directory -msk output_directory + + The following command runs Red with the defaults and generates the masked sequences and the locations of repeats. + Red -gnm genome_directory -msk output_directory -rpt output_directory +``` -To find the binary: -> cd ../bin Please cite the following paper: Girgis, H.Z. (2015) Red: an intelligent, rapid, accurate tool for detecting repeats de-novo on the genomic scale. BMC Bioinformatics, 16, 227. 
+ +Original Repo + +https://github.com/BioinformaticsToolsmith/Red \ No newline at end of file diff --git a/src_2.0/Makefile b/src_2.0/Makefile index 9a62b76..71e25d7 100644 --- a/src_2.0/Makefile +++ b/src_2.0/Makefile @@ -2,7 +2,8 @@ # CXX = /usr/bin/c++ # CXX = /usr/bin/g++ -CXX = g++-8 +#CXX = g++-8 +CXX = g++-9 CXXFLAGS = -std=c++14 -fopenmp -O3 -g -fmessage-length=0 -Wall -fpermissive From 39475f374fed06569c5d4fdc236f4e45b2350c1d Mon Sep 17 00:00:00 2001 From: Colin Davenport Date: Fri, 12 Apr 2024 10:58:27 +0200 Subject: [PATCH 8/9] docs --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1969d24..028954a 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ cd ../bin ## Example command +``` # only genomes in .fa format are detected properly ln -s genome.fasta input.fa @@ -37,7 +38,7 @@ mkdir -p out_mask output # Run Red with an input.fa in the current dir . Red -gnm . -msk out_mask -rpt output - +``` ## Full usage From 3272a4ee9b34e270bdfa52a94e7a7b2b89d1c45c Mon Sep 17 00:00:00 2001 From: Colin Davenport Date: Fri, 12 Apr 2024 11:16:35 +0200 Subject: [PATCH 9/9] Update README.md --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 028954a..ccd320b 100644 --- a/README.md +++ b/README.md @@ -30,8 +30,8 @@ cd ../bin ## Example command ``` -# only genomes in .fa format are detected properly -ln -s genome.fasta input.fa +# only genomes in .fa format are detected properly - and will be DELETED by Red +cp /mnt/genome.fasta input.fa # out dirs must all exist else core dump mkdir -p out_mask output @@ -40,6 +40,8 @@ mkdir -p out_mask output Red -gnm . -msk out_mask -rpt output ``` +**Beware: Red deletes the .fa files in the genome directory, so place only copies there (never your original files). You will have to copy the genome in again before the next run.** + ## Full usage @@ -98,4 +100,4 @@ detecting repeats de-novo on the genomic scale.
BMC Bioinformatics, Original Repo -https://github.com/BioinformaticsToolsmith/Red \ No newline at end of file +https://github.com/BioinformaticsToolsmith/Red