diff --git a/README.md b/README.md index 20b1feb..ccd320b 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,103 @@ # Red Red: an intelligent, rapid, accurate tool for detecting repeats de-novo on the genomic scale. +This fork of the original at https://github.com/BioinformaticsToolsmith/Red adds usability information and allows compilation with g++9 (e.g. on Ubuntu 20.04). + + +## Compiling the source code + Requirement: GNU gcc8.2 or higher. Please change the name (CXX) of the compiler in the Makefile. - -Compiling the source code -The following command makes the required directories: -> make bin +``` +# get the code +git clone https://github.com/brunocontrerasmoreira/Red + +cd Red && cd src_2.0 + +# The following command makes the required directories: +make bin + +# The following command makes the binary that is located under the ``bin'' directory: +make -j 4 + +# To find the binary: +cd ../bin + +# Run the binary +./Red +``` + +## Example command + +``` +# only genomes in .fa format are detected properly - and will be DELETED by Red +cp /mnt/genome.fasta input.fa + +# all output directories must already exist, otherwise Red crashes with a core dump +mkdir -p out_mask output -The following command makes the binary that is located under the ``bin'' directory: -> make +# Run Red with an input.fa in the current dir . +Red -gnm . -msk out_mask -rpt output +``` + +**Beware: .fa files in the genome directory will be deleted by Red - so only copy them in. You will have to copy the genome in again before the next run.** + + +## Full usage + +``` +This is Red (REpeat Detector) designed and developed by Hani Zakaria Girgis, PhD. + +Version: 2.0 + +Argument pairs of the form: -flag value are required. +Valid argument pairs: + -gnm input genome directory, required. + Files with ".fa" extension in this directory are used for completing the table of the adjusted counts. + These Files are scanned for repeats. + -dir directory including additional input sequences, optional. 
+ Files with ".fa" extension in this directory are NOT used for completing the table. + These Files MUST have different names from those in the genome directory. + These Files are scanned for repeats. + -len word length equals k defining the k-mer. The default is floor(log_4(genome size)). + -ord order of the background Markov chain. The default is floor(k/2)-1. + -gau half width of the mask. The default is based on the GC content. + 20 if the GC content > 33% and < 67%, 40 otherwise. + -thr the threshold score of the low adjusted scores of non-repeats. The default is 2. + -min the minimum number of the observed k-mers. The default is 3. + -tbl file where the table of the adjusted counts is written, optional. + -sco directory where scores are saved, optional. + Score files have the ".scr" extension. + -cnd directory where candidate regions are saved, optional. + Candidates files have the ".cnd" extension. + -rpt directory where repeats locations are saved, optional. + Repeats files have the ".rpt" extension. + -msk directory where masked sequences are saved, optional. + Masked sequences files have the ".msk" extension. + -frm the format of the output: 1 (chrName:start-end), + 2 (chrName start end) or 3 (chrName start end). + Output formats 1 & 2 are zero-based, end exclusive. + Output format 3 is one-based, end inclusive (Ensembl). + -hmo file where the HMM is saved, optional. + -cor integer of the number of threads, optional. + The more threads, the higher the memory requirement. + The default is the number of cores - 1, or 1 if single core is found. + +Examples: + The following command runs Red with the defaults and generates the masked sequences. + Red -gnm genome_directory -msk output_directory + + The following command runs Red with the defaults and generates the masked sequences and the locations of repeats. 
+ Red -gnm genome_directory -msk output_directory -rpt output_directory +``` -To find the binary: -> cd ../bin Please cite the following paper: Girgis, H.Z. (2015) Red: an intelligent, rapid, accurate tool for detecting repeats de-novo on the genomic scale. BMC Bioinformatics, 16, 227. + +Original Repo + +https://github.com/BioinformaticsToolsmith/Red diff --git a/src_2.0/Makefile b/src_2.0/Makefile index 9a62b76..71e25d7 100644 --- a/src_2.0/Makefile +++ b/src_2.0/Makefile @@ -2,7 +2,8 @@ # CXX = /usr/bin/c++ # CXX = /usr/bin/g++ -CXX = g++-8 +#CXX = g++-8 +CXX = g++-9 CXXFLAGS = -std=c++14 -fopenmp -O3 -g -fmessage-length=0 -Wall -fpermissive diff --git a/src_2.0/RepeatsDetector.cpp b/src_2.0/RepeatsDetector.cpp index 15c1937..f61972e 100644 --- a/src_2.0/RepeatsDetector.cpp +++ b/src_2.0/RepeatsDetector.cpp @@ -230,6 +230,8 @@ void drive(map * const param) { string ext(".rpt"); if (atoi(param->at(FRM_PRM).c_str()) == 2) { ext = string(".bed"); + } else if (atoi(param->at(FRM_PRM).c_str()) == 3) { + ext = string(".tsv"); } string rptFile = param->at(RPT_PRM) + Util::fileSeparator + nickName + ext; @@ -354,9 +356,13 @@ int main(int argc, char * argv[]) { message.append("\t\tMasked sequences files have the \".msk\" extension.\n"); message.append( - "\t-frm the format of the output: 1 (chrName:start-end) or 2 (chrName\tstart\tend).\n"); + "\t-frm the format of the output: 1 (chrName:start-end), \n"); message.append( - "\t\tThe output format are zero based and the end is exclusive.\n"); + "\t 2 (chrName\tstart\tend) or 3 (chrName\tstart\tend).\n"); + message.append( + "\t\tOutput formats 1 & 2 are zero-based, end exclusive.\n"); + message.append( + "\t\tOutput format 3 is one-based, end inclusive (Ensembl).\n"); message.append("\t-hmo file where the HMM is saved, optional.\n"); message.append("\t-cor integer of the number of threads, optional.\n"); message.append("\t\tThe more threads, the higher the memory requirement.\n"); @@ -487,10 +493,12 @@ int main(int argc, 
char * argv[]) { Util::int2string(Scanner::FRMT_POS))); } else { if (atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_POS - && atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_BED) { + && atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_BED + && atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_ONE) { cerr << "The output format must be " << Scanner::FRMT_POS << " or "; - cerr << Scanner::FRMT_BED << ". The format received is "; + cerr << Scanner::FRMT_BED << " or "; + cerr << Scanner::FRMT_ONE << ". The format received is "; cerr << param->at(FRM_PRM) << "." << endl; return 1; } diff --git a/src_2.0/nonltr/Scanner.cpp b/src_2.0/nonltr/Scanner.cpp index b235914..f55bdf6 100644 --- a/src_2.0/nonltr/Scanner.cpp +++ b/src_2.0/nonltr/Scanner.cpp @@ -299,13 +299,15 @@ void Scanner::printScores(string outputFile, bool canAppend) { void Scanner::printIndex(string outputFile, bool canAppend, int frmt) { - if(frmt != FRMT_POS && frmt != FRMT_BED){ + if(frmt != FRMT_POS && frmt != FRMT_BED && frmt != FRMT_ONE){ string msg("Unknown output format: "); msg.append(Util::int2string(frmt)); msg.append(". 
The known formats are: "); msg.append(Util::int2string(FRMT_POS)); msg.append(" and "); msg.append(Util::int2string(FRMT_BED)); + msg.append(" and "); + msg.append(Util::int2string(FRMT_ONE)); msg.append("."); throw InvalidInputException(msg); } @@ -334,6 +336,13 @@ void Scanner::printIndex(string outputFile, bool canAppend, int frmt) { outIndex << ((int) (regionList->at(j)->getEnd() + 1)); outIndex << endl; } + } else if(frmt == FRMT_ONE){ + for (unsigned int j = 0; j < regionList->size(); j++) { + outIndex << header.substr(1) << "\t"; + outIndex << ((int) (regionList->at(j)->getStart() +1)) << "\t"; + outIndex << ((int) (regionList->at(j)->getEnd() + 1)); + outIndex << endl; + } } outIndex.close(); diff --git a/src_2.0/nonltr/Scanner.h b/src_2.0/nonltr/Scanner.h index e5ecb60..60c3ce4 100644 --- a/src_2.0/nonltr/Scanner.h +++ b/src_2.0/nonltr/Scanner.h @@ -54,6 +54,7 @@ class Scanner { public: static const int FRMT_POS = 1; static const int FRMT_BED = 2; + static const int FRMT_ONE = 3; Scanner(HMM *, int, ChromosomeOneDigit *, string); Scanner(HMM *, int, ChromosomeOneDigit *, ITableView *);