BioinformaticsToolsmith · brunocontrerasmoreira · Sep 9, 2020 · Sep 9, 2020 · Sep 9, 2020 · Sep 9, 2020
diff --git a/README.md b/README.md
@@ -1,21 +1,103 @@
 # Red
 Red: an intelligent, rapid, accurate tool for detecting repeats de-novo on the genomic scale. 
 
+This fork of the original at https://github.com/BioinformaticsToolsmith/Red adds usability information and allows compilation with g++-9 (e.g. on Ubuntu 20.04).
+
+
+## Compiling the source code
+
 Requirement: GNU gcc8.2 or higher. Please change the name (CXX) of the compiler in the Makefile. 
-
-Compiling the source code
 
-The following command makes the required directories: 
-> make bin
+```
+# get the code
+git clone https://github.com/brunocontrerasmoreira/Red
+
+cd Red && cd src_2.0
+
+# The following command makes the required directories: 
+make bin
+
+# The following command builds the binary, which is placed under the "bin" directory:
+make -j 4
+
+# To find the binary:
+cd ../bin
+
+# Run the binary
+./Red
+```
+
+## Example command
+
+```
+# only genome files with the .fa extension are detected properly - and they will be DELETED by Red
+cp /mnt/genome.fasta input.fa
+
+# all output directories must already exist, otherwise Red crashes with a core dump
+mkdir -p out_mask output
 
-The following command makes the binary that is located under the ``bin'' directory:
-> make 
+# Run Red on the input.fa in the current directory (.)
+Red -gnm . -msk out_mask -rpt output
+```
+
+**Beware: .fa files in the genome directory will be deleted by Red, so only place copies there. You will have to copy the genome in again before the next run.**
+
+
+## Full usage
+
+```
+This is Red (REpeat Detector) designed and developed by Hani Zakaria Girgis, PhD.
+
+Version: 2.0
+
+Argument pairs of the form: -flag value are required.
+Valid argument pairs:
+        -gnm input genome directory, required.
+                Files with ".fa" extension in this directory are used for completing the table of the adjusted counts.
+                These Files are scanned for repeats.
+        -dir directory including additional input sequences, optional.
+                Files with ".fa" extension in this directory are NOT used for completing the table.
+                These Files MUST have different names from those in the genome directory.
+                These Files are scanned for repeats.
+        -len word length equals k defining the k-mer. The default is floor(log_4(genome size)).
+        -ord order of the background Markov chain. The default is floor(k/2)-1.
+        -gau half width of the mask. The default is based on the GC content.
+                20 if the GC content > 33% and < 67%, 40 otherwise.
+        -thr the threshold score of the low adjusted scores of non-repeats. The default is 2.
+        -min the minimum number of the observed k-mers. The default is 3.
+        -tbl file where the table of the adjusted counts is written, optional.
+        -sco directory where scores are saved, optional.
+                Score files have the ".scr" extension.
+        -cnd directory where candidate regions are saved, optional.
+                Candidates files have the ".cnd" extension.
+        -rpt directory where repeats locations are saved, optional.
+                Repeats files have the ".rpt" extension.
+        -msk directory where masked sequences are saved, optional.
+                Masked sequences files have the ".msk" extension.
+        -frm the format of the output: 1 (chrName:start-end), 
+             2 (chrName start   end) or 3 (chrName      start   end).
+                Output formats 1 & 2 are zero-based, end exclusive.
+                Output format 3 is one-based, end inclusive (Ensembl).
+        -hmo file where the HMM is saved, optional.
+        -cor integer of the number of threads, optional.
+                The more threads, the higher the memory requirement.
+                The defaul is the number of cores - 1, or 1 if single core is found.
+
+Examples:
+        The following command runs Red with the defaults and generates the masked sequences.
+        Red -gnm genome_directory -msk output_directory
+
+        The following command runs Red with the defaults and generates the masked sequences and the locations of repeats.
+        Red -gnm genome_directory -msk output_directory -rpt output_directory
+```
 
-To find the binary:
-> cd ../bin
 
 Please cite the following paper:
 
 Girgis, H.Z. (2015) Red: an intelligent, rapid, accurate tool for
 detecting repeats de-novo on the genomic scale. BMC Bioinformatics,
 16, 227.
+
+Original Repo 
+
+https://github.com/BioinformaticsToolsmith/Red
diff --git a/src_2.0/Makefile b/src_2.0/Makefile
@@ -2,7 +2,8 @@
 
 # CXX = /usr/bin/c++
 # CXX = /usr/bin/g++
-CXX = g++-8
+#CXX = g++-8
+CXX = g++-9
 
 CXXFLAGS = -std=c++14 -fopenmp -O3 -g -fmessage-length=0 -Wall -fpermissive
 

diff --git a/src_2.0/RepeatsDetector.cpp b/src_2.0/RepeatsDetector.cpp
@@ -230,6 +230,8 @@ void drive(map<string, string> * const param) {
 					string ext(".rpt");
 					if (atoi(param->at(FRM_PRM).c_str()) == 2) {
 						ext = string(".bed");
+					} else if (atoi(param->at(FRM_PRM).c_str()) == 3) {
+                  ext = string(".tsv");
 					}
 					string rptFile = param->at(RPT_PRM) + Util::fileSeparator
 							+ nickName + ext;
@@ -354,9 +356,13 @@ int main(int argc, char * argv[]) {
 	message.append("\t\tMasked sequences files have the \".msk\" extension.\n");
 
 	message.append(
-			"\t-frm the format of the output: 1 (chrName:start-end) or 2 (chrName\tstart\tend).\n");
+			"\t-frm the format of the output: 1 (chrName:start-end), \n");
 	message.append(
-			"\t\tThe output format are zero based and the end is exclusive.\n");
+   		"\t     2 (chrName\tstart\tend) or 3 (chrName\tstart\tend).\n");
+	message.append(
+			"\t\tOutput formats 1 & 2 are zero-based, end exclusive.\n");
+	message.append(
+   		"\t\tOutput format 3 is one-based, end inclusive (Ensembl).\n");
 	message.append("\t-hmo file where the HMM is saved, optional.\n");
 	message.append("\t-cor integer of the number of threads, optional.\n");
 	message.append("\t\tThe more threads, the higher the memory requirement.\n");
@@ -487,10 +493,12 @@ int main(int argc, char * argv[]) {
 							Util::int2string(Scanner::FRMT_POS)));
 		} else {
 			if (atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_POS
-					&& atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_BED) {
+					&& atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_BED
+					&& atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_ONE) {
 				cerr << "The output format must be " << Scanner::FRMT_POS
 						<< " or ";
-				cerr << Scanner::FRMT_BED << ". The format received is ";
+				cerr << Scanner::FRMT_BED << " or ";
+				cerr << Scanner::FRMT_ONE << ". The format received is ";
 				cerr << param->at(FRM_PRM) << "." << endl;
 				return 1;
 			}

diff --git a/src_2.0/nonltr/Scanner.cpp b/src_2.0/nonltr/Scanner.cpp
@@ -299,13 +299,15 @@ void Scanner::printScores(string outputFile, bool canAppend) {
 
 void Scanner::printIndex(string outputFile, bool canAppend, int frmt) {
 
-	if(frmt != FRMT_POS && frmt != FRMT_BED){
+	if(frmt != FRMT_POS && frmt != FRMT_BED && frmt != FRMT_ONE){
 		string msg("Unknown output format: ");
 		msg.append(Util::int2string(frmt));
 		msg.append(". The known formats are: ");
 		msg.append(Util::int2string(FRMT_POS));
 		msg.append(" and ");
 		msg.append(Util::int2string(FRMT_BED));
+		msg.append(" and ");
+		msg.append(Util::int2string(FRMT_ONE));
 		msg.append(".");
 		throw InvalidInputException(msg);
 	}
@@ -334,6 +336,13 @@ void Scanner::printIndex(string outputFile, bool canAppend, int frmt) {
 			outIndex << ((int) (regionList->at(j)->getEnd() + 1));
 			outIndex << endl;
 		}
+	} else if(frmt == FRMT_ONE){
+		for (unsigned int j = 0; j < regionList->size(); j++) {
+			outIndex << header.substr(1) << "\t";
+			outIndex << ((int) (regionList->at(j)->getStart() +1)) << "\t";
+			outIndex << ((int) (regionList->at(j)->getEnd() + 1));
+			outIndex << endl;
+		}
 	}
 
 	outIndex.close();

diff --git a/src_2.0/nonltr/Scanner.h b/src_2.0/nonltr/Scanner.h
@@ -54,6 +54,7 @@ class Scanner {
 public:
 	static const int FRMT_POS = 1;
 	static const int FRMT_BED = 2;
+	static const int FRMT_ONE = 3;   
 
 	Scanner(HMM *, int, ChromosomeOneDigit *, string);
 	Scanner(HMM *, int, ChromosomeOneDigit *, ITableView<unsigned long, int> *);