Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Parameters::Parameters():
PARAM_ALPH_SIZE(PARAM_ALPH_SIZE_ID, "--alph-size", "Alphabet size", "Alphabet size (range 2-21)", typeid(MultiParam<NuclAA<int>>), (void *) &alphabetSize, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
PARAM_MAX_SEQ_LEN(PARAM_MAX_SEQ_LEN_ID, "--max-seq-len", "Max sequence length", "Maximum sequence length", typeid(size_t), (void *) &maxSeqLen, "^[0-9]{1}[0-9]*", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT),
PARAM_DIAGONAL_SCORING(PARAM_DIAGONAL_SCORING_ID, "--diag-score", "Diagonal scoring", "Use ungapped diagonal scoring during prefilter", typeid(bool), (void *) &diagonalScoring, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
PARAM_USE_AUX_SCORING(PARAM_USE_AUX_SCORING_ID, "--aux-score", "Auxiliary scoring", "Use auxiliary sequence scoring during prefilter", typeid(bool), (void *) &useAuxScoring, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
PARAM_EXACT_KMER_MATCHING(PARAM_EXACT_KMER_MATCHING_ID, "--exact-kmer-matching", "Exact k-mer matching", "Extract only exact k-mers for matching (range 0-1)", typeid(int), (void *) &exactKmerMatching, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
PARAM_MASK_RESIDUES(PARAM_MASK_RESIDUES_ID, "--mask", "Mask residues", "Mask sequences in prefilter stage with tantan: 0: w/o low complexity masking, 1: with low complexity masking", typeid(int), (void *) &maskMode, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
PARAM_MASK_PROBABILTY(PARAM_MASK_PROBABILTY_ID, "--mask-prob", "Mask residues probability", "Mask sequences is probablity is above threshold", typeid(float), (void *) &maskProb, "^0(\\.[0-9]+)?|^1(\\.0+)?$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
Expand Down Expand Up @@ -481,6 +482,7 @@ Parameters::Parameters():
prefilter.push_back(&PARAM_NO_COMP_BIAS_CORR);
prefilter.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
prefilter.push_back(&PARAM_DIAGONAL_SCORING);
prefilter.push_back(&PARAM_USE_AUX_SCORING);
prefilter.push_back(&PARAM_EXACT_KMER_MATCHING);
prefilter.push_back(&PARAM_MASK_RESIDUES);
prefilter.push_back(&PARAM_MASK_PROBABILTY);
Expand Down Expand Up @@ -2506,6 +2508,7 @@ void Parameters::setDefaults() {
compBiasCorrection = 1;
compBiasCorrectionScale = 1.0;
diagonalScoring = true;
useAuxScoring = true;
exactKmerMatching = 0;
maskMode = 1;
maskProb = 0.9;
Expand Down
2 changes: 2 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,7 @@ class Parameters {
float compBiasCorrectionScale; // Aminoacid composiont correction scale factor

bool diagonalScoring; // switch diagonal scoring
bool useAuxScoring; // use auxiliary sequence scoring in prefilter
int exactKmerMatching; // only exact k-mer matching
int maskMode; // mask low complex areas
float maskProb; // mask probability
Expand Down Expand Up @@ -811,6 +812,7 @@ class Parameters {
PARAMETER(PARAM_ALPH_SIZE)
PARAMETER(PARAM_MAX_SEQ_LEN)
PARAMETER(PARAM_DIAGONAL_SCORING)
PARAMETER(PARAM_USE_AUX_SCORING)
PARAMETER(PARAM_EXACT_KMER_MATCHING)
PARAMETER(PARAM_MASK_RESIDUES)
PARAMETER(PARAM_MASK_PROBABILTY)
Expand Down
8 changes: 5 additions & 3 deletions src/prefiltering/Prefiltering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,13 @@ Prefiltering::Prefiltering(const std::string &queryDB,
}

// Detect extended dbtype and load auxiliary matrix for dual ungapped scoring
// Both query and target must have aux data for dual scoring
ungappedSubMatAux = NULL;
const Sequence::SeqAuxInfo *auxInfo = Sequence::getAuxInfo(targetSeqType);
if (auxInfo != NULL && auxInfo->auxMatData != NULL) {
const Sequence::SeqAuxInfo *auxInfoTarget = Sequence::getAuxInfo(targetSeqType);
const Sequence::SeqAuxInfo *auxInfoQuery = Sequence::getAuxInfo(querySeqType);
if (par.useAuxScoring && auxInfoTarget != NULL && auxInfoQuery != NULL && auxInfoTarget->auxMatData != NULL) {
std::string matName("aux.out");
std::string matData(reinterpret_cast<const char*>(auxInfo->auxMatData), auxInfo->auxMatDataLen);
std::string matData(reinterpret_cast<const char*>(auxInfoTarget->auxMatData), auxInfoTarget->auxMatDataLen);
char *serialized = BaseMatrix::serialize(matName, matData);
ungappedSubMatAux = new SubstitutionMatrix(serialized, 2.0, -0.2f);
free(serialized);
Expand Down
52 changes: 44 additions & 8 deletions src/prefiltering/QueryMatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,15 +202,51 @@ std::pair<hit_t*, size_t> QueryMatcher::matchQuery(Sequence *querySeq, unsigned
queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(resultReadPos, elementsCntAboveDiagonalThr, identityId, diagonalThr, ungappedAlignment, false);
}
}else{
size_t resultPos = 0;
for (size_t i = 0; i < resultSize; i++) {
resultReadPos[resultPos].id = resultReadPos[i].id;
resultReadPos[resultPos].diagonal = resultReadPos[i].diagonal;
resultReadPos[resultPos].count = resultReadPos[i].count;
resultPos += (resultReadPos[i].count >= diagonalThr);
unsigned int maxDiagonalScoreThr = (UCHAR_MAX - ungappedAlignment->getQueryBias());
bool scoreIsTruncated = (diagonalThr >= maxDiagonalScoreThr);
size_t availableSpace = foundDiagonalsSize - (resultReadPos - foundDiagonals);
bool rescored = false;

if (scoreIsTruncated) {
// Count truncated-score hits from already-computed scoreSizes
size_t truncatedCount = 0;
for (unsigned int s = maxDiagonalScoreThr; s < SCORE_RANGE; s++) {
truncatedCount += scoreSizes[s];
}

if (truncatedCount * 2 <= availableSpace) {
// Filter in-place to only truncated-score hits
size_t filteredSize = 0;
for (size_t i = 0; i < resultSize; i++) {
if (resultReadPos[i].count >= maxDiagonalScoreThr) {
resultReadPos[filteredSize] = resultReadPos[i];
filteredSize++;
}
}
// Now we have room for double-buffer rescoring
resultWritePos = resultReadPos + filteredSize;
memset(scoreSizes, 0, SCORE_RANGE * sizeof(unsigned int));
std::pair<size_t, unsigned int> rescoreResult = rescoreHits(querySeq, scoreSizes, resultReadPos, filteredSize, ungappedAlignment, maxDiagonalScoreThr);
size_t newResultSize = rescoreResult.first;
unsigned int maxSelfScoreMinusDiag = rescoreResult.second;
size_t elementsCntAboveDiagonalThr = radixSortByScoreSize(scoreSizes, resultWritePos, 0, resultReadPos, newResultSize);
std::swap(resultReadPos, resultWritePos);
queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(resultReadPos, elementsCntAboveDiagonalThr, identityId, 0, ungappedAlignment, maxSelfScoreMinusDiag);
rescored = true;
}
}

if (!rescored) {
size_t resultPos = 0;
for (size_t i = 0; i < resultSize; i++) {
resultReadPos[resultPos].id = resultReadPos[i].id;
resultReadPos[resultPos].diagonal = resultReadPos[i].diagonal;
resultReadPos[resultPos].count = resultReadPos[i].count;
resultPos += (resultReadPos[i].count >= diagonalThr);
}
SORT_SERIAL(resultReadPos, resultReadPos + resultPos, CounterResult::sortScore);
queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(resultReadPos, resultPos, identityId, diagonalThr, ungappedAlignment, false);
}
SORT_SERIAL(resultReadPos, resultReadPos + resultPos, CounterResult::sortScore);
queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(resultReadPos, resultPos, identityId, diagonalThr, ungappedAlignment, false);
}
}else{
unsigned int thr = computeScoreThreshold(scoreSizes, this->maxHitsPerQuery);
Expand Down
57 changes: 55 additions & 2 deletions src/util/convert2fasta.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,16 @@

#include <cstring>
#include <cstdio>
#include <string>
#include <vector>

#include "Parameters.h"
#include "DBReader.h"
#include "Debug.h"
#include "Util.h"
#include "FileUtil.h"
#include "Sequence.h"
#include "SubstitutionMatrix.h"

const char headerStart[] = {'>'};
const char newline[] = {'\n'};
Expand All @@ -31,13 +35,35 @@ int convert2fasta(int argc, const char **argv, const Command& command) {
EXIT(EXIT_FAILURE);
}

// Check for combined primary+aux format
const Sequence::SeqAuxInfo* auxInfo = Sequence::getAuxInfo(db.getDbtype());
FILE* auxFastaFP = NULL;
std::string auxPath;
SubstitutionMatrix* subMat = NULL;
if (auxInfo != NULL) {
const std::string ext = ".fasta";
if (par.db2.length() >= ext.length() &&
par.db2.substr(par.db2.length() - ext.length()) == ext) {
auxPath = par.db2.substr(0, par.db2.length() - ext.length()) + ".aux.fasta";
} else {
auxPath = par.db2 + ".aux";
}
auxFastaFP = fopen(auxPath.c_str(), "w");
if (auxFastaFP == NULL) {
perror(auxPath.c_str());
fclose(fastaFP);
EXIT(EXIT_FAILURE);
}
subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.1, 0.0);
}

DBReader<unsigned int>* from = &db;
if(par.useHeaderFile) {
from = &db_header;
}

Debug(Debug::INFO) << "Start writing file to " << par.db2 << "\n";
std::vector<char> decodedBuf;
for(size_t i = 0; i < from->getSize(); i++){
unsigned int key = from->getDbKey(i);
unsigned int headerKey = db_header.getId(key);
Expand All @@ -51,13 +77,40 @@ int convert2fasta(int argc, const char **argv, const Command& command) {
unsigned int bodyKey = db.getId(key);
const char* bodyData = db.getData(bodyKey, 0);
const size_t bodyLen = db.getEntryLen(bodyKey);
fwrite(bodyData, sizeof(char), bodyLen - 2, fastaFP);
fwrite(newline, sizeof(char), 1, fastaFP);
size_t seqLen = bodyLen - 2;

if (auxInfo != NULL) {
if (decodedBuf.size() < seqLen) {
decodedBuf.resize(seqLen);
}
for (size_t j = 0; j < seqLen; j++) {
decodedBuf[j] = subMat->num2aa[auxInfo->primaryRemap[(unsigned char)bodyData[j]]];
}
fwrite(decodedBuf.data(), sizeof(char), seqLen, fastaFP);
fwrite(newline, sizeof(char), 1, fastaFP);

fwrite(headerStart, sizeof(char), 1, auxFastaFP);
fwrite(headerData, sizeof(char), headerLen - 2, auxFastaFP);
fwrite(newline, sizeof(char), 1, auxFastaFP);
for (size_t j = 0; j < seqLen; j++) {
decodedBuf[j] = subMat->num2aa[auxInfo->auxRemap[(unsigned char)bodyData[j]]];
}
fwrite(decodedBuf.data(), sizeof(char), seqLen, auxFastaFP);
fwrite(newline, sizeof(char), 1, auxFastaFP);
} else {
fwrite(bodyData, sizeof(char), seqLen, fastaFP);
fwrite(newline, sizeof(char), 1, fastaFP);
}
}
if (fclose(fastaFP) != 0) {
Debug(Debug::ERROR) << "Cannot close file " << par.db2 << "\n";
EXIT(EXIT_FAILURE);
}
if (auxFastaFP != NULL && fclose(auxFastaFP) != 0) {
Debug(Debug::ERROR) << "Cannot close file " << auxPath << "\n";
EXIT(EXIT_FAILURE);
}
delete subMat;
db_header.close();
db.close();

Expand Down