From 740db4c460d696f2d38591713e9b111b2b43f925 Mon Sep 17 00:00:00 2001 From: mvankem Date: Tue, 10 Mar 2026 00:03:00 +0100 Subject: [PATCH 1/5] convert2fasta handles databases with aux data --- src/util/convert2fasta.cpp | 57 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/src/util/convert2fasta.cpp b/src/util/convert2fasta.cpp index b70ecc7f4..72231906a 100644 --- a/src/util/convert2fasta.cpp +++ b/src/util/convert2fasta.cpp @@ -5,12 +5,16 @@ #include #include +#include +#include #include "Parameters.h" #include "DBReader.h" #include "Debug.h" #include "Util.h" #include "FileUtil.h" +#include "Sequence.h" +#include "SubstitutionMatrix.h" const char headerStart[] = {'>'}; const char newline[] = {'\n'}; @@ -31,6 +35,27 @@ int convert2fasta(int argc, const char **argv, const Command& command) { EXIT(EXIT_FAILURE); } + // Check for combined primary+aux format + const Sequence::SeqAuxInfo* auxInfo = Sequence::getAuxInfo(db.getDbtype()); + FILE* auxFastaFP = NULL; + std::string auxPath; + SubstitutionMatrix* subMat = NULL; + if (auxInfo != NULL) { + const std::string ext = ".fasta"; + if (par.db2.length() >= ext.length() && + par.db2.substr(par.db2.length() - ext.length()) == ext) { + auxPath = par.db2.substr(0, par.db2.length() - ext.length()) + ".aux.fasta"; + } else { + auxPath = par.db2 + ".aux"; + } + auxFastaFP = fopen(auxPath.c_str(), "w"); + if (auxFastaFP == NULL) { + perror(auxPath.c_str()); + fclose(fastaFP); + EXIT(EXIT_FAILURE); + } + subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.1, 0.0); + } DBReader* from = &db; if(par.useHeaderFile) { @@ -38,6 +63,7 @@ int convert2fasta(int argc, const char **argv, const Command& command) { } Debug(Debug::INFO) << "Start writing file to " << par.db2 << "\n"; + std::vector decodedBuf; for(size_t i = 0; i < from->getSize(); i++){ unsigned int key = from->getDbKey(i); unsigned int headerKey = db_header.getId(key); @@ -51,13 +77,40 @@ int convert2fasta(int argc, const char **argv, const Command& command) { unsigned int bodyKey = db.getId(key); const char* bodyData = db.getData(bodyKey, 0); const size_t bodyLen = db.getEntryLen(bodyKey); - fwrite(bodyData, sizeof(char), bodyLen - 2, fastaFP); - fwrite(newline, sizeof(char), 1, fastaFP); + size_t seqLen = bodyLen - 2; + + if (auxInfo != NULL) { + if (decodedBuf.size() < seqLen) { + decodedBuf.resize(seqLen); + } + for (size_t j = 0; j < seqLen; j++) { + decodedBuf[j] = subMat->num2aa[auxInfo->primaryRemap[(unsigned char)bodyData[j]]]; + } + fwrite(decodedBuf.data(), sizeof(char), seqLen, fastaFP); + fwrite(newline, sizeof(char), 1, fastaFP); + + fwrite(headerStart, sizeof(char), 1, auxFastaFP); + fwrite(headerData, sizeof(char), headerLen - 2, auxFastaFP); + fwrite(newline, sizeof(char), 1, auxFastaFP); + for (size_t j = 0; j < seqLen; j++) { + decodedBuf[j] = subMat->num2aa[auxInfo->auxRemap[(unsigned char)bodyData[j]]]; + } + fwrite(decodedBuf.data(), sizeof(char), seqLen, auxFastaFP); + fwrite(newline, sizeof(char), 1, auxFastaFP); + } else { + fwrite(bodyData, sizeof(char), seqLen, fastaFP); + fwrite(newline, sizeof(char), 1, fastaFP); + } } if (fclose(fastaFP) != 0) { Debug(Debug::ERROR) << "Cannot close file " << par.db2 << "\n"; EXIT(EXIT_FAILURE); } + if (auxFastaFP != NULL && fclose(auxFastaFP) != 0) { + Debug(Debug::ERROR) << "Cannot close file " << auxPath << "\n"; + EXIT(EXIT_FAILURE); + } + delete subMat; db_header.close(); db.close(); From 5e9ad3aaedfef67fd7472f2f46f2e794ba2f26b6 Mon Sep 17 00:00:00 2001 From: Michel van Kempen Date: Tue, 24 Mar 2026 03:09:03 +0100 Subject: [PATCH 2/5] QueryMatcher: Rescore only truncated-score diagonals when buffer is too full for radix sort --- src/prefiltering/QueryMatcher.cpp | 68 +++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/src/prefiltering/QueryMatcher.cpp b/src/prefiltering/QueryMatcher.cpp index 067e661c8..cc4929d5a 100644 --- a/src/prefiltering/QueryMatcher.cpp +++ b/src/prefiltering/QueryMatcher.cpp @@ -2,6 +2,7 @@ #include "QueryMatcher.h" #include "FastSort.h" #include "Util.h" +#include #define FE_1(WHAT, X) WHAT(X) #define FE_2(WHAT, X, ...) WHAT(X)FE_1(WHAT, __VA_ARGS__) @@ -202,15 +203,66 @@ std::pair QueryMatcher::matchQuery(Sequence *querySeq, unsigned queryResult = getResult(resultReadPos, elementsCntAboveDiagonalThr, identityId, diagonalThr, ungappedAlignment, false); } }else{ - size_t resultPos = 0; - for (size_t i = 0; i < resultSize; i++) { - resultReadPos[resultPos].id = resultReadPos[i].id; - resultReadPos[resultPos].diagonal = resultReadPos[i].diagonal; - resultReadPos[resultPos].count = resultReadPos[i].count; - resultPos += (resultReadPos[i].count >= diagonalThr); + unsigned int maxDiagonalScoreThr = (UCHAR_MAX - ungappedAlignment->getQueryBias()); + bool scoreIsTruncated = (diagonalThr >= maxDiagonalScoreThr); + size_t availableSpace = foundDiagonalsSize - (resultReadPos - foundDiagonals); + bool rescored = false; + + if (scoreIsTruncated) { + if (omp_get_thread_num() == 0) { + printf("Buffer-overflow branch (truncated): resultSize=%zu foundDiagonalsSize=%zu diagonalThr=%u maxDiagonalScoreThr=%u\n", + resultSize, foundDiagonalsSize, diagonalThr, maxDiagonalScoreThr); + } + // Count truncated-score hits from already-computed scoreSizes + size_t truncatedCount = 0; + for (unsigned int s = maxDiagonalScoreThr; s < SCORE_RANGE; s++) { + truncatedCount += scoreSizes[s]; + } + + if (omp_get_thread_num() == 0) { + printf("Truncated-score diagonals: %zu / %zu total diagonals (availableSpace=%zu)\n", + truncatedCount, resultSize, availableSpace); + } + + if (truncatedCount * 2 <= availableSpace) { + // Filter in-place to only truncated-score hits + size_t filteredSize = 0; + for (size_t i = 0; i < resultSize; i++) { + if (resultReadPos[i].count >= maxDiagonalScoreThr) { + resultReadPos[filteredSize] = resultReadPos[i]; + filteredSize++; + } + } + // Now we have room for double-buffer rescoring + resultWritePos = resultReadPos + filteredSize; + memset(scoreSizes, 0, SCORE_RANGE * sizeof(unsigned int)); + std::pair rescoreResult = rescoreHits(querySeq, scoreSizes, resultReadPos, filteredSize, ungappedAlignment, maxDiagonalScoreThr); + size_t newResultSize = rescoreResult.first; + unsigned int maxSelfScoreMinusDiag = rescoreResult.second; + size_t elementsCntAboveDiagonalThr = radixSortByScoreSize(scoreSizes, resultWritePos, 0, resultReadPos, newResultSize); + std::swap(resultReadPos, resultWritePos); + queryResult = getResult(resultReadPos, elementsCntAboveDiagonalThr, identityId, 0, ungappedAlignment, maxSelfScoreMinusDiag); + rescored = true; + if (omp_get_thread_num() == 0) { + printf("Filtering helped: avoided serial sort, rescored %zu truncated diagonals\n", filteredSize); + } + } + } + + if (!rescored) { + if (scoreIsTruncated && omp_get_thread_num() == 0) { + printf("Filtering did not help: truncated diagonals don't fit, falling back to serial sort (resultSize=%zu)\n", resultSize); + } + size_t resultPos = 0; + for (size_t i = 0; i < resultSize; i++) { + resultReadPos[resultPos].id = resultReadPos[i].id; + resultReadPos[resultPos].diagonal = resultReadPos[i].diagonal; + resultReadPos[resultPos].count = resultReadPos[i].count; + resultPos += (resultReadPos[i].count >= diagonalThr); + } + SORT_SERIAL(resultReadPos, resultReadPos + resultPos, CounterResult::sortScore); + queryResult = getResult(resultReadPos, resultPos, identityId, diagonalThr, ungappedAlignment, false); } - SORT_SERIAL(resultReadPos, resultReadPos + resultPos, CounterResult::sortScore); - queryResult = getResult(resultReadPos, resultPos, identityId, diagonalThr, ungappedAlignment, false); } }else{ unsigned int thr = computeScoreThreshold(scoreSizes, this->maxHitsPerQuery); From d93dca81e987a414c8e60bc142cac355140c5310 Mon Sep 17 00:00:00 2001 From: Michel van Kempen Date: Tue, 31 Mar 2026 10:55:39 +0200 Subject: [PATCH 3/5] Add --aux-score flag Optionally disable the use of aux seqs in the prefilter. --- src/commons/Parameters.cpp | 3 +++ src/commons/Parameters.h | 2 ++ src/prefiltering/Prefiltering.cpp | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 58f4c78b2..743341c8f 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -43,6 +43,7 @@ Parameters::Parameters(): PARAM_ALPH_SIZE(PARAM_ALPH_SIZE_ID, "--alph-size", "Alphabet size", "Alphabet size (range 2-21)", typeid(MultiParam>), (void *) &alphabetSize, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), PARAM_MAX_SEQ_LEN(PARAM_MAX_SEQ_LEN_ID, "--max-seq-len", "Max sequence length", "Maximum sequence length", typeid(size_t), (void *) &maxSeqLen, "^[0-9]{1}[0-9]*", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT), PARAM_DIAGONAL_SCORING(PARAM_DIAGONAL_SCORING_ID, "--diag-score", "Diagonal scoring", "Use ungapped diagonal scoring during prefilter", typeid(bool), (void *) &diagonalScoring, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), + PARAM_USE_AUX_SCORING(PARAM_USE_AUX_SCORING_ID, "--aux-score", "Auxiliary scoring", "Use auxiliary sequence scoring during prefilter", typeid(bool), (void *) &useAuxScoring, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), PARAM_EXACT_KMER_MATCHING(PARAM_EXACT_KMER_MATCHING_ID, "--exact-kmer-matching", "Exact k-mer matching", "Extract only exact k-mers for matching (range 0-1)", typeid(int), (void *) &exactKmerMatching, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), PARAM_MASK_RESIDUES(PARAM_MASK_RESIDUES_ID, "--mask", "Mask residues", "Mask sequences in prefilter stage with tantan: 0: w/o low complexity masking, 1: with low complexity masking", typeid(int), (void *) &maskMode, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), PARAM_MASK_PROBABILTY(PARAM_MASK_PROBABILTY_ID, "--mask-prob", "Mask residues probability", "Mask sequences is probablity is above threshold", typeid(float), (void *) &maskProb, "^0(\\.[0-9]+)?|^1(\\.0+)?$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), @@ -481,6 +482,7 @@ Parameters::Parameters(): prefilter.push_back(&PARAM_NO_COMP_BIAS_CORR); prefilter.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE); prefilter.push_back(&PARAM_DIAGONAL_SCORING); + prefilter.push_back(&PARAM_USE_AUX_SCORING); prefilter.push_back(&PARAM_EXACT_KMER_MATCHING); prefilter.push_back(&PARAM_MASK_RESIDUES); prefilter.push_back(&PARAM_MASK_PROBABILTY); @@ -2506,6 +2508,7 @@ void Parameters::setDefaults() { compBiasCorrection = 1; compBiasCorrectionScale = 1.0; diagonalScoring = true; + useAuxScoring = true; exactKmerMatching = 0; maskMode = 1; maskProb = 0.9; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index c4c9da434..e49098ae2 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -428,6 +428,7 @@ class Parameters { float compBiasCorrectionScale; // Aminoacid composiont correction scale factor bool diagonalScoring; // switch diagonal scoring + bool useAuxScoring; // use auxiliary sequence scoring in prefilter int exactKmerMatching; // only exact k-mer matching int maskMode; // mask low complex areas float maskProb; // mask probability @@ -811,6 +812,7 @@ class Parameters { PARAMETER(PARAM_ALPH_SIZE) PARAMETER(PARAM_MAX_SEQ_LEN) PARAMETER(PARAM_DIAGONAL_SCORING) + PARAMETER(PARAM_USE_AUX_SCORING) PARAMETER(PARAM_EXACT_KMER_MATCHING) PARAMETER(PARAM_MASK_RESIDUES) PARAMETER(PARAM_MASK_PROBABILTY) diff --git a/src/prefiltering/Prefiltering.cpp b/src/prefiltering/Prefiltering.cpp index 92d957f34..32babf099 100644 --- a/src/prefiltering/Prefiltering.cpp +++ b/src/prefiltering/Prefiltering.cpp @@ -83,7 +83,7 @@ Prefiltering::Prefiltering(const std::string &queryDB, // Detect extended dbtype and load auxiliary matrix for dual ungapped scoring ungappedSubMatAux = NULL; const Sequence::SeqAuxInfo *auxInfo = Sequence::getAuxInfo(targetSeqType); - if (auxInfo != NULL && auxInfo->auxMatData != NULL) { + if (par.useAuxScoring && auxInfo != NULL && auxInfo->auxMatData != NULL) { std::string matName("aux.out"); std::string matData(reinterpret_cast(auxInfo->auxMatData), auxInfo->auxMatDataLen); char *serialized = BaseMatrix::serialize(matName, matData); From eccd9bd96f00e3eabec828aa042166954c6d8809 Mon Sep 17 00:00:00 2001 From: Michel van Kempen Date: Mon, 6 Apr 2026 11:49:32 +0900 Subject: [PATCH 4/5] Fix prefiltering query without aux data against targets with aux data --- src/prefiltering/Prefiltering.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/prefiltering/Prefiltering.cpp b/src/prefiltering/Prefiltering.cpp index 32babf099..7f1dde701 100644 --- a/src/prefiltering/Prefiltering.cpp +++ b/src/prefiltering/Prefiltering.cpp @@ -81,11 +81,13 @@ Prefiltering::Prefiltering(const std::string &queryDB, } // Detect extended dbtype and load auxiliary matrix for dual ungapped scoring + // Both query and target must have aux data for dual scoring ungappedSubMatAux = NULL; - const Sequence::SeqAuxInfo *auxInfo = Sequence::getAuxInfo(targetSeqType); - if (par.useAuxScoring && auxInfo != NULL && auxInfo->auxMatData != NULL) { + const Sequence::SeqAuxInfo *auxInfoTarget = Sequence::getAuxInfo(targetSeqType); + const Sequence::SeqAuxInfo *auxInfoQuery = Sequence::getAuxInfo(querySeqType); + if (par.useAuxScoring && auxInfoTarget != NULL && auxInfoQuery != NULL && auxInfoTarget->auxMatData != NULL) { std::string matName("aux.out"); - std::string matData(reinterpret_cast(auxInfo->auxMatData), auxInfo->auxMatDataLen); + std::string matData(reinterpret_cast(auxInfoTarget->auxMatData), auxInfoTarget->auxMatDataLen); char *serialized = BaseMatrix::serialize(matName, matData); ungappedSubMatAux = new SubstitutionMatrix(serialized, 2.0, -0.2f); free(serialized); From 9adeedc3ed433fd6a9f95c05a61ad1257e97ab45 Mon Sep 17 00:00:00 2001 From: Michel van Kempen Date: Thu, 23 Apr 2026 08:31:59 +0200 Subject: [PATCH 5/5] QueryMatcher: Remove debug logging from truncated-score rescoring --- src/prefiltering/QueryMatcher.cpp | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/prefiltering/QueryMatcher.cpp b/src/prefiltering/QueryMatcher.cpp index cc4929d5a..6636ae03d 100644 --- a/src/prefiltering/QueryMatcher.cpp +++ b/src/prefiltering/QueryMatcher.cpp @@ -2,7 +2,6 @@ #include "QueryMatcher.h" #include "FastSort.h" #include "Util.h" -#include #define FE_1(WHAT, X) WHAT(X) #define FE_2(WHAT, X, ...) WHAT(X)FE_1(WHAT, __VA_ARGS__) @@ -209,21 +208,12 @@ std::pair QueryMatcher::matchQuery(Sequence *querySeq, unsigned bool rescored = false; if (scoreIsTruncated) { - if (omp_get_thread_num() == 0) { - printf("Buffer-overflow branch (truncated): resultSize=%zu foundDiagonalsSize=%zu diagonalThr=%u maxDiagonalScoreThr=%u\n", - resultSize, foundDiagonalsSize, diagonalThr, maxDiagonalScoreThr); - } // Count truncated-score hits from already-computed scoreSizes size_t truncatedCount = 0; for (unsigned int s = maxDiagonalScoreThr; s < SCORE_RANGE; s++) { truncatedCount += scoreSizes[s]; } - if (omp_get_thread_num() == 0) { - printf("Truncated-score diagonals: %zu / %zu total diagonals (availableSpace=%zu)\n", - truncatedCount, resultSize, availableSpace); - } - if (truncatedCount * 2 <= availableSpace) { // Filter in-place to only truncated-score hits size_t filteredSize = 0; @@ -243,16 +233,10 @@ std::pair QueryMatcher::matchQuery(Sequence *querySeq, unsigned std::swap(resultReadPos, resultWritePos); queryResult = getResult(resultReadPos, elementsCntAboveDiagonalThr, identityId, 0, ungappedAlignment, maxSelfScoreMinusDiag); rescored = true; - if (omp_get_thread_num() == 0) { - printf("Filtering helped: avoided serial sort, rescored %zu truncated diagonals\n", filteredSize); - } } } if (!rescored) { - if (scoreIsTruncated && omp_get_thread_num() == 0) { - printf("Filtering did not help: truncated diagonals don't fit, falling back to serial sort (resultSize=%zu)\n", resultSize); - } size_t resultPos = 0; for (size_t i = 0; i < resultSize; i++) { resultReadPos[resultPos].id = resultReadPos[i].id;