Skip to content

Commit 4c9ed4a

Browse files
repairing indexing of root file
Signed-off-by: AdityaPandeyCN <adityapand3y666@gmail.com>
1 parent 9028ee7 commit 4c9ed4a

File tree

12 files changed

+304
-144
lines changed

12 files changed

+304
-144
lines changed

CMakeLists.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,6 @@ if(RAMTOOLS_BUILD_TOOLS)
7474
add_subdirectory(tools)
7575
endif()
7676

77-
# Include benchmark directory if benchmarks OR tests are enabled
78-
# (sam_generator is needed by tests)
7977
if(RAMTOOLS_BUILD_BENCHMARKS OR RAMTOOLS_BUILD_TESTS)
8078
add_subdirectory(benchmark)
8179
endif()

README.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,24 @@ Tested with HG00154 sample from the 1000 Genomes Project (196M reads, 72GB SAM f
102102
- LZ4 compression provides the best query performance among all compression algorithms
103103
- For a 100Mb region query: RNTuple processes **453,736 reads/sec** vs TTree+ZLIB's **197,815 reads/sec**
104104

105+
### RNTuple vs. CRAM (samtools) Region Queries
106+
107+
Measured on `HG00096.cram` (GRCh37 reference, samtools `view -c -F 2308`) versus the converted RAM RNTuple produced by `samtoramntuple`. Benchmarks run on an Intel i5-8250U laptop with SSD storage.
108+
109+
| Query (GRCh37) | Reads | RNTuple `ramntupleview` Wall (s) | samtools (CRAM) Wall (s) | Relative Speed (`CRAM / RNTuple`) |
110+
|----------------|-------|----------------------------------|---------------------------|-----------------------------------|
111+
| chr1:1,000,000-1,001,000 | 6 | 12.6 | 0.42 | 0.03× |
112+
| chr1:1,000,000-2,000,000 | 41,608 | 5.65 | 0.31 | 0.06× |
113+
| chr1:1-50,000,000 | 2,921,678 | 6.52 | 9.41 | **1.44×** |
114+
| chr2:1-100,000,000 | 6,879,654 | 7.75 | 20.19 | **2.61×** |
115+
| chr7:50,000,000-150,000,000 | 6,698,314 | 7.96 | 17.80 | **2.24×** |
116+
| chr21:1-48,129,895 | 2,510,708 | 6.58 | 8.29 | **1.26×** |
117+
118+
**Interpretation**
119+
- RNTuple has higher constant overhead for tiny windows (dozens of reads); CRAM via samtools remains faster for ad‑hoc lookups.
120+
- For broad regions that touch millions of records, the columnar layout and sparse index give RNTuple a **1.3×–2.6× throughput advantage** over samtools.
121+
- Large windows (≥50 Mb) saturate sequential access on both formats; RNTuple’s page-level prefetch keeps bandwidth high while retaining exact read counts matching samtools.
122+
105123
## TTree Implementation (Legacy)
106124

107125
ROOT scripts to convert a SAM file to a RAM (ROOT Alignment/Map) file using the older TTree format and to work with those files.
Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,20 @@
11
#include <benchmark/benchmark.h>
22
#include "generate_sam_benchmark.h"
3-
#include <cstdio> // For std::remove
3+
#include <cstdio>
44

5-
static void BM_GenerateSAM(benchmark::State &state)
6-
{
7-
int num_reads = state.range(0);
8-
for (auto _ : state) {
9-
GenerateSAMFile("benchmark_temp.sam", num_reads);
10-
std::remove("benchmark_temp.sam");
11-
}
5+
static void BM_GenerateSAM(benchmark::State& state) {
6+
int num_reads = state.range(0);
7+
for (auto _ : state) {
8+
GenerateSAMFile("benchmark_temp.sam", num_reads);
9+
std::remove("benchmark_temp.sam");
10+
}
1211

13-
state.counters["reads_per_second"] = benchmark::Counter(num_reads, benchmark::Counter::kIsRate);
14-
state.counters["bytes_per_second"] = benchmark::Counter(num_reads * 200, benchmark::Counter::kIsRate);
12+
state.counters["reads_per_second"] = benchmark::Counter(
13+
num_reads, benchmark::Counter::kIsRate);
14+
state.counters["bytes_per_second"] = benchmark::Counter(
15+
num_reads * 200, benchmark::Counter::kIsRate);
1516
}
1617

1718
BENCHMARK(BM_GenerateSAM)->Range(100, 10000);
1819
BENCHMARK_MAIN();
20+

benchmark/region_query_benchmark.cxx

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ class RegionQueryFixture : public benchmark::Fixture {
2020

2121
sam_file_ = "/media/aditya/213e0e46-6f86-4288-8b79-74851c34314f/output_big.sam";
2222
ttree_root_file_ = "/media/aditya/213e0e46-6f86-4288-8b79-74851c34314f/output_big_lzma.root";
23-
rntuple_root_file_ = "/media/aditya/213e0e46-6f86-4288-8b79-74851c34314f/output_root.root";
23+
rntuple_root_file_ = "/home/aditya/ramtools/build/rntuple.root";
2424
}
2525

2626
void TearDown(const benchmark::State &) override {}
@@ -39,24 +39,24 @@ class RegionQueryFixture : public benchmark::Fixture {
3939
const char *get_current_region() const { return regions_[region_idx_ % regions_.size()].c_str(); }
4040
};
4141

42-
const std::vector<std::string> RegionQueryFixture::regions_ = {"chr1:1000000-1001000",
43-
"chr2:5000000-5010000",
44-
"chrX:100000-150000",
45-
"chr1:1000000-2000000",
46-
"chr5:10000000-15000000",
47-
"chr10:50000000-60000000",
48-
"chr1:1-50000000",
49-
"chr2:1-100000000",
50-
"chr7:50000000-150000000",
51-
"chr21:1-48129895",
52-
"chrM:1-16571",
53-
"chrY:2600000-2700000",
42+
const std::vector<std::string> RegionQueryFixture::regions_ = {"1:1000000-1001000",
43+
"2:5000000-5010000",
44+
"X:100000-150000",
45+
"1:1000000-2000000",
46+
"5:10000000-15000000",
47+
"10:50000000-60000000",
48+
"1:1-50000000",
49+
"2:1-100000000",
50+
"7:50000000-150000000",
51+
"21:1-48129895",
52+
"MT:1-16571",
53+
"Y:2600000-2700000",
5454
"GL000227.1:1-100000",
55-
"chr1:1-1000",
56-
"chr1:249250621-249250621",
57-
"chr22:51304566-51304566",
58-
"chr17:41196312-41277500",
59-
"chr13:32889611-32973805"};
55+
"1:1-1000",
56+
"1:249250621-249250621",
57+
"22:51304566-51304566",
58+
"17:41196312-41277500",
59+
"13:32889611-32973805"};
6060

6161
BENCHMARK_DEFINE_F(RegionQueryFixture, TTree)(benchmark::State &state)
6262
{
@@ -96,8 +96,8 @@ BENCHMARK_DEFINE_F(RegionQueryFixture, RNTuple)(benchmark::State &state)
9696
state.SetLabel(std::to_string(reads_in_this_run) + " reads");
9797
}
9898

99-
BENCHMARK_REGISTER_F(RegionQueryFixture, TTree)->Args({0})->Args({3})->Args({6})->Args({9})->Unit(benchmark::kSecond);
100-
101-
BENCHMARK_REGISTER_F(RegionQueryFixture, RNTuple)->Args({0})->Args({3})->Args({6})->Args({9})->Unit(benchmark::kSecond);
99+
BENCHMARK_REGISTER_F(RegionQueryFixture, TTree)->DenseRange(0, 17, 1)->Unit(benchmark::kSecond);
100+
BENCHMARK_REGISTER_F(RegionQueryFixture, RNTuple)->DenseRange(0, 17, 1)->Unit(benchmark::kSecond);
102101

103102
BENCHMARK_MAIN();
103+

inc/ramcore/SamToNTuple.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include <cstdint>
55
#include <string>
6+
#include <vector>
67

78
void samtoramntuple(const char *datafile,
89
const char *treefile,
@@ -11,6 +12,8 @@ void samtoramntuple(const char *datafile,
1112
uint32_t quality_policy);
1213

1314
void samtoramntuple_split_by_chromosome(const char *datafile, const char *output_prefix, int compression_algorithm,
14-
uint32_t quality_policy, int num_threads = 4);
15+
uint32_t quality_policy, int num_threads = 4,
16+
const std::vector<std::string> &only_chromosomes = {});
1517

1618
#endif
19+

src/ramcore/RAMNTupleView.cxx

Lines changed: 126 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "ramcore/RAMNTupleView.h"
22
#include <iostream>
33
#include <cstring>
4+
#include <limits>
45
#include <ROOT/RNTuple.hxx>
56
#include <ROOT/RNTupleReader.hxx>
67
#include <ROOT/RNTupleView.hxx>
@@ -19,66 +20,152 @@ Long64_t ramntupleview(const char *file, const char *query, bool cache, bool per
1920
printf("ramntupleview: failed to open file %s\n", file);
2021
return 0;
2122
}
22-
23+
2324
std::string region = query;
25+
if (region.empty() || region == "*") {
26+
stopwatch.Print();
27+
return reader->GetNEntries();
28+
}
29+
30+
TString rname;
31+
Int_t range_start = 1;
32+
Int_t range_end = std::numeric_limits<Int_t>::max();
33+
2434
int chrDelimiterPos = region.find(":");
2535
if (chrDelimiterPos == std::string::npos) {
26-
std::cerr << "Invalid region format. Use rname:start-end\n";
27-
return 0;
28-
}
29-
TString rname = region.substr(0, chrDelimiterPos);
30-
int rangeDelimiterPos = region.find("-", chrDelimiterPos);
31-
if (rangeDelimiterPos == std::string::npos) {
32-
std::cerr << "Invalid region format. Use rname:start-end\n";
33-
return 0;
36+
rname = region;
37+
} else {
38+
rname = region.substr(0, chrDelimiterPos);
39+
int rangeDelimiterPos = region.find("-", chrDelimiterPos);
40+
if (rangeDelimiterPos == std::string::npos) {
41+
try {
42+
range_start = std::stoi(region.substr(chrDelimiterPos + 1));
43+
range_end = range_start;
44+
} catch (...) {
45+
std::cerr << "Invalid region format. Use rname:start-end or rname:position\n";
46+
return 0;
47+
}
48+
} else {
49+
range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos - 1));
50+
range_end = std::stoi(region.substr(rangeDelimiterPos + 1));
51+
}
3452
}
35-
Int_t range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos - 1));
36-
Int_t range_end = std::stoi(region.substr(rangeDelimiterPos + 1));
3753

38-
auto refid = RAMNTupleRecord::GetRnameRefs()->GetRefId(rname.Data());
54+
auto refs = RAMNTupleRecord::GetRnameRefs();
55+
auto refid = refs->GetRefId(rname.Data());
56+
57+
if (refid < 0) {
58+
if (rname.BeginsWith("chr")) {
59+
TString stripped_rname = rname(3, rname.Length() - 3);
60+
refid = refs->GetRefId(stripped_rname.Data());
61+
}
62+
if (refid < 0 && (rname == "chrM" || rname == "M")) {
63+
refid = refs->GetRefId("MT");
64+
}
65+
}
66+
67+
if (refid < 0) {
68+
std::cerr << "Error: Reference name '" << rname.Data() << "' not found in file.\n";
69+
return 0;
70+
}
71+
3972
auto index = RAMNTupleRecord::GetIndex();
40-
41-
auto recordView = reader->GetView<RAMNTupleRecord>("record");
73+
auto flagView = reader->GetView<uint16_t>("record.flag");
74+
auto refidView = reader->GetView<int32_t>("record.refid");
75+
auto posView = reader->GetView<int32_t>("record.pos");
76+
auto cigarView = reader->GetView<std::vector<uint32_t>>("record.cigar");
4277

4378
Long64_t count = 0;
79+
const Long64_t totalEntries = reader->GetNEntries();
80+
81+
const int FLAG_FILTER = 0x904;
82+
const Int_t rs0 = (range_start > 0) ? (range_start - 1) : 0;
83+
const Int_t re0 = (range_end > 0) ? (range_end - 1) : 0;
84+
constexpr int kMaxRefSpanHeuristic = 10000;
4485

45-
if (!index || index->Size() == 0) {
86+
auto computeRefSpan = [](const std::vector<uint32_t> &cigarOps) -> int {
87+
int span = 0;
88+
for (uint32_t op : cigarOps) {
89+
int len = static_cast<int>(op >> 4);
90+
int code = static_cast<int>(op & 0xF);
91+
92+
if (code == 0 || code == 2 || code == 3 || code == 7 || code == 8) {
93+
span += len;
94+
}
95+
}
96+
return span;
97+
};
4698

99+
if (!index || index->Size() == 0) {
100+
101+
bool seenRef = false;
47102
for (auto i : reader->GetEntryRange()) {
48-
const auto &rec = recordView(i);
49-
if (rec.refid == refid && rec.pos >= range_start - 1 && rec.pos <= range_end - 1) {
103+
const auto flag = flagView(i);
104+
if (flag & FLAG_FILTER) continue;
105+
106+
const auto curRef = refidView(i);
107+
if (curRef == refid) {
108+
seenRef = true;
109+
} else {
110+
if (seenRef && curRef > refid) break;
111+
continue;
112+
}
113+
const auto curPos = posView(i);
114+
if (curPos > re0) break;
115+
116+
int read_start = curPos;
117+
if (read_start >= rs0) {
118+
count++;
119+
continue;
120+
}
121+
if (read_start + kMaxRefSpanHeuristic < rs0) {
122+
continue;
123+
}
124+
const auto &cigarOps = cigarView(i);
125+
int refSpan = computeRefSpan(cigarOps);
126+
int read_end = (refSpan > 0) ? (read_start + refSpan - 1) : read_start;
127+
128+
if (read_start <= re0 && read_end >= rs0) {
50129
count++;
51130
}
52131
}
53132
} else {
133+
134+
auto start_entry = index->GetRow(refid, rs0);
135+
136+
if (start_entry < 0) start_entry = index->GetRow(refid, 0);
137+
if (start_entry < 0) start_entry = 0;
54138

55-
auto start_entry = index->GetRow(refid, range_start);
56-
auto end_entry = index->GetRow(refid, range_end);
57-
if (start_entry < 0)
58-
start_entry = 0;
59-
if (end_entry < 0)
60-
end_entry = reader->GetNEntries();
61-
62-
for (; start_entry < end_entry; start_entry++) {
63-
const auto &rec = recordView(start_entry);
64-
int seqlen = rec.GetSEQLEN();
65-
if (rec.pos + seqlen > range_start - 1) {
66-
break;
139+
for (Long64_t j = start_entry; j < totalEntries; j++) {
140+
const auto flag = flagView(j);
141+
if (flag & FLAG_FILTER) continue;
142+
143+
const auto curRef = refidView(j);
144+
if (curRef < refid) continue;
145+
if (curRef > refid) break;
146+
147+
const auto curPos = posView(j);
148+
if (curPos > re0) break;
149+
150+
int read_start = curPos;
151+
if (read_start >= rs0) {
152+
count++;
153+
continue;
67154
}
68-
}
69-
70-
Long64_t j;
71-
for (j = start_entry; j < reader->GetNEntries(); j++) {
72-
const auto &rec = recordView(j);
73-
74-
if (rec.pos >= range_end) {
75-
break;
155+
if (read_start + kMaxRefSpanHeuristic < rs0) {
156+
continue;
157+
}
158+
const auto &cigarOps = cigarView(j);
159+
int refSpan = computeRefSpan(cigarOps);
160+
int read_end = (refSpan > 0) ? (read_start + refSpan - 1) : read_start;
161+
162+
if (read_end >= rs0) {
163+
count++;
76164
}
77-
count++;
78165
}
79166
}
80167

81168
stopwatch.Print();
82-
83169
return count;
84170
}
171+

0 commit comments

Comments
 (0)