
Commit 2b9ac28

update outdated cudaMemAdvise/cudaMemPrefetchAsync calls to the new cudaMemLocation API
1 parent 5ba9023 commit 2b9ac28
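
For readers migrating their own code, the pattern is the same everywhere in this commit: build a cudaMemLocation and pass it where a raw int device ID used to go. A minimal before/after sketch, assuming the CUDA 13 runtime signature cudaMemPrefetchAsync(ptr, count, location, flags, stream) with flags required to be 0; the helper name is invented for this note:

```cuda
#include <cuda_runtime.h>

// Hypothetical helper, not part of this commit: prefetch a float buffer to a
// GPU with the location-based API.
cudaError_t prefetchToDevice(float *data, size_t n, int device,
                             cudaStream_t stream) {
    // Pre-CUDA-13 form (removed in CUDA 13):
    //   cudaMemPrefetchAsync(data, n * sizeof(float), device, stream);
    cudaMemLocation loc{};                 // zero-initialize the struct
    loc.type = cudaMemLocationTypeDevice;  // destination is a GPU...
    loc.id   = device;                     // ...with this device ordinal
    return cudaMemPrefetchAsync(data, n * sizeof(float), loc,
                                /*flags=*/0, stream);
}
```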

5 files changed: 66 additions & 36 deletions

modules/module2/content.md

Lines changed: 10 additions & 4 deletions
@@ -470,19 +470,25 @@ __global__ void processData(float *data, size_t n) {
 ```cuda
 void optimizedUnifiedMemory(float *data, size_t n, int device) {
     // Prefetch data to GPU before kernel launch
-    cudaMemPrefetchAsync(data, n * sizeof(float), device);
+    cudaMemLocation loc{};
+    loc.type = cudaMemLocationTypeDevice;
+    loc.id = device;
+    cudaMemPrefetchAsync(data, n * sizeof(float), loc, /*flags=*/0);
 
     // Set memory usage hints
-    cudaMemAdvise(data, n * sizeof(float), cudaMemAdviseSetReadMostly, device);
-    cudaMemAdvise(data, n * sizeof(float), cudaMemAdviseSetPreferredLocation, device);
+    cudaMemAdvise(data, n * sizeof(float), cudaMemAdviseSetReadMostly, loc);
+    cudaMemAdvise(data, n * sizeof(float), cudaMemAdviseSetPreferredLocation, loc);
 
     // Launch kernel
     int blockSize = 256;
     int gridSize = (n + blockSize - 1) / blockSize;
     processData<<<gridSize, blockSize>>>(data, n);
 
     // Prefetch back to CPU if needed
-    cudaMemPrefetchAsync(data, n * sizeof(float), cudaCpuDeviceId);
+    cudaMemLocation hostLoc{};
+    hostLoc.type = cudaMemLocationTypeHost;
+    hostLoc.id = 0;
+    cudaMemPrefetchAsync(data, n * sizeof(float), hostLoc, /*flags=*/0);
 }
 ```
 
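
A note on the host side of the hunk above: for cudaMemLocationTypeHost the id field is ignored, which is why the new code sets it to 0. Since CUDA 12.2 the location enum also carries NUMA-aware host types; a small sketch (pickHostLocation is an invented name, and NUMA node 1 is purely illustrative):

```cuda
#include <cuda_runtime.h>

// Sketch of the host-side location variants, under the assumptions above.
cudaMemLocation pickHostLocation(bool pinToNumaNode) {
    cudaMemLocation loc{};
    if (pinToNumaNode) {
        loc.type = cudaMemLocationTypeHostNuma;  // one specific NUMA node
        loc.id   = 1;                            // OS NUMA node id (illustrative)
    } else {
        loc.type = cudaMemLocationTypeHost;      // "the host"; id is ignored
        loc.id   = 0;
    }
    return loc;
}
```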

modules/module2/examples/04_unified_memory_cuda.cu

Lines changed: 17 additions & 10 deletions
@@ -222,16 +222,23 @@ void demonstrateMemoryMigration() {
     }
 
     int device = 0;
+    // CUDA 13 updated UM APIs use cudaMemLocation instead of raw int device IDs
+    cudaMemLocation locDevice{};
+    locDevice.type = cudaMemLocationTypeDevice;
+    locDevice.id = device;
+    cudaMemLocation locHost{};
+    locHost.type = cudaMemLocationTypeHost;
+    locHost.id = 0;  // host id is unused
 
     printf("Testing memory migration with prefetching and hints...\n");
 
-    // Set memory advice
-    CUDA_CHECK(cudaMemAdvise(data, bytes, cudaMemAdviseSetReadMostly, device));
-    CUDA_CHECK(cudaMemAdvise(data, bytes, cudaMemAdviseSetPreferredLocation, device));
+    // Set memory advice (location-aware in CUDA 13)
+    CUDA_CHECK(cudaMemAdvise(data, bytes, cudaMemAdviseSetReadMostly, locDevice));
+    CUDA_CHECK(cudaMemAdvise(data, bytes, cudaMemAdviseSetPreferredLocation, locDevice));
 
-    // Prefetch to GPU
+    // Prefetch to GPU (location-aware; trailing 0 is the flags argument)
     printf("Prefetching to GPU...\n");
-    CUDA_CHECK(cudaMemPrefetchAsync(data, bytes, device));
+    CUDA_CHECK(cudaMemPrefetchAsync(data, bytes, locDevice, 0));
     CUDA_CHECK(cudaDeviceSynchronize());
 
     int blockSize = 256;
@@ -250,9 +257,9 @@ void demonstrateMemoryMigration() {
     float gpu_time;
     CUDA_CHECK(cudaEventElapsedTime(&gpu_time, start, stop));
 
-    // Prefetch to CPU
+    // Prefetch to CPU (location-aware; trailing 0 is the flags argument)
     printf("Prefetching to CPU...\n");
-    CUDA_CHECK(cudaMemPrefetchAsync(data, bytes, cudaCpuDeviceId));
+    CUDA_CHECK(cudaMemPrefetchAsync(data, bytes, locHost, 0));
     CUDA_CHECK(cudaDeviceSynchronize());
 
     // CPU computation (data already on CPU)
@@ -274,9 +281,9 @@ void demonstrateMemoryMigration() {
     // Test without prefetching for comparison
     printf("\nTesting without prefetching...\n");
 
-    // Reset memory advice
-    CUDA_CHECK(cudaMemAdvise(data, bytes, cudaMemAdviseUnsetReadMostly, device));
-    CUDA_CHECK(cudaMemAdvise(data, bytes, cudaMemAdviseUnsetPreferredLocation, device));
+    // Reset memory advice (location-aware in CUDA 13)
+    CUDA_CHECK(cudaMemAdvise(data, bytes, cudaMemAdviseUnsetReadMostly, locDevice));
+    CUDA_CHECK(cudaMemAdvise(data, bytes, cudaMemAdviseUnsetPreferredLocation, locDevice));
 
     CUDA_CHECK(cudaEventRecord(start));
     computeIntensive<<<gridSize, blockSize>>>(data, n);
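
One check worth pairing with this example (not part of the commit): prefetch and advise only pay off when the device supports concurrent managed access, so a guard like the sketch below, built on cudaDevAttrConcurrentManagedAccess, avoids misleading timings on unsupported systems.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Sketch: skip the migration demo when the device cannot migrate managed
// memory on demand (e.g. pre-Pascal GPUs or some Windows configurations).
bool supportsManagedMigration(int device) {
    int concurrent = 0;
    cudaDeviceGetAttribute(&concurrent,
                           cudaDevAttrConcurrentManagedAccess, device);
    if (!concurrent) {
        printf("Device %d: no concurrent managed access; "
               "prefetch/advise timings will not be representative.\n",
               device);
    }
    return concurrent != 0;
}
```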

modules/module4/README.md

Lines changed: 9 additions & 6 deletions
@@ -262,12 +262,15 @@ for (int chunk = 0; chunk < numChunks; chunk++) {
 
 **Memory Hints:**
 ```cuda
-// Guide data placement
-cudaMemAdvise(data, size, cudaMemAdviseSetReadMostly, deviceId);
-cudaMemAdvise(data, size, cudaMemAdviseSetPreferredLocation, deviceId);
-
-// Prefetch data proactively
-cudaMemPrefetchAsync(data, size, deviceId);
+// Guide data placement (CUDA 13+)
+cudaMemLocation loc{};
+loc.type = cudaMemLocationTypeDevice;
+loc.id = deviceId;
+cudaMemAdvise(data, size, cudaMemAdviseSetReadMostly, loc);
+cudaMemAdvise(data, size, cudaMemAdviseSetPreferredLocation, loc);
+
+// Prefetch data proactively (CUDA 13+)
+cudaMemPrefetchAsync(data, size, loc, /*flags=*/0);
 ```
 
 ### 4. P2P Communication Patterns
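
The section this hunk sits in is about chunked processing, and the explicit stream parameter of the new prefetch call is what makes prefetch/compute overlap natural. A sketch of that pattern under the same assumed CUDA 13 signature (the process kernel and chunk layout are invented for this note):

```cuda
#include <cuda_runtime.h>

__global__ void process(float *p, size_t n) {  // placeholder kernel
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) p[i] *= 2.0f;
}

// Sketch: queue each chunk's prefetch and kernel on one stream so pages
// migrate ahead of the kernel that reads them.
void processInChunks(float *data, size_t n, size_t chunk, int deviceId) {
    cudaMemLocation loc{};
    loc.type = cudaMemLocationTypeDevice;
    loc.id   = deviceId;

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    for (size_t off = 0; off < n; off += chunk) {
        size_t cur = (off + chunk <= n) ? chunk : n - off;
        cudaMemPrefetchAsync(data + off, cur * sizeof(float), loc,
                             /*flags=*/0, stream);
        unsigned blocks = (unsigned)((cur + 255) / 256);
        process<<<blocks, 256, 0, stream>>>(data + off, cur);
    }
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}
```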

modules/module4/content.md

Lines changed: 8 additions & 5 deletions
@@ -135,11 +135,14 @@ kernel<<<grid, block>>>(data, n);
 #### Memory Access Patterns
 
 ```cuda
-// Prefetch data to GPU
-cudaMemPrefetchAsync(data, size, deviceId);
-
-// Provide memory access hints
-cudaMemAdvise(data, size, cudaMemAdviseSetReadMostly, deviceId);
+// Prefetch data to GPU (CUDA 13+)
+cudaMemLocation loc{};
+loc.type = cudaMemLocationTypeDevice;
+loc.id = deviceId;
+cudaMemPrefetchAsync(data, size, loc, /*flags=*/0);
+
+// Provide memory access hints (CUDA 13+)
+cudaMemAdvise(data, size, cudaMemAdviseSetReadMostly, loc);
 ```
 
 #### Unified Memory Best Practices
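
Since the same three struct-building lines recur in every snippet of this commit, a pair of tiny helpers can tidy call sites. These names are this note's invention, not something the repo defines:

```cuda
#include <cuda_runtime.h>

// Invented convenience wrappers around cudaMemLocation construction.
static inline cudaMemLocation deviceLocation(int deviceId) {
    cudaMemLocation loc{};
    loc.type = cudaMemLocationTypeDevice;
    loc.id   = deviceId;
    return loc;
}

static inline cudaMemLocation hostLocation() {
    cudaMemLocation loc{};
    loc.type = cudaMemLocationTypeHost;
    loc.id   = 0;  // ignored for host locations
    return loc;
}

// Usage sketch:
//   cudaMemPrefetchAsync(data, size, deviceLocation(deviceId), 0, stream);
//   cudaMemPrefetchAsync(data, size, hostLocation(), 0, stream);
```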

modules/module4/examples/03_unified_memory.cu

Lines changed: 22 additions & 11 deletions
@@ -204,23 +204,30 @@ double optimizedUnifiedMemory(int n) {
 
     auto start = std::chrono::high_resolution_clock::now();
 
-    // Provide memory hints
+    // Provide memory hints (CUDA 13: use cudaMemLocation)
     int deviceId = 0;
-    CUDA_CHECK(cudaMemAdvise(a, bytes, cudaMemAdviseSetReadMostly, deviceId));
-    CUDA_CHECK(cudaMemAdvise(b, bytes, cudaMemAdviseSetReadMostly, deviceId));
+    cudaMemLocation locDevice{};
+    locDevice.type = cudaMemLocationTypeDevice;
+    locDevice.id = deviceId;
+    cudaMemLocation locHost{};
+    locHost.type = cudaMemLocationTypeHost;
+    locHost.id = 0;
+
+    CUDA_CHECK(cudaMemAdvise(a, bytes, cudaMemAdviseSetReadMostly, locDevice));
+    CUDA_CHECK(cudaMemAdvise(b, bytes, cudaMemAdviseSetReadMostly, locDevice));
 
-    // Prefetch data to GPU
-    CUDA_CHECK(cudaMemPrefetchAsync(a, bytes, deviceId));
-    CUDA_CHECK(cudaMemPrefetchAsync(b, bytes, deviceId));
+    // Prefetch data to GPU (location-aware; trailing 0 is the flags argument)
+    CUDA_CHECK(cudaMemPrefetchAsync(a, bytes, locDevice, 0));
+    CUDA_CHECK(cudaMemPrefetchAsync(b, bytes, locDevice, 0));
 
     // Launch kernel
     dim3 block(BLOCK_SIZE);
     dim3 grid((n + block.x - 1) / block.x);
     vectorAdd<<<grid, block>>>(a, b, c, n);
     CUDA_CHECK(cudaGetLastError());
 
-    // Prefetch result back to CPU
-    CUDA_CHECK(cudaMemPrefetchAsync(c, bytes, cudaCpuDeviceId));
+    // Prefetch result back to CPU (location-aware; trailing 0 is the flags argument)
+    CUDA_CHECK(cudaMemPrefetchAsync(c, bytes, locHost, 0));
     CUDA_CHECK(cudaDeviceSynchronize());
 
     // Access result on CPU
@@ -376,9 +383,13 @@ void multiGPUUnifiedMemory(int n) {
         int offset = gpu * chunkSize;
         int currentChunkSize = (gpu == deviceCount - 1) ? n - offset : chunkSize;
 
-        // Prefetch chunk to current GPU
-        CUDA_CHECK(cudaMemPrefetchAsync(data + offset,
-                                        currentChunkSize * sizeof(float), gpu));
+        // Prefetch chunk to current GPU (location-aware; trailing 0 is the flags argument)
+        cudaMemLocation locGpu{};
+        locGpu.type = cudaMemLocationTypeDevice;
+        locGpu.id = gpu;
+        CUDA_CHECK(cudaMemPrefetchAsync(data + offset,
+                                        currentChunkSize * sizeof(float),
+                                        locGpu, 0));
 
         // Process on this GPU
         dim3 block(BLOCK_SIZE);
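
One possible follow-up to the multi-GPU hunk (a sketch, not part of the commit): issuing each chunk's prefetch on a per-GPU stream lets the migrations run concurrently instead of queuing behind one another on the default stream. prefetchAcrossGPUs and its loop shape are illustrative, reusing the assumed flags-then-stream signature from above.

```cuda
#include <vector>
#include <cuda_runtime.h>

// Sketch: spread one managed buffer across all visible GPUs, prefetching
// each chunk on that GPU's own stream so the migrations can overlap.
void prefetchAcrossGPUs(float *data, int n) {
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    int chunkSize = (n + deviceCount - 1) / deviceCount;
    std::vector<cudaStream_t> streams(deviceCount);

    for (int gpu = 0; gpu < deviceCount; ++gpu) {
        cudaSetDevice(gpu);
        cudaStreamCreate(&streams[gpu]);
        int offset = gpu * chunkSize;
        int cur = (gpu == deviceCount - 1) ? n - offset : chunkSize;
        if (cur <= 0) continue;
        cudaMemLocation locGpu{};
        locGpu.type = cudaMemLocationTypeDevice;
        locGpu.id   = gpu;
        cudaMemPrefetchAsync(data + offset, cur * sizeof(float), locGpu,
                             /*flags=*/0, streams[gpu]);
    }
    for (int gpu = 0; gpu < deviceCount; ++gpu) {
        cudaSetDevice(gpu);
        cudaStreamSynchronize(streams[gpu]);
        cudaStreamDestroy(streams[gpu]);
    }
}
```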
