# Add detected GPU architecture to HIP flags (release and debug sets alike)
HIP_FLAGS += --offload-arch=$(GPU_ARCH)
HIP_DEBUG_FLAGS += --offload-arch=$(GPU_ARCH)

# ROCm library linking for advanced examples.
# HIP_LIB_DIR is used both as the link-time search path (-L) and as the
# run-time search path (-rpath) for the rocBLAS library.
HIP_LIB_DIR := $(ROCM_PATH)/lib
HIP_LDFLAGS := -L$(HIP_LIB_DIR) -lrocblas -Wl,-rpath,$(HIP_LIB_DIR)

# Host C++ compiler flags: C++17 at optimisation level 2
CXX_FLAGS = -std=c++17 -O2
5863
5964# Directories
@@ -129,7 +134,7 @@ endif
ifeq ($(BUILD_HIP),1)
# Pattern rule: build one HIP example executable from its matching
# $(EXAMPLES_DIR)/<name>_hip.cpp source. Only defined when BUILD_HIP is 1.
# $(HIP_LDFLAGS) is placed after the source/output so the libraries follow
# the translation unit that references them on the link line.
$(BUILD_DIR)/%_hip: $(EXAMPLES_DIR)/%_hip.cpp
	@echo " Building HIP example: $@ "
	$(HIPCC) $(HIP_FLAGS) $< -o $@ $(HIP_LDFLAGS)
endif
134139
135140# Debug builds
@@ -138,6 +143,32 @@ debug: CUDA_FLAGS = $(CUDA_DEBUG_FLAGS)
# Target-specific override: while building `debug` (and its prerequisites),
# HIP_FLAGS is replaced by the debug flag set, then everything in `all` is built.
debug: HIP_FLAGS = $(HIP_DEBUG_FLAGS)
debug: all
140145
# Profile builds
# `make profile` builds `all` with extra profiler-friendly flags (-lineinfo for
# CUDA, -g for HIP; target-specific variables also apply to prerequisites),
# then runs every built example under the vendor profiler and writes the
# results into $(PROFILE_DIR).
# Profiling is best-effort: a profiler failure is reported and the loop
# continues instead of aborting make.
.PHONY: profile
profile: CUDA_FLAGS += -lineinfo
profile: HIP_FLAGS += -g
profile: all
	@echo " Generating profile data..."
	@mkdir -p $(PROFILE_DIR)
# NOTE: the conditionals below are evaluated at parse time and must stay
# unindented (a leading tab would turn them into recipe text).
ifeq ($(BUILD_HIP),1)
	@echo "Running HIP profiling..."
	@for target in $(HIP_TARGETS); do \
		if [ -f "$$target" ]; then \
			echo "Profiling $$target..."; \
			rocprofv3 --runtime-trace --output-format csv -d $(PROFILE_DIR) -o $$(basename "$$target").csv -- "$$target" 2>/dev/null || echo "rocprofv3 failed for $$target (continuing)"; \
		fi; \
	done
endif
ifeq ($(BUILD_CUDA),1)
	@echo "Running CUDA profiling..."
	@for target in $(CUDA_TARGETS); do \
		if [ -f "$$target" ]; then \
			echo "Profiling $$target..."; \
			nvprof --csv --log-file $(PROFILE_DIR)/$$(basename "$$target").csv "$$target" 2>/dev/null || echo "nvprof failed for $$target (continuing)"; \
		fi; \
	done
endif
171+
# Clean
# Removes the build output directory and the profiling output directory.
# Declared phony so a stray file named `clean` cannot mask the target.
.PHONY: clean
clean:
	@echo "Cleaning build artifacts..."
	rm -rf $(BUILD_DIR) $(PROFILE_DIR)
142173
143174# Help
@@ -172,19 +203,19 @@ test_cuda: cuda
172203 @if command -v nvidia-smi > /dev/null; then \
173204 echo " === Testing Advanced Algorithm Examples ===" ; \
174205 echo " 1. Reduction Algorithms..." ; \
175- . /01_reduction_algorithms_cuda || echo " ✗ Reduction algorithms failed" ; \
206+ $( BUILD_DIR ) /01_reduction_algorithms_cuda || echo " ✗ Reduction algorithms failed" ; \
176207 echo " 2. Scan (Prefix Sum)..." ; \
177- . /02_scan_prefix_sum_cuda || echo " ✗ Scan algorithms failed" ; \
208+ $( BUILD_DIR ) /02_scan_prefix_sum_cuda || echo " ✗ Scan algorithms failed" ; \
178209 echo " 3. Sorting Algorithms..." ; \
179- . /03_sorting_algorithms_cuda || echo " ✗ Sorting algorithms failed" ; \
210+ $( BUILD_DIR ) /03_sorting_algorithms_cuda || echo " ✗ Sorting algorithms failed" ; \
180211 echo " 4. Convolution/Stencil..." ; \
181- . /04_convolution_stencil_cuda || echo " ✗ Convolution failed" ; \
212+ $( BUILD_DIR ) /04_convolution_stencil_cuda || echo " ✗ Convolution failed" ; \
182213 echo " 5. Matrix Operations..." ; \
183- . /05_matrix_operations_cuda || echo " ✗ Matrix operations failed" ; \
214+ $( BUILD_DIR ) /05_matrix_operations_cuda || echo " ✗ Matrix operations failed" ; \
184215 echo " 6. Graph Algorithms..." ; \
185- . /06_graph_algorithms_cuda || echo " ✗ Graph algorithms failed" ; \
216+ $( BUILD_DIR ) /06_graph_algorithms_cuda || echo " ✗ Graph algorithms failed" ; \
186217 echo " 7. Cooperative Groups..." ; \
187- . /07_cooperative_groups_cuda || echo " ✗ Cooperative groups failed" ; \
218+ $( BUILD_DIR ) /07_cooperative_groups_cuda || echo " ✗ Cooperative groups failed" ; \
188219 echo " ✓ Module 3 CUDA tests completed" ; \
189220 else \
190221 echo " No NVIDIA GPU detected, skipping CUDA tests" ; \
@@ -195,13 +226,13 @@ test_hip: hip
195226 @if command -v rocm-smi > /dev/null || command -v nvidia-smi > /dev/null; then \
196227 echo " === Testing HIP Algorithm Examples ===" ; \
197228 echo " 1. Reduction Algorithms..." ; \
198- . /01_reduction_algorithms_hip || echo " ✗ HIP reduction algorithms failed" ; \
229+ $( BUILD_DIR ) /01_reduction_algorithms_hip || echo " ✗ HIP reduction algorithms failed" ; \
199230 echo " 2. Scan (Prefix Sum)..." ; \
200- . /02_scan_prefix_sum_hip || echo " ✗ HIP scan algorithms failed" ; \
231+ $( BUILD_DIR ) /02_scan_prefix_sum_hip || echo " ✗ HIP scan algorithms failed" ; \
201232 echo " 3. Sorting Algorithms..." ; \
202- . /03_sorting_algorithms_hip || echo " ✗ HIP sorting algorithms failed" ; \
233+ $( BUILD_DIR ) /03_sorting_algorithms_hip || echo " ✗ HIP sorting algorithms failed" ; \
203234 echo " 4. Convolution/Stencil..." ; \
204- . /04_convolution_stencil_hip || echo " ✗ HIP convolution failed" ; \
235+ $( BUILD_DIR ) /04_convolution_stencil_hip || echo " ✗ HIP convolution failed" ; \
205236 echo " ✓ Module 3 HIP tests completed" ; \
206237 else \
207238 echo " No compatible GPU detected, skipping HIP tests" ; \
0 commit comments