Skip to content

Commit 6c7d856

Browse files
committed
Fixed the Dockerfile for AMD ROCm
1 parent 9fd0cf8 commit 6c7d856

File tree

2 files changed

+98
-184
lines changed

2 files changed

+98
-184
lines changed

docker/rocm/Dockerfile

Lines changed: 83 additions & 178 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ LABEL ubuntu.version="22.04"
1313
# Avoid interactive prompts during package installation
1414
ARG DEBIAN_FRONTEND=noninteractive
1515

16-
# Update and install essential development tools
16+
# Install essential development tools for GPU programming
1717
RUN apt-get update && apt-get install -y \
18-
# Basic development tools
18+
# Core development tools
1919
build-essential \
2020
cmake \
2121
git \
@@ -25,17 +25,13 @@ RUN apt-get update && apt-get install -y \
2525
nano \
2626
htop \
2727
tree \
28-
# Python development
28+
# Minimal Python for basic scripting (not data science)
2929
python3 \
3030
python3-pip \
3131
python3-dev \
3232
# Additional utilities
3333
pkg-config \
3434
software-properties-common \
35-
apt-transport-https \
36-
ca-certificates \
37-
gnupg \
38-
lsb-release \
3935
# Debugging and profiling tools
4036
gdb \
4137
valgrind \
@@ -45,35 +41,21 @@ RUN apt-get update && apt-get install -y \
4541
iputils-ping \
4642
&& rm -rf /var/lib/apt/lists/*
4743

48-
# Install ROCm development packages
44+
# Install core ROCm development packages (keep minimal)
4945
RUN apt-get update && apt-get install -y \
50-
# Core ROCm packages
51-
rocm-dev \
52-
rocm-libs \
46+
# Core ROCm packages for GPU programming
5347
hip-dev \
5448
hip-samples \
5549
hipblas-dev \
56-
hipfft-dev \
57-
hipsparse-dev \
58-
# ROCm profiling and debugging tools
50+
# ROCm profiling tools (essential for performance work)
5951
rocprofiler-dev \
6052
roctracer-dev \
61-
roctx \
62-
# Additional ROCm libraries
63-
rocrand-dev \
64-
rocthrust-dev \
6553
&& rm -rf /var/lib/apt/lists/*
6654

67-
# Install Python packages for data analysis and visualization
55+
# Install minimal Python packages for basic development (no heavy data science libs)
6856
RUN pip3 install --no-cache-dir \
6957
numpy \
70-
matplotlib \
71-
seaborn \
72-
pandas \
73-
jupyter \
74-
jupyterlab \
75-
plotly \
76-
scipy
58+
matplotlib
7759

7860
# Set up ROCm environment variables
7961
ENV ROCM_PATH=/opt/rocm
@@ -84,11 +66,8 @@ ENV HIP_PLATFORM=amd
8466
ENV HSA_OVERRIDE_GFX_VERSION=11.0.0
8567
ENV ROCM_VERSION=6.4.3
8668

87-
# Add ROCm binaries to PATH
88-
ENV PATH=/opt/rocm/bin:/opt/rocm/hip/bin:${PATH}
89-
90-
# Verify ROCm installation
91-
RUN hipcc --version && rocminfo > /dev/null 2>&1 || echo "ROCm info check completed (may fail without GPU)"
69+
# Verify HIP compiler installation (skip rocminfo as no GPU during build)
70+
RUN hipcc --version
9271

9372
# Create development workspace
9473
WORKDIR /workspace
@@ -107,164 +86,90 @@ RUN echo 'alias ll="ls -alF"' >> /root/.bashrc && \
10786
echo 'alias rocm-info="rocminfo"' >> /root/.bashrc && \
10887
echo 'export PS1="\[\e[1;34m\][ROCm-DEV]\[\e[0m\] \w $ "' >> /root/.bashrc
10988

110-
# Create a comprehensive GPU test script
111-
RUN cat > /workspace/test-gpu.sh << 'EOF'
112-
#!/bin/bash
113-
echo "=== GPU Programming 101 - ROCm Environment Test ==="
114-
echo "Date: $(date)"
115-
echo ""
116-
117-
echo "=== HIP Compiler ==="
118-
hipcc --version
119-
echo ""
120-
121-
echo "=== ROCm Version ==="
122-
if command -v rocminfo > /dev/null 2>&1; then
123-
rocminfo | head -20
124-
else
125-
echo "rocminfo command not available"
126-
fi
127-
echo ""
128-
129-
echo "=== GPU Information ==="
130-
if command -v rocm-smi > /dev/null 2>&1; then
131-
rocm-smi --showproductname --showmeminfo vram || echo "No AMD GPU detected or accessible"
132-
else
133-
echo "rocm-smi not available"
134-
fi
135-
echo ""
136-
137-
echo "=== Environment Variables ==="
138-
echo "ROCM_PATH: $ROCM_PATH"
139-
echo "HIP_PATH: $HIP_PATH"
140-
echo "HIP_PLATFORM: $HIP_PLATFORM"
141-
echo "PATH: $PATH"
142-
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
143-
echo ""
144-
145-
echo "=== HIP Platform Detection ==="
146-
cat > /tmp/platform_test.cpp << 'HIP_EOF'
147-
#include <hip/hip_runtime.h>
148-
#include <iostream>
149-
150-
int main() {
151-
int deviceCount;
152-
hipError_t error = hipGetDeviceCount(&deviceCount);
153-
154-
if (error != hipSuccess) {
155-
std::cout << "HIP Error: " << hipGetErrorString(error) << std::endl;
156-
std::cout << "This may be normal if no GPU is available" << std::endl;
157-
} else {
158-
std::cout << "Number of HIP devices: " << deviceCount << std::endl;
159-
160-
for (int i = 0; i < deviceCount; i++) {
161-
hipDeviceProp_t props;
162-
hipGetDeviceProperties(&props, i);
163-
std::cout << "Device " << i << ": " << props.name << std::endl;
164-
}
165-
}
166-
167-
return 0;
168-
}
169-
HIP_EOF
170-
171-
echo "Compiling platform detection test..."
172-
if hipcc -o /tmp/platform_test /tmp/platform_test.cpp; then
173-
echo "✓ Compilation successful"
174-
echo "Running platform test:"
175-
/tmp/platform_test
176-
else
177-
echo "✗ Platform test compilation failed"
178-
fi
179-
180-
echo ""
181-
echo "=== Build Test ==="
182-
cd /tmp
183-
cat > test.hip.cpp << 'HIP_EOF'
184-
#include <hip/hip_runtime.h>
185-
#include <stdio.h>
186-
187-
__global__ void hello() {
188-
printf("Hello from HIP thread %d!\n", hipThreadIdx_x);
189-
}
190-
191-
int main() {
192-
printf("HIP Test Program\n");
193-
194-
// Check for HIP devices
195-
int deviceCount;
196-
hipError_t error = hipGetDeviceCount(&deviceCount);
197-
198-
if (error == hipSuccess && deviceCount > 0) {
199-
printf("Found %d HIP device(s)\n", deviceCount);
200-
hello<<<1, 5>>>();
201-
hipDeviceSynchronize();
202-
printf("GPU kernel completed!\n");
203-
} else {
204-
printf("No HIP devices found or error: %s\n", hipGetErrorString(error));
205-
printf("This is normal when running without GPU access\n");
206-
}
207-
208-
return 0;
209-
}
210-
HIP_EOF
211-
212-
echo "Compiling test HIP program..."
213-
if hipcc -o test test.hip.cpp; then
214-
echo "✓ Compilation successful"
215-
echo "Running test program:"
216-
./test
217-
echo "✓ HIP environment is working correctly!"
218-
else
219-
echo "✗ Compilation failed"
220-
exit 1
221-
fi
222-
223-
rm -f test test.hip.cpp platform_test platform_test.cpp
224-
echo ""
225-
echo "=== All tests completed ==="
226-
EOF
89+
# Create a simple GPU test script
90+
RUN printf '#!/bin/bash\n\
91+
echo "=== GPU Programming 101 - ROCm Environment Test ==="\n\
92+
echo "Date: $(date)"\n\
93+
echo ""\n\
94+
\n\
95+
echo "=== HIP Compiler ==="\n\
96+
hipcc --version\n\
97+
echo ""\n\
98+
\n\
99+
echo "=== GPU Information ==="\n\
100+
if rocm-smi --showproductname --showmeminfo vram 2>/dev/null; then\n\
101+
echo "AMD GPU detected successfully"\n\
102+
else\n\
103+
echo "No AMD GPU detected or rocm-smi not available"\n\
104+
fi\n\
105+
echo ""\n\
106+
\n\
107+
echo "=== Environment Variables ==="\n\
108+
echo "ROCM_PATH: $ROCM_PATH"\n\
109+
echo "HIP_PATH: $HIP_PATH"\n\
110+
echo "HIP_PLATFORM: $HIP_PLATFORM"\n\
111+
echo "PATH: $PATH"\n\
112+
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"\n\
113+
echo ""\n\
114+
\n\
115+
echo "=== Build Test ==="\n\
116+
cd /tmp\n\
117+
cat > test.hip.cpp << '"'"'HIP_EOF'"'"'\n\
118+
#include <hip/hip_runtime.h>\n\
119+
#include <stdio.h>\n\
120+
\n\
121+
__global__ void hello() {\n\
122+
printf("Hello from HIP thread %%d!\\n", hipThreadIdx_x);\n\
123+
}\n\
124+
\n\
125+
int main() {\n\
126+
printf("HIP Test Program\\n");\n\
127+
\n\
128+
int deviceCount;\n\
129+
hipError_t error = hipGetDeviceCount(&deviceCount);\n\
130+
\n\
131+
if (error != hipSuccess) {\n\
132+
printf("HIP Error: %%s\\n", hipGetErrorString(error));\n\
133+
printf("No HIP-capable devices found\\n");\n\
134+
return 0;\n\
135+
}\n\
136+
\n\
137+
printf("Found %%d HIP device(s)\\n", deviceCount);\n\
138+
hello<<<1, 5>>>();\n\
139+
hipDeviceSynchronize();\n\
140+
printf("GPU kernel completed!\\n");\n\
141+
return 0;\n\
142+
}\n\
143+
HIP_EOF\n\
144+
\n\
145+
echo "Compiling test HIP program..."\n\
146+
if hipcc -o test test.hip.cpp; then\n\
147+
echo "✓ Compilation successful"\n\
148+
echo "Running test program:"\n\
149+
./test\n\
150+
echo "✓ HIP environment is working correctly!"\n\
151+
else\n\
152+
echo "✗ Compilation failed"\n\
153+
exit 1\n\
154+
fi\n\
155+
\n\
156+
rm -f test test.hip.cpp\n\
157+
echo ""\n\
158+
echo "=== All tests completed ==="\n' > /workspace/test-gpu.sh
227159

228160
RUN chmod +x /workspace/test-gpu.sh
229161

230-
# Install HIP samples
162+
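# Provide HIP samples for learning and reference: copy the bundled /opt/rocm/hip/samples to ./hip-samples if present, otherwise clone HIP-Examples into ./hip-examples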
231163
RUN cd /workspace && \
232164
if [ -d "/opt/rocm/hip/samples" ]; then \
233165
cp -r /opt/rocm/hip/samples ./hip-samples; \
234166
else \
235167
git clone https://github.com/ROCm-Developer-Tools/HIP-Examples.git hip-examples; \
236168
fi
237169

238-
# Create jupyter kernel for HIP (for notebooks)
239-
RUN python3 -m ipykernel install --name hip-kernel --display-name "HIP Python"
240-
241-
# Set up HIP for both AMD and NVIDIA compatibility
242-
RUN cat > /workspace/setup-hip-nvidia.sh << 'EOF'
243-
#!/bin/bash
244-
# Switch HIP to NVIDIA backend (for systems with NVIDIA GPUs)
245-
export HIP_PLATFORM=nvidia
246-
export HIP_COMPILER=nvcc
247-
echo "HIP configured for NVIDIA backend"
248-
echo "HIP_PLATFORM=$HIP_PLATFORM"
249-
EOF
250-
251-
RUN cat > /workspace/setup-hip-amd.sh << 'EOF'
252-
#!/bin/bash
253-
# Switch HIP to AMD backend (default)
254-
export HIP_PLATFORM=amd
255-
unset HIP_COMPILER
256-
echo "HIP configured for AMD backend"
257-
echo "HIP_PLATFORM=$HIP_PLATFORM"
258-
EOF
259-
260-
RUN chmod +x /workspace/setup-hip-*.sh
261-
262-
# Expose Jupyter port
263-
EXPOSE 8888
264-
265170
# Default command
266171
CMD ["/bin/bash"]
267172

268-
# Health check to verify ROCm access
173+
# Health check to verify HIP compiler access (hipcc --version runs without a GPU, so this does not confirm GPU availability)
269174
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
270-
CMD rocminfo > /dev/null 2>&1 || hipcc --version > /dev/null 2>&1 || exit 1
175+
CMD hipcc --version > /dev/null 2>&1 || exit 1

docker/scripts/run.sh

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -274,15 +274,15 @@ run_rocm() {
274274
"${cmd[@]}"
275275
}
276276

277-
# Run with docker-compose
277+
# Run with docker compose (v2) or docker-compose (v1)
278278
run_compose() {
279279
local service=$1
280280
shift
281281

282-
log "Starting $service using docker-compose..."
282+
log "Starting $service using docker compose..."
283283
cd "$DOCKER_DIR"
284284

285-
# Parse arguments for docker-compose
285+
# Parse arguments for docker compose
286286
local compose_args=()
287287
while [[ $# -gt 0 ]]; do
288288
case $1 in
@@ -297,7 +297,15 @@ run_compose() {
297297
esac
298298
done
299299

300-
docker-compose up "${compose_args[@]}" "$service"
300+
# Try docker compose (v2) first, then fall back to docker-compose (v1)
301+
if docker compose up "${compose_args[@]}" "$service" 2>/dev/null; then
302+
log "Started $service using docker compose (v2)"
303+
elif docker-compose up "${compose_args[@]}" "$service" 2>/dev/null; then
304+
log "Started $service using docker-compose (v1)"
305+
else
306+
error "Failed to start $service using docker compose"
307+
return 1
308+
fi
301309
}
302310

303311
# Show usage
@@ -310,7 +318,7 @@ Usage: $0 [PLATFORM] [OPTIONS]
310318
Platforms:
311319
cuda Run NVIDIA CUDA container
312320
rocm Run AMD ROCm container
313-
compose SERVICE Run using docker-compose
321+
compose SERVICE Run using docker compose
314322
315323
Options:
316324
-h, --help Show this help message
@@ -326,7 +334,7 @@ Examples:
326334
$0 cuda --detach Run CUDA container in background
327335
$0 rocm --no-gpu Run ROCm container in CPU-only mode
328336
$0 --auto Auto-detect GPU type and run appropriate container
329-
$0 compose cuda-dev Run using docker-compose
337+
$0 compose cuda-dev Run using docker compose
330338
331339
Container Management:
332340
List containers: docker ps -a
@@ -339,6 +347,7 @@ GPU Programming Setup:
339347
Inside container: /workspace/test-gpu.sh # Test GPU environment
340348
Build examples: cd modules/module1/examples && make
341349
CUDA samples: cd /workspace/cuda-samples
350+
HIP samples: cd /workspace/hip-examples
342351
343352
EOF
344353
}

0 commit comments

Comments
 (0)