diff --git a/bin/cuda/gpu_benchmark.cu b/bin/cuda/gpu_benchmark.cu
index fe459e0c..740cdbc9 100644
--- a/bin/cuda/gpu_benchmark.cu
+++ b/bin/cuda/gpu_benchmark.cu
@@ -29,7 +29,7 @@ float getElapsedTime(const cudaEvent_t &gpu_start, cudaEvent_t &gpu_stop) {
 // Function to run the GPU benchmark with no time limit
 void runBenchmark(long max_work) {
     uint32_t n = 256 * 256;
-    uint64_t m = max_work * 16384 / n;
+    uint64_t m = (max_work + n - 1) / n;
 
     unsigned long long int *d_count;
     curandState *d_state;
@@ -88,7 +88,7 @@ void runBenchmark(long max_work) {
 
 void runBenchmarkTime(long max_work, int runtime_in_seconds) {
     uint32_t n = 256 * 256;
-    uint64_t m = max_work * 16384 / n;
+    uint64_t m = (max_work + n - 1) / n;
 
     // allocate memory
     unsigned long long int *d_count;
diff --git a/bin/hip/gpu_benchmark.hip b/bin/hip/gpu_benchmark.hip
index 93c15ff8..d3612504 100644
--- a/bin/hip/gpu_benchmark.hip
+++ b/bin/hip/gpu_benchmark.hip
@@ -27,7 +27,7 @@ float getElapsedTime(const hipEvent_t &gpu_start, hipEvent_t &gpu_stop) {
 // Function to run the GPU benchmark with no time limit
 void runBenchmark(long max_work) {
     uint32_t n = 256 * 256;
-    uint64_t m = max_work * 16384 / n;
+    uint64_t m = (max_work + n - 1) / n;
 
     unsigned long long int *d_count;
     hiprandState *d_state;
@@ -86,7 +86,7 @@ void runBenchmark(long max_work) {
 
 void runBenchmarkTime(long max_work, int runtime_in_seconds) {
     uint32_t n = 256 * 256;
-    uint64_t m = max_work * 16384 / n;
+    uint64_t m = (max_work + n - 1) / n;
 
     // allocate memory
     unsigned long long int *d_count;
diff --git a/bin/wfbench b/bin/wfbench
index 90980e46..01e2865d 100755
--- a/bin/wfbench
+++ b/bin/wfbench
@@ -20,6 +20,7 @@
 import json
 import logging
 import pandas as pd
 import psutil
+import shutil
 from io import StringIO
 from filelock import FileLock
@@ -230,11 +231,21 @@
 
     @staticmethod
     def get_available_gpus():
-        proc = subprocess.Popen(["nvidia-smi", "--query-gpu=utilization.gpu", "--format=csv"], stdout=subprocess.PIPE,
-                                stderr=subprocess.PIPE)
-        stdout, _ = proc.communicate()
-        df = pd.read_csv(StringIO(stdout.decode("utf-8")), sep=" ")
-        return df[df["utilization.gpu"] <= 5].index.to_list()
+        if shutil.which("nvidia-smi") is not None:
+            proc = subprocess.Popen(["nvidia-smi", "--query-gpu=utilization.gpu", "--format=csv"], stdout=subprocess.PIPE,
+                                    stderr=subprocess.PIPE)
+            stdout, _ = proc.communicate()
+            df = pd.read_csv(StringIO(stdout.decode("utf-8")), sep=" ")
+            return df[df["utilization.gpu"] <= 5].index.to_list()
+        elif shutil.which("amd-smi") is not None:
+            proc = subprocess.Popen(["amd-smi", "monitor", "-u", "--csv"], stdout=subprocess.PIPE,
+                                    stderr=subprocess.PIPE)
+            stdout, _ = proc.communicate()
+            df = pd.read_csv(StringIO(stdout.decode("utf-8")), sep=",")
+            return df[df["gfx"] <= 5].index.to_list()
+        else:
+            log_error("No supported GPU monitoring tool found.")
+            return []
 
     def __init__(self):
         self.work = None
@@ -261,13 +272,20 @@
 
         if self.duration is not None:
             log_debug(f"Running GPU benchmark for {self.duration} seconds")
-            gpu_prog = [
-                f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work} {self.duration}"]
+            if shutil.which("nvidia-smi") is not None:
+                gpu_prog = [
+                    f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work} {self.duration}"]
+            else:
+                gpu_prog = [
+                    f"HIP_DEVICE_ORDER=PCI_BUS_ID HIP_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work} {self.duration}"]
         else:
             log_debug(f"Running GPU benchmark for {self.work} units of work")
-            gpu_prog = [
-                f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work}"]
-
+            if shutil.which("nvidia-smi") is not None:
+                gpu_prog = [
+                    f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work}"]
+            else:
+                gpu_prog = [
+                    f"HIP_DEVICE_ORDER=PCI_BUS_ID HIP_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work}"]
         p = subprocess.Popen(gpu_prog, shell=True)
         return ProcessHandle(p)
 