diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 893d8ad29..ff17dfadb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -256,6 +256,9 @@ endif ()
 if (ENABLE_CUDA)
     find_package(CUDAToolkit REQUIRED)
     target_compile_definitions(mmseqs-framework PUBLIC -DHAVE_CUDA=1)
+    if (NOT DEFINED USE_GPU_SEM OR USE_GPU_SEM)
+        target_compile_definitions(mmseqs-framework PUBLIC -DUSE_GPU_SEM=1)
+    endif ()
     target_link_libraries(mmseqs-framework marv)
     if (FORCE_STATIC_DEPS)
         # link to rt explicitly so it doesn't get statically compiled and adds GLIBC_PRIVATE symbols
diff --git a/src/commons/GpuUtil.h b/src/commons/GpuUtil.h
index 1fa40e875..5689c32df 100644
--- a/src/commons/GpuUtil.h
+++ b/src/commons/GpuUtil.h
@@ -4,6 +4,14 @@
 #include <atomic>
 #include <string>
 #include <vector>
+#include <sched.h>
+#ifdef USE_GPU_SEM
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <fcntl.h>
+#include <semaphore.h>
+#endif
 #include "marv.h"
 
 struct GPUSharedMemory {
@@ -52,4 +60,44 @@ struct GPUSharedMemory {
 };
 
+struct GPUSharedMemorySem {
+#ifdef USE_GPU_SEM
+    sem_t* sem;
+    std::string shmName;
+
+    GPUSharedMemorySem() : sem(SEM_FAILED) {}
+    void create(const std::string& name) {
+        shmName = name;
+        std::string semName = "/" + name + "_sem";
+        sem_unlink(semName.c_str());
+        sem = sem_open(semName.c_str(), O_CREAT, 0660, 0);
+        if (sem == SEM_FAILED) {
+            perror(("sem_open(create) " + semName).c_str());
+            exit(EXIT_FAILURE);
+        }
+    }
+    void open(const std::string& name) {
+        shmName = name;
+        std::string semName = "/" + name + "_sem";
+        sem = sem_open(semName.c_str(), 0);
+        if (sem == SEM_FAILED) {
+            perror(("sem_open " + semName).c_str());
+            exit(EXIT_FAILURE);
+        }
+    }
+    void wait() { while (sem_wait(sem) == -1 && errno == EINTR) {} }
+    void post() { if (sem != SEM_FAILED) sem_post(sem); }
+    void close() { if (sem != SEM_FAILED) { sem_close(sem); sem = SEM_FAILED; } }
+    void destroy() { close(); sem_unlink(("/" + shmName + "_sem").c_str()); }
+#else
+    GPUSharedMemorySem() {}
+    void create(const std::string&) {}
+    void open(const std::string&) {}
+    void wait() { sched_yield(); }
+    void post() {}
+    void close() {}
+    void destroy() {}
+#endif
+};
+
 #endif
 
diff --git a/src/prefiltering/ungappedprefilter.cpp b/src/prefiltering/ungappedprefilter.cpp
index e497835a3..981fda424 100644
--- a/src/prefiltering/ungappedprefilter.cpp
+++ b/src/prefiltering/ungappedprefilter.cpp
@@ -118,6 +118,7 @@ void runFilterOnGpu(Parameters & par, BaseMatrix * subMat,
     std::vector<size_t> offsets;
     std::vector<int32_t> lengths;
     GPUSharedMemory* layout = NULL;
+    GPUSharedMemorySem gpuSem;
     if (hash.empty()) {
         offsets.reserve(tdbr->getSize() + 1);
         lengths.reserve(tdbr->getSize());
@@ -130,6 +131,7 @@ void runFilterOnGpu(Parameters & par, BaseMatrix * subMat,
         lengthData = lengths.data();
     } else {
         layout = GPUSharedMemory::openSharedMemory(hash);
+        gpuSem.open(hash);
     }
 
     const bool serverMode = par.gpuServer;
@@ -220,6 +222,7 @@ void runFilterOnGpu(Parameters & par, BaseMatrix * subMat,
             std::atomic_thread_fence(std::memory_order_release);
             // Debug(Debug::ERROR) << "switch to ready\n";
             layout->state.store(GPUSharedMemory::READY, std::memory_order_release);
+            gpuSem.post();
 
             while (true) {
                 if (layout->serverExit.load(std::memory_order_acquire) == true) {
@@ -323,6 +326,7 @@ void runFilterOnGpu(Parameters & par, BaseMatrix * subMat,
     if (marv != NULL) {
         delete marv;
     } else {
+        gpuSem.close();
         GPUSharedMemory::unmap(layout);
     }
 
diff --git a/src/util/gpuserver.cpp b/src/util/gpuserver.cpp
index 12b39ee86..a0f12524f 100644
--- a/src/util/gpuserver.cpp
+++ b/src/util/gpuserver.cpp
@@ -16,9 +16,16 @@
 #include <signal.h>
 #include <thread>
 
+#ifdef HAVE_CUDA
+GPUSharedMemorySem gpuSemaphore;
+#endif
+
 volatile sig_atomic_t keepRunning = 1;
 void intHandler(int) {
     keepRunning = 0;
+#ifdef HAVE_CUDA
+    gpuSemaphore.post();
+#endif
 }
 
 int gpuserver(int argc, const char **argv, const Command& command) {
@@ -64,6 +71,12 @@ int gpuserver(int argc, const char **argv, const Command& command) {
     marv.setDb(h1);
     marv.prefetch();
 
+    std::string shmFile = GPUSharedMemory::getShmHash(par.db1);
+    GPUSharedMemory* layout = GPUSharedMemory::alloc(shmFile, par.maxSeqLen, par.maxResListLen);
+    Debug(Debug::WARNING) << shmFile << "\n";
+
+    gpuSemaphore.create(shmFile);
+
     struct sigaction act;
     memset(&act, 0, sizeof(act));
     act.sa_handler = intHandler;
@@ -72,10 +85,11 @@ int gpuserver(int argc, const char **argv, const Command& command) {
     sigaction(SIGINT, &act, NULL);
     sigaction(SIGTERM, &act, NULL);
 
-    std::string shmFile = GPUSharedMemory::getShmHash(par.db1);
-    GPUSharedMemory* layout = GPUSharedMemory::alloc(shmFile, par.maxSeqLen, par.maxResListLen);
-    Debug(Debug::WARNING) << shmFile << "\n";
 
     while (keepRunning) {
+        gpuSemaphore.wait();
+        if (!keepRunning) {
+            break;
+        }
         if (layout->state.load(std::memory_order_acquire) == GPUSharedMemory::READY) {
             std::atomic_thread_fence(std::memory_order_acquire);
@@ -85,8 +99,6 @@ int gpuserver(int argc, const char **argv, const Command& command) {
             std::atomic_thread_fence(std::memory_order_release);
             // Debug(Debug::ERROR) << "switch to done\n";
             layout->state.store(GPUSharedMemory::DONE, std::memory_order_release);
-        } else {
-            std::this_thread::yield();
         }
     }
 
@@ -95,6 +107,7 @@ int gpuserver(int argc, const char **argv, const Command& command) {
     layout->serverExit.store(true, std::memory_order_release);
     std::atomic_thread_fence(std::memory_order_release);
     GPUSharedMemory::dealloc(layout, shmFile);
+    gpuSemaphore.destroy();
 #endif
 
     return EXIT_SUCCESS;