diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml index deb271cb6..c6704a8a1 100644 --- a/.github/workflows/workflow.yml +++ b/.github/workflows/workflow.yml @@ -32,7 +32,9 @@ jobs: fetch-depth: 1 - name: Shallow submodule init - run: git submodule update --init --recursive --depth 1 + run: | + git config --global --add safe.directory '*' + git submodule update --init --recursive --depth 1 - name: Cache CMake build (riscv64) uses: actions/cache@v4 @@ -51,7 +53,9 @@ jobs: run: | for i in $(seq 1 $SYSTEM_TEST_RUNS); do echo "=== riscv64 System Test Run $i/$SYSTEM_TEST_RUNS ===" - if ! timeout 300 cmake --build build_riscv64 --target system_test_run; then + timeout 300 cmake --build build_riscv64 --target system_test_run > /tmp/st_out_$i.txt 2>&1 || true + cat /tmp/st_out_$i.txt + if ! grep -q "Failed: 0" /tmp/st_out_$i.txt; then echo "riscv64 system test run $i/$SYSTEM_TEST_RUNS FAILED" exit 1 fi @@ -80,7 +84,9 @@ jobs: fetch-depth: 1 - name: Shallow submodule init - run: git submodule update --init --recursive --depth 1 + run: | + git config --global --add safe.directory '*' + git submodule update --init --recursive --depth 1 - name: Cache CMake build (aarch64) uses: actions/cache@v4 diff --git a/src/arch/aarch64/link.ld b/src/arch/aarch64/link.ld index b49dbf0be..aa7ff4527 100644 --- a/src/arch/aarch64/link.ld +++ b/src/arch/aarch64/link.ld @@ -168,11 +168,9 @@ SECTIONS *(.dynbss) *(.bss .bss.* .gnu.linkonce.b.*) *(COMMON) - /* Align here to ensure that the .bss section occupies space up to - _end. Align after .bss to ensure correct alignment even if the - .bss section disappears because there are no input sections. - FIXME: Why do we need it? When there is no .bss section, we do not - pad the .data section. */ + /* Align here to ensure that the .bss section occupies space up to + _end. Align after .bss to ensure correct alignment even if the + .bss section disappears because there are no input sections. */ . = ALIGN(. != 0 ? 
64 / 8 : 1); } _bss_end__ = .; __bss_end__ = .; diff --git a/src/arch/riscv64/link.ld b/src/arch/riscv64/link.ld index 4b2feeaad..dc24602df 100644 --- a/src/arch/riscv64/link.ld +++ b/src/arch/riscv64/link.ld @@ -178,10 +178,8 @@ SECTIONS { *(.bss .bss.* .gnu.linkonce.b.*) *(COMMON) /* Align here to ensure that the .bss section occupies space up to - _end. Align after .bss to ensure correct alignment even if the - .bss section disappears because there are no input sections. - FIXME: Why do we need it? When there is no .bss section, we do not - pad the .data section. */ + _end. Align after .bss to ensure correct alignment even if the + .bss section disappears because there are no input sections. */ . = ALIGN(. != 0 ? 64 / 8 : 1); } . = ALIGN(64 / 8); diff --git a/src/task/AGENTS.md b/src/task/AGENTS.md index ea1ab6edd..3ea4b19bc 100644 --- a/src/task/AGENTS.md +++ b/src/task/AGENTS.md @@ -41,7 +41,7 @@ mutex.cpp # Mutex implementation (uses SpinLock internally) - Schedulers own their internal run queues — TaskManager dispatches to per-policy schedulers - `TaskManagerSingleton::instance()` (defined in `task_manager.hpp`) is the global entry point - TCB contains arch-specific context pointer — populated by `switch.S` -- TODO in `task_manager.cpp`: task stealing across cores (not yet implemented) +- `Balance()` in `task_manager.cpp`: cross-core work-stealing (steals kNormal tasks from most-loaded core, called every 64 ticks) ## ANTI-PATTERNS - **DO NOT** call Schedule() before TaskManager initialization in boot sequence diff --git a/src/task/task_manager.cpp b/src/task/task_manager.cpp index 6286e12fc..e95467efa 100644 --- a/src/task/task_manager.cpp +++ b/src/task/task_manager.cpp @@ -179,8 +179,82 @@ auto TaskManager::FindTask(Pid pid) -> TaskControlBlock* { } auto TaskManager::Balance() -> void { - // 算法留空 - // TODO: 检查其他核心的运行队列长度,如果比当前核心长,则窃取任务 + auto current_core = cpu_io::GetCurrentCoreId(); + auto& current_sched = cpu_schedulers_[current_core]; + + // 获取当前核心 
kNormal 队列长度(无锁快速检查) + size_t current_load = 0; + if (current_sched.schedulers[static_cast<size_t>(SchedPolicy::kNormal)]) { + current_load = + current_sched.schedulers[static_cast<size_t>(SchedPolicy::kNormal)] + ->GetQueueSize(); + } + + // 寻找负载最高的核心 + size_t max_load = 0; + size_t max_core = current_core; + + for (size_t core_id = 0; core_id < SIMPLEKERNEL_MAX_CORE_COUNT; ++core_id) { + if (core_id == current_core) { + continue; + } + auto& other_sched = cpu_schedulers_[core_id]; + if (other_sched.schedulers[static_cast<size_t>(SchedPolicy::kNormal)]) { + size_t load = + other_sched.schedulers[static_cast<size_t>(SchedPolicy::kNormal)] + ->GetQueueSize(); + if (load > max_load) { + max_load = load; + max_core = core_id; + } + } + } + + // 仅当差值 > 1 时才窃取(避免 ping-pong) + if (max_core == current_core || max_load <= current_load + 1) { + return; + } + + // 按核心 ID 顺序获取锁,防止死锁 + auto& source_sched = cpu_schedulers_[max_core]; + size_t first_core = (current_core < max_core) ? current_core : max_core; + size_t second_core = (current_core < max_core) ? 
max_core : current_core; + + LockGuard lock_first(cpu_schedulers_[first_core].lock); + LockGuard lock_second(cpu_schedulers_[second_core].lock); + + // 重新检查(持锁后条件可能已变化) + auto* source_scheduler = + source_sched.schedulers[static_cast<size_t>(SchedPolicy::kNormal)].get(); + auto* dest_scheduler = + current_sched.schedulers[static_cast<size_t>(SchedPolicy::kNormal)] + .get(); + + if (!source_scheduler || !dest_scheduler) { + return; + } + + size_t source_load = source_scheduler->GetQueueSize(); + size_t dest_load = dest_scheduler->GetQueueSize(); + + if (source_load <= dest_load + 1) { + return; + } + + auto* stolen = source_scheduler->PickNext(); + if (!stolen) { + return; + } + + if (stolen->aux && stolen->aux->cpu_affinity.value() != UINT64_MAX && + !(stolen->aux->cpu_affinity.value() & (1UL << current_core))) { + source_scheduler->Enqueue(stolen); + return; + } + + dest_scheduler->Enqueue(stolen); + klog::Debug("Balance: Stole task '{}' (pid={}) from core {} to core {}", + stolen->name, stolen->pid, max_core, current_core); } auto TaskManager::ReapTask(TaskControlBlock* task) -> void { diff --git a/src/task/tick_update.cpp b/src/task/tick_update.cpp index 095acc0e8..8671f41b8 100644 --- a/src/task/tick_update.cpp +++ b/src/task/tick_update.cpp @@ -67,6 +67,10 @@ auto TaskManager::TickUpdate() -> void { } } + if (cpu_sched.scheduler_started && (cpu_sched.local_tick % 64) == 0) { + Balance(); + } + if (need_preempt && cpu_sched.scheduler_started) { Schedule(); } diff --git a/tests/integration_test/aarch64_minimal/main.cpp b/tests/integration_test/aarch64_minimal/main.cpp index 53ad12ed4..f79c0ef8b 100644 --- a/tests/integration_test/aarch64_minimal/main.cpp +++ b/tests/integration_test/aarch64_minimal/main.cpp @@ -1,6 +1,6 @@ /** * @copyright Copyright The SimpleKernel Contributors - * @brief TODO: Add description + * @brief AArch64 minimal boot test */ #include diff --git a/tests/system_test/CMakeLists.txt b/tests/system_test/CMakeLists.txt index 229eb22cd..07fe5725d 100644 --- 
a/tests/system_test/CMakeLists.txt +++ b/tests/system_test/CMakeLists.txt @@ -34,6 +34,7 @@ ADD_EXECUTABLE ( tick_test.cpp zombie_reap_test.cpp stress_test.cpp + balance_test.cpp ${CMAKE_SOURCE_DIR}/src/syscall.cpp ${CMAKE_SOURCE_DIR}/src/io_buffer.cpp) diff --git a/tests/system_test/balance_test.cpp b/tests/system_test/balance_test.cpp new file mode 100644 index 000000000..fcba8bd12 --- /dev/null +++ b/tests/system_test/balance_test.cpp @@ -0,0 +1,243 @@ +/** + * @copyright Copyright The SimpleKernel Contributors + */ + +#include <atomic> +#include <cstdint> + +#include "basic_info.hpp" +#include "kernel.h" +#include "kernel_log.hpp" +#include "kstd_libcxx.h" +#include "kstd_memory" +#include "per_cpu.hpp" +#include "syscall.hpp" +#include "system_test.h" +#include "task_control_block.hpp" +#include "task_manager.hpp" + +namespace { + +std::atomic<int> g_tests_completed{0}; +std::atomic<int> g_tests_failed{0}; + +// =========================================================================== +// test_balance_imbalanced_load +// +// AddTask() places unpinned tasks on the caller's current core, so spawning +// many tasks from one core creates a natural imbalance. Balance() (called +// every 64 ticks) should steal tasks to idle cores. +// +// Strategy: +// 1. Spawn N long-lived unpinned workers (they sleep in a loop so Balance() +// has time to migrate them). +// 2. Each worker periodically records which core it runs on via atomic OR. +// 3. After all workers complete, verify the combined core mask has >= 2 bits +// set — meaning at least one task was migrated by Balance(). +// +// Workers must live long enough for multiple Balance() intervals (64 ticks +// each). Using sys_sleep(50) in a loop ensures they remain in the scheduler +// queues across multiple ticks. 
+// =========================================================================== + +constexpr int kImbalanceWorkerCount = 8; +std::atomic<int> g_imbalance_done{0}; +std::atomic<uint64_t> g_cores_used_mask{0}; + +void imbalance_worker(void* /*arg*/) { + // Stay alive across multiple Balance() intervals. + // Record core ID on each wakeup — after migration we'll see a different + // core. + for (int i = 0; i < 20; ++i) { + auto core_id = cpu_io::GetCurrentCoreId(); + g_cores_used_mask.fetch_or(1UL << core_id, std::memory_order_relaxed); + (void)sys_sleep(10); + } + + g_imbalance_done.fetch_add(1, std::memory_order_release); + sys_exit(0); +} + +void test_balance_imbalanced_load(void* /*arg*/) { + klog::Info("=== Balance: Imbalanced Load Test ==="); + + g_imbalance_done = 0; + g_cores_used_mask = 0; + bool passed = true; + + auto& tm = TaskManagerSingleton::instance(); + auto* self = tm.GetCurrentTask(); + + // All workers are unpinned (default affinity = UINT64_MAX). + // AddTask() places them on the caller's current core, creating a heavy + // imbalance that Balance() should correct. + for (int i = 0; i < kImbalanceWorkerCount; ++i) { + auto task = kstd::make_unique<TaskControlBlock>("BalWorker", 10, + imbalance_worker, nullptr); + task->aux->parent_pid = self->pid; + task->aux->pgid = self->aux->pgid; + tm.AddTask(std::move(task)); + } + + // Wait for all workers to finish + int timeout = 600; + while (timeout > 0 && g_imbalance_done.load(std::memory_order_acquire) < + kImbalanceWorkerCount) { + (void)sys_sleep(50); + timeout--; + } + + if (g_imbalance_done.load() != kImbalanceWorkerCount) { + klog::Err( + "test_balance_imbalanced_load: FAIL — only {}/{} workers finished", + g_imbalance_done.load(), kImbalanceWorkerCount); + passed = false; + } + + // With 8 workers all initially on one core and Balance() active, + // tasks should have been migrated to run on more than one core. 
+ uint64_t mask = g_cores_used_mask.load(std::memory_order_acquire); + int cores_used = __builtin_popcountll(mask); + if (cores_used < 2) { + klog::Err( + "test_balance_imbalanced_load: FAIL — tasks only used {} core(s), " + "expected >= 2 (mask={:#x})", + cores_used, mask); + passed = false; + } else { + klog::Info( + "test_balance_imbalanced_load: tasks used {} core(s) (mask={:#x})", + cores_used, mask); + } + + if (passed) { + klog::Info("Balance: Imbalanced Load Test: PASSED"); + } else { + klog::Err("Balance: Imbalanced Load Test: FAILED"); + g_tests_failed++; + } + + g_tests_completed++; + sys_exit(0); +} + +// =========================================================================== +// test_balance_respects_affinity +// +// Pin all tasks to core 0. Even though core 1 is idle, Balance() must NOT +// migrate pinned tasks. Each worker records its core on every wakeup; +// the combined mask must only contain core 0. +// =========================================================================== + +constexpr int kAffinityWorkerCount = 4; +std::atomic<int> g_affinity_done{0}; +std::atomic<uint64_t> g_affinity_cores_mask{0}; + +void affinity_pinned_worker(void* /*arg*/) { + for (int i = 0; i < 10; ++i) { + auto core_id = cpu_io::GetCurrentCoreId(); + g_affinity_cores_mask.fetch_or(1UL << core_id, std::memory_order_relaxed); + (void)sys_sleep(10); + } + + g_affinity_done.fetch_add(1, std::memory_order_release); + sys_exit(0); +} + +void test_balance_respects_affinity(void* /*arg*/) { + klog::Info("=== Balance: Respects Affinity Test ==="); + + g_affinity_done = 0; + g_affinity_cores_mask = 0; + bool passed = true; + + auto& tm = TaskManagerSingleton::instance(); + auto* self = tm.GetCurrentTask(); + + // Pin all workers to core 0 + for (int i = 0; i < kAffinityWorkerCount; ++i) { + auto task = kstd::make_unique<TaskControlBlock>( + "BalPinned", 10, affinity_pinned_worker, nullptr); + task->aux->parent_pid = self->pid; + task->aux->pgid = self->aux->pgid; + task->aux->cpu_affinity = (1UL << 0); + 
tm.AddTask(std::move(task)); + } + + // Wait for all workers + int timeout = 400; + while (timeout > 0 && g_affinity_done.load(std::memory_order_acquire) < + kAffinityWorkerCount) { + (void)sys_sleep(50); + timeout--; + } + + if (g_affinity_done.load() != kAffinityWorkerCount) { + klog::Err( + "test_balance_respects_affinity: FAIL — only {}/{} workers finished", + g_affinity_done.load(), kAffinityWorkerCount); + passed = false; + } + + // All tasks pinned to core 0 should only have run on core 0 + uint64_t mask = g_affinity_cores_mask.load(std::memory_order_acquire); + if (mask != (1UL << 0)) { + klog::Err( + "test_balance_respects_affinity: FAIL — pinned tasks ran on " + "cores {:#x}, expected only core 0 (0x1)", + mask); + passed = false; + } + + if (passed) { + klog::Info("Balance: Respects Affinity Test: PASSED"); + } else { + klog::Err("Balance: Respects Affinity Test: FAILED"); + g_tests_failed++; + } + + g_tests_completed++; + sys_exit(0); +} + +} // namespace + +auto balance_test() -> bool { + klog::Info("===== Balance System Test Start ====="); + + auto core_count = BasicInfoSingleton::instance().core_count; + if (core_count < 2) { + klog::Info("Skipping balance tests: need >= 2 cores, have {}", core_count); + return true; + } + + g_tests_completed = 0; + g_tests_failed = 0; + + auto& tm = TaskManagerSingleton::instance(); + + auto t1 = kstd::make_unique<TaskControlBlock>( + "TestBalImbalance", 10, test_balance_imbalanced_load, nullptr); + tm.AddTask(std::move(t1)); + + auto t2 = kstd::make_unique<TaskControlBlock>( + "TestBalAffinity", 10, test_balance_respects_affinity, nullptr); + tm.AddTask(std::move(t2)); + + constexpr int kExpectedTests = 2; + int timeout = 600; + while (timeout > 0) { + (void)sys_sleep(50); + if (g_tests_completed.load() >= kExpectedTests) { + break; + } + timeout--; + } + + EXPECT_EQ(g_tests_completed.load(), kExpectedTests, + "All balance sub-tests completed"); + EXPECT_EQ(g_tests_failed.load(), 0, "No balance sub-tests failed"); + + klog::Info("===== Balance System 
Test End ====="); + return true; +} diff --git a/tests/system_test/main.cpp b/tests/system_test/main.cpp index 0f2c45b66..e1159b7e6 100644 --- a/tests/system_test/main.cpp +++ b/tests/system_test/main.cpp @@ -28,7 +28,7 @@ struct test_case { bool is_smp_test = false; }; -constexpr size_t kTestCount = 26; +constexpr size_t kTestCount = 27; std::array<test_case, kTestCount> test_cases = { test_case{"ctor_dtor_test", ctor_dtor_test, false}, @@ -57,6 +57,7 @@ std::array<test_case, kTestCount> test_cases = { test_case{"stress_test", stress_test, false}, test_case{"ramfs_test", ramfs_test, false}, test_case{"fatfs_test", fatfs_test, false}, + test_case{"balance_test", balance_test, false}, }; std::array test_results{}; diff --git a/tests/system_test/system_test.h b/tests/system_test/system_test.h index 6817a1ee5..31f36209e 100644 --- a/tests/system_test/system_test.h +++ b/tests/system_test/system_test.h @@ -198,6 +198,7 @@ auto affinity_test() -> bool; auto tick_test() -> bool; auto zombie_reap_test() -> bool; auto stress_test() -> bool; +auto balance_test() -> bool; // =========================================================================== // QEMU exit diff --git a/tests/unit_test/CMakeLists.txt b/tests/unit_test/CMakeLists.txt index 2ec54bbdd..77f27a18f 100644 --- a/tests/unit_test/CMakeLists.txt +++ b/tests/unit_test/CMakeLists.txt @@ -32,7 +32,8 @@ ADD_EXECUTABLE ( ${CMAKE_SOURCE_DIR}/src/task/tick_update.cpp ${CMAKE_SOURCE_DIR}/src/task/wait.cpp virtio_driver_test.cpp - dma_region_test.cpp) + dma_region_test.cpp + balance_test.cpp) TARGET_COMPILE_DEFINITIONS ( ${PROJECT_NAME} diff --git a/tests/unit_test/balance_test.cpp b/tests/unit_test/balance_test.cpp new file mode 100644 index 000000000..478ffb014 --- /dev/null +++ b/tests/unit_test/balance_test.cpp @@ -0,0 +1,164 @@ +/** + * @copyright Copyright The SimpleKernel Contributors + */ + +#include <gtest/gtest.h> + +#include "rr_scheduler.hpp" +#include "task_control_block.hpp" + +/// @brief Test that stealing from a loaded RR scheduler works correctly. 
+/// This validates the core primitive that Balance() relies on: +/// Dequeue from source scheduler, Enqueue to destination scheduler. +TEST(BalanceTest, StealFromLoadedScheduler) { + RoundRobinScheduler source; + RoundRobinScheduler dest; + + // Source has 4 tasks, dest has 0 + TaskControlBlock task1("Task1", 1, nullptr, nullptr); + TaskControlBlock task2("Task2", 2, nullptr, nullptr); + TaskControlBlock task3("Task3", 3, nullptr, nullptr); + TaskControlBlock task4("Task4", 4, nullptr, nullptr); + + source.Enqueue(&task1); + source.Enqueue(&task2); + source.Enqueue(&task3); + source.Enqueue(&task4); + + EXPECT_EQ(source.GetQueueSize(), 4); + EXPECT_EQ(dest.GetQueueSize(), 0); + + // Simulate steal: pick from source, enqueue to dest + auto* stolen = source.PickNext(); + ASSERT_NE(stolen, nullptr); + dest.Enqueue(stolen); + + EXPECT_EQ(source.GetQueueSize(), 3); + EXPECT_EQ(dest.GetQueueSize(), 1); +} + +/// @brief Test that no stealing occurs when queues are balanced. +TEST(BalanceTest, NoStealWhenBalanced) { + RoundRobinScheduler sched1; + RoundRobinScheduler sched2; + + TaskControlBlock task1("Task1", 1, nullptr, nullptr); + TaskControlBlock task2("Task2", 2, nullptr, nullptr); + + sched1.Enqueue(&task1); + sched2.Enqueue(&task2); + + // Both have 1 task — balanced, no steal needed + EXPECT_EQ(sched1.GetQueueSize(), 1); + EXPECT_EQ(sched2.GetQueueSize(), 1); + + // Difference is 0, which is <= 1 threshold — no steal + size_t diff = sched1.GetQueueSize() > sched2.GetQueueSize() + ? sched1.GetQueueSize() - sched2.GetQueueSize() + : sched2.GetQueueSize() - sched1.GetQueueSize(); + EXPECT_LE(diff, 1); +} + +/// @brief Test that stealing only picks from kNormal (RR) queues. +/// kRealTime (FIFO) and kIdle tasks should never be stolen. 
+TEST(BalanceTest, OnlyStealNormalPolicyTasks) { + RoundRobinScheduler rr_source; + + // Only tasks with kNormal policy go into RR scheduler + TaskControlBlock normal_task("NormalTask", 5, nullptr, nullptr); + normal_task.policy = SchedPolicy::kNormal; + + rr_source.Enqueue(&normal_task); + EXPECT_EQ(rr_source.GetQueueSize(), 1); + + auto* stolen = rr_source.PickNext(); + ASSERT_NE(stolen, nullptr); + EXPECT_EQ(stolen->policy, SchedPolicy::kNormal); +} + +/// @brief Test steal-one-at-a-time semantics. +TEST(BalanceTest, StealOnlyOneTask) { + RoundRobinScheduler source; + + TaskControlBlock task1("Task1", 1, nullptr, nullptr); + TaskControlBlock task2("Task2", 2, nullptr, nullptr); + TaskControlBlock task3("Task3", 3, nullptr, nullptr); + + source.Enqueue(&task1); + source.Enqueue(&task2); + source.Enqueue(&task3); + + // Steal exactly one + auto* stolen = source.PickNext(); + ASSERT_NE(stolen, nullptr); + + // Source should still have 2 tasks + EXPECT_EQ(source.GetQueueSize(), 2); +} + +/// @brief Test that stealing from empty source returns nullptr. +TEST(BalanceTest, StealFromEmptyScheduler) { + RoundRobinScheduler source; + + EXPECT_TRUE(source.IsEmpty()); + EXPECT_EQ(source.PickNext(), nullptr); +} + +/// @brief Test load imbalance detection across multiple schedulers. 
+TEST(BalanceTest, FindMostLoadedCore) { + constexpr size_t kCoreCount = 4; + RoundRobinScheduler schedulers[kCoreCount]; + + // Core 0: 1 task, Core 1: 5 tasks, Core 2: 2 tasks, Core 3: 0 tasks + TaskControlBlock tasks[8] = { + {"T0", 1, nullptr, nullptr}, {"T1", 1, nullptr, nullptr}, + {"T2", 1, nullptr, nullptr}, {"T3", 1, nullptr, nullptr}, + {"T4", 1, nullptr, nullptr}, {"T5", 1, nullptr, nullptr}, + {"T6", 1, nullptr, nullptr}, {"T7", 1, nullptr, nullptr}, + }; + + schedulers[0].Enqueue(&tasks[0]); + schedulers[1].Enqueue(&tasks[1]); + schedulers[1].Enqueue(&tasks[2]); + schedulers[1].Enqueue(&tasks[3]); + schedulers[1].Enqueue(&tasks[4]); + schedulers[1].Enqueue(&tasks[5]); + schedulers[2].Enqueue(&tasks[6]); + schedulers[2].Enqueue(&tasks[7]); + + // Find the most loaded core + size_t max_load = 0; + size_t max_core = 0; + for (size_t i = 0; i < kCoreCount; ++i) { + size_t load = schedulers[i].GetQueueSize(); + if (load > max_load) { + max_load = load; + max_core = i; + } + } + + EXPECT_EQ(max_core, 1); + EXPECT_EQ(max_load, 5); + + // Core 3 is the least loaded (0 tasks) — it would be the stealer + size_t min_load = max_load; + size_t min_core = 0; + for (size_t i = 0; i < kCoreCount; ++i) { + size_t load = schedulers[i].GetQueueSize(); + if (load < min_load) { + min_load = load; + min_core = i; + } + } + + EXPECT_EQ(min_core, 3); + EXPECT_EQ(min_load, 0); + + // Steal one task from core 1 to core 3 + auto* stolen = schedulers[max_core].PickNext(); + ASSERT_NE(stolen, nullptr); + schedulers[min_core].Enqueue(stolen); + + EXPECT_EQ(schedulers[1].GetQueueSize(), 4); + EXPECT_EQ(schedulers[3].GetQueueSize(), 1); +}