diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index c251ab69e..10edaea33 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -9,7 +9,7 @@ permissions: contents: write env: - RELEASE: Release v3.3.1 + RELEASE: Release v3.3.2 FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true name: Release @@ -19,7 +19,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - branch: [v1.9.9,v2.3.1,v3.3.1] # [dev,dev2,dev3] + branch: [v1.9.10,v2.3.2,v3.3.2] # [dev,dev2,dev3] # we build on the oldest ubuntu version for better binary compatibility. os: [windows-latest, macOS-latest, macos-15-intel, ubuntu-22.04, ubuntu-22.04-arm] diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml new file mode 100644 index 000000000..dff9cb656 --- /dev/null +++ b/.github/workflows/stale.yaml @@ -0,0 +1,27 @@ +on: + workflow_dispatch: # allow running the workflow manually + schedule: + - cron: "15 21 * * *" # minute, hour, day (1-31), month (1-12), day of the week (0 - 6 or SUN-SAT) + +name: Close inactive issues +jobs: + close-issues: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v10 + with: + days-before-issue-stale: 360 + days-before-issue-close: 14 + stale-issue-label: "stale" + stale-issue-message: "This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs in the next 14 days. Thank you for your contributions!" + close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale. Please feel free to reopen if this is still an active issue." + days-before-pr-stale: -1 + days-before-pr-close: -1 + stale-pr-label: "stale" + stale-pr-message: "This PR has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs in the next 14 days. Thank you for your contributions!" 
+ close-pr-message: "This PR was closed because it has been inactive for 14 days since being marked as stale. Please feel free to reopen if you think this PR should still be considered. Thank you again for your help." + operations-per-run: 32 + repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake index 4141e4cff..649eb71d0 100644 --- a/cmake/mimalloc-config-version.cmake +++ b/cmake/mimalloc-config-version.cmake @@ -1,6 +1,6 @@ set(mi_version_major 2) set(mi_version_minor 3) -set(mi_version_patch 1) +set(mi_version_patch 2) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) diff --git a/contrib/vcpkg/portfile.cmake b/contrib/vcpkg/portfile.cmake index 154a956f4..a1bce75d3 100644 --- a/contrib/vcpkg/portfile.cmake +++ b/contrib/vcpkg/portfile.cmake @@ -3,7 +3,7 @@ vcpkg_from_github( REPO microsoft/mimalloc HEAD_REF master - # The "REF" can be a commit hash, branch name (dev3), or a version (v3.3.1). + # The "REF" can be a commit hash, branch name (dev3), or a version (v3.3.2). REF "v${VERSION}" # REF e2db21e9ba9fb9172b7b0aa0fe9b8742525e8774 diff --git a/contrib/vcpkg/vcpkg.json b/contrib/vcpkg/vcpkg.json index 5e5e026e3..84ae68f32 100644 --- a/contrib/vcpkg/vcpkg.json +++ b/contrib/vcpkg/vcpkg.json @@ -1,6 +1,6 @@ { "name": "mimalloc", - "version": "3.3.0", + "version": "3.3.2", "port-version": 0, "description": "Compact general purpose allocator with excellent performance", "homepage": "https://github.com/microsoft/mimalloc", diff --git a/doc/release-notes.md b/doc/release-notes.md index 10052d572..b467c1468 100644 --- a/doc/release-notes.md +++ b/doc/release-notes.md @@ -10,6 +10,6 @@ Notes: - Generally it is recommended to download sources (or use `vcpkg` etc.) and build mimalloc as part of your project. - Source releases can also be downloaded directly from github by the tag. - For example . + For example . 
- Binary releases include a release-, debug-, and secure build. - Linux binaries are built on Ubuntu 22. diff --git a/include/mimalloc.h b/include/mimalloc.h index bf45033d3..b61349ce6 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 20301 // major + 2 digits minor + 2 digits patch +#define MI_MALLOC_VERSION 20302 // major + 2 digits minor + 2 digits patch // ------------------------------------------------------ // Compiler specific attributes @@ -380,7 +380,7 @@ typedef mi_heap_t mi_theap_t; #define mi_theap_collect(hp,force) mi_heap_collect(hp,force) #define mi_theap_malloc(hp,sz) mi_heap_malloc(hp,sz) #define mi_theap_zalloc(hp,sz) mi_heap_zalloc(hp,sz) -#define mi_theap_calloc(hp,cnt,sz) mi_heap_malloc(hp,cnt,sz) +#define mi_theap_calloc(hp,cnt,sz) mi_heap_calloc(hp,cnt,sz) #define mi_theap_malloc_small(hp,sz) mi_heap_malloc_small(hp,sz) #define mi_theap_malloc_aligned(hp,sz,a) mi_heap_malloc_aligned(hp,sz,a) #define mi_theap_realloc(hp,p,newsz) mi_heap_realloc(hp,p,newsz) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 917b7f670..5ac8da6f9 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -155,15 +155,17 @@ static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) { #elif defined(_MSC_VER) // Deprecated: MSVC plain C compilation wrapper that uses Interlocked operations to model C11 atomics. -// It is recommended to always compile as C++ when using MSVC +// It is recommended to always compile as C++ when using MSVC. 
#include #ifdef _WIN64 -typedef LONG64 msc_intptr_t; -#define MI_64(f) f##64 +typedef LONG64 msc_intptr_t; +#define MI_MSC_64(f) f##64 +#define MI_MSC_XX(f) f##64 #else -typedef LONG msc_intptr_t; -#define MI_64(f) f +typedef LONG msc_intptr_t; +#define MI_MSC_64(f) f +#define MI_MSC_XX(f) f##32 #endif typedef enum mi_memory_order_e { @@ -177,23 +179,23 @@ typedef enum mi_memory_order_e { static inline uintptr_t mi_atomic_fetch_add_explicit(_Atomic(uintptr_t)*p, uintptr_t add, mi_memory_order mo) { (void)(mo); - return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add); + return (uintptr_t)MI_MSC_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add); } static inline uintptr_t mi_atomic_fetch_sub_explicit(_Atomic(uintptr_t)*p, uintptr_t sub, mi_memory_order mo) { (void)(mo); - return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, -((msc_intptr_t)sub)); + return (uintptr_t)MI_MSC_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, -((msc_intptr_t)sub)); } static inline uintptr_t mi_atomic_fetch_and_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) { (void)(mo); - return (uintptr_t)MI_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x); + return (uintptr_t)MI_MSC_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x); } static inline uintptr_t mi_atomic_fetch_or_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) { (void)(mo); - return (uintptr_t)MI_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x); + return (uintptr_t)MI_MSC_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x); } static inline bool mi_atomic_compare_exchange_strong_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) { (void)(mo1); (void)(mo2); - uintptr_t read = (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)(*expected)); + const 
uintptr_t read = (uintptr_t)MI_MSC_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)(*expected)); if (read == *expected) { return true; } @@ -207,68 +209,119 @@ static inline bool mi_atomic_compare_exchange_weak_explicit(_Atomic(uintptr_t)*p } static inline uintptr_t mi_atomic_exchange_explicit(_Atomic(uintptr_t)*p, uintptr_t exchange, mi_memory_order mo) { (void)(mo); - return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange); + return (uintptr_t)MI_MSC_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange); } static inline void mi_atomic_thread_fence(mi_memory_order mo) { (void)(mo); _Atomic(uintptr_t) x = 0; mi_atomic_exchange_explicit(&x, 1, mo); } + static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_memory_order mo) { (void)(mo); -#if defined(_M_IX86) || defined(_M_X64) - return *p; -#else - uintptr_t x = *p; - if (mo > mi_memory_order_relaxed) { - while (!mi_atomic_compare_exchange_weak_explicit((_Atomic(uintptr_t)*)p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ }; - } - return x; -#endif + // assert(mo<=mi_memory_order_acquire); // others are not used by mimalloc + #if defined(_M_IX86) || defined(_M_X64) + return (uintptr_t)MI_MSC_XX(__iso_volatile_load)((volatile const intptr_t*)p); + #elif defined(_M_ARM) || defined(_M_ARM64) + if (mo == mi_memory_order_relaxed) { + return (uintptr_t)MI_MSC_XX(__iso_volatile_load)((volatile const intptr_t*)p); + } + else if (mo <= mi_memory_order_acquire) { + return MI_MSC_XX(__ldar)((volatile const uintptr_t*)p); + } + else { + const uintptr_t u = (uintptr_t)MI_MSC_XX(__iso_volatile_load)((volatile const intptr_t*)p); + __dmb(15); // _ARM(64)_BARRIER_SY + return u; + } + #else + #warning "define mi_atomic_load_explicit for MSVC C compilation on this platform (which should be readonly, see issue #1277)" + return MI_MSC_XX(__iso_volatile_load)((volatile const intptr_t*)p); + 
#endif } static inline void mi_atomic_store_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) { (void)(mo); -#if defined(_M_IX86) || defined(_M_X64) - *p = x; -#else - mi_atomic_exchange_explicit(p, x, mo); -#endif + // assert(mo<=mi_memory_order_release); // others are not used by mimalloc + #if defined(_M_IX86) || defined(_M_X64) + MI_MSC_XX(__iso_volatile_store)((volatile intptr_t*)p, x); + #elif defined(_M_ARM) || defined(_M_ARM64) + if (mo == mi_memory_order_relaxed) { + MI_MSC_XX(__iso_volatile_store)((volatile intptr_t*)p, x); + } + else if (mo <= mi_memory_order_release) { + MI_MSC_XX(__stlr)((volatile uintptr_t*)p,x); + } + else { + mi_atomic_exchange_explicit(p, x, mo); + } + #else + mi_atomic_exchange_explicit(p, x, mo); + #endif } + static inline int64_t mi_atomic_loadi64_explicit(_Atomic(int64_t)*p, mi_memory_order mo) { (void)(mo); -#if defined(_M_X64) - return *p; -#else - int64_t old = *p; - int64_t x = old; - while ((old = InterlockedCompareExchange64(p, x, old)) != x) { - x = old; - } - return x; -#endif + // assert(mo<=mi_memory_order_acquire); // others are not used by mimalloc + #if defined(_M_IX86) || defined(_M_X64) + return __iso_volatile_load64((volatile const int64_t*)p); + #elif defined(_M_ARM) || defined(_M_ARM64) + if (mo == mi_memory_order_relaxed) { + return __iso_volatile_load64((volatile const int64_t*)p); + } + #if defined(_M_ARM64) + else if (mo <= mi_memory_order_acquire) { + return __ldar64((volatile const uintptr_t*)p); + } + #endif + else { + const int64_t i = __iso_volatile_load64((volatile const int64_t*)p); + __dmb(15); // _ARM(64)_BARRIER_SY + return i; + } + #else + #warning "define mi_atomic_loadi64_explicit for MSVC C compilation on this platform (which should be readonly, see issue #1277)" + return __iso_volatile_load64((volatile const int64_t*)p); + #endif } + static inline void mi_atomic_storei64_explicit(_Atomic(int64_t)*p, int64_t x, mi_memory_order mo) { (void)(mo); -#if defined(_M_X64) - *p = x; 
-#else - InterlockedExchange64(p, x); -#endif + // assert(mo<=mi_memory_order_release); // others are not used by mimalloc + #if defined(_M_IX86) || defined(_M_X64) + __iso_volatile_store64((volatile int64_t*)p,x); + #elif defined(_M_ARM) || defined(_M_ARM64) + if (mo == mi_memory_order_relaxed) { + __iso_volatile_store64((volatile int64_t*)p,x); + } + #if defined(_M_ARM64) + else if (mo == mi_memory_order_release) { + __stlr64((volatile uint64_t*)p, (uint64_t)x); + } + #endif + else { + InterlockedExchange64(p, x); + } + #else + InterlockedExchange64(p, x); + #endif } // These are used by the statistics static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)*p, int64_t add) { -#ifdef _WIN64 - return (int64_t)mi_atomic_addi((int64_t*)p, add); -#else - int64_t current; - int64_t sum; - do { - current = *p; - sum = current + add; - } while (_InterlockedCompareExchange64(p, sum, current) != current); - return current; -#endif + #ifdef _WIN64 + return (int64_t)mi_atomic_addi((int64_t*)p, add); + #elif defined(_M_ARM) + return _InterlockedExchangeAdd64(p, add); + #else + // x86 + int64_t current; + int64_t sum; + do { + current = __iso_volatile_load64((volatile const int64_t*)p); + sum = current + add; + } while (_InterlockedCompareExchange64(p, sum, current) != current); + return current; + #endif } static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, const volatile int64_t* padd) { const int64_t add = *padd; @@ -289,7 +342,7 @@ static inline void mi_atomic_addi64_acq_rel(volatile _Atomic(int64_t*)p, int64_t } static inline bool mi_atomic_casi64_strong_acq_rel(volatile _Atomic(int64_t*)p, int64_t* exp, int64_t des) { - int64_t read = _InterlockedCompareExchange64(p, des, *exp); + const int64_t read = _InterlockedCompareExchange64(p, des, *exp); if (read == *exp) { return true; } diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index fe3528fac..4553c0fb3 100644 --- a/include/mimalloc/internal.h +++ 
b/include/mimalloc/internal.h @@ -352,7 +352,7 @@ mi_decl_noreturn mi_decl_cold void _mi_assert_fail(const char* assertion, const Inlined definitions ----------------------------------------------------------- */ #define MI_UNUSED(x) (void)(x) -#if (MI_DEBUG>0) +#if (MI_DEBUG>1) #define MI_UNUSED_RELEASE(x) #else #define MI_UNUSED_RELEASE(x) MI_UNUSED(x) @@ -378,8 +378,7 @@ static inline bool _mi_is_power_of_two(uintptr_t x) { // Is a pointer aligned? static inline bool _mi_is_aligned(void* p, size_t alignment) { - mi_assert_internal(alignment != 0); - return (((uintptr_t)p % alignment) == 0); + return (alignment==0 || ((uintptr_t)p % alignment) == 0); } // Align upwards diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 82c4a9ab5..b963ad5dc 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -282,7 +282,9 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; #if defined(MI_PRIM_THREAD_ID) static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { - return MI_PRIM_THREAD_ID(); // used for example by CPython for a free threaded build (see python/cpython#115488) + const mi_threadid_t tid = MI_PRIM_THREAD_ID(); // used for example by CPython for a free threaded build (see python/cpython#115488) + mi_assert_internal( (tid & 0x03) == 0 ); // mimalloc reserves the bottom 2 bits + return tid; } #elif defined(_WIN32) diff --git a/readme.md b/readme.md index 220de0544..bb788c307 100644 --- a/readme.md +++ b/readme.md @@ -15,9 +15,9 @@ is a general purpose allocator with excellent [performance](#performance) charac Initially developed by Daan Leijen for the runtime systems of the [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages. -Latest release : `v3.3.1` (2026-04-20) recommended. -Latest v2 release: `v2.3.1` (2026-04-20) stable. -Latest v1 release: `v1.9.9` (2026-04-20) legacy. +Latest release : `v3.3.2` (2026-04-29) recommended. 
+Latest v2 release: `v2.3.2` (2026-04-29) stable. +Latest v1 release: `v1.9.10` (2026-04-29) legacy. mimalloc is a drop-in replacement for `malloc` and can be used in other programs without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as: @@ -88,6 +88,11 @@ New development is mostly on v3, while v1 and v2 are maintained with security an - __v1__: legacy version: initial design of mimalloc (release tags: `v1.9.x`, development branch `dev`). Send PR's against this version if possible. ### Releases +* 2026-04-29, `v1.9.10`, `v2.3.2`, `v3.3.2`: various bug and security fixes through LLM audit (by @Zoxc). + Only increase minimal purge size automatically if allow_thp is set to 2. Enable large OS alignment + on all platforms (fixing OS large pages on Windows). Fix accounting of committed memory on Linux/macOS. + Update MSVC atomics implementation when using C mode. Upstream Emscripten fixes. Proper atomic do-once + implementation. * 2026-04-20, `v1.9.9`, `v2.3.1`, `v3.3.1`: various bug and security fixes. Special thanks to @jinpzhanAMD, @res2k, and @GoldJohnKing for their help in improving Windows finalization, and @Zoxc for his help in finding various issues. 
diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 245c8355b..e29a46fc9 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -33,8 +33,9 @@ static mi_decl_noinline mi_decl_restrict void* mi_heap_malloc_guarded_aligned(mi return NULL; } const size_t oversize = size + alignment - 1; - void* base = _mi_heap_malloc_guarded(heap, oversize, zero); - void* p = _mi_align_up_ptr(base, alignment); + void* const base = _mi_heap_malloc_guarded(heap, oversize, zero); + if (base==NULL) return NULL; + void* const p = _mi_align_up_ptr(base, alignment); mi_track_align(base, p, (uint8_t*)p - (uint8_t*)base, size); mi_assert_internal(mi_usable_size(p) >= size); mi_assert_internal(_mi_is_aligned(p, alignment)); diff --git a/src/alloc-posix.c b/src/alloc-posix.c index 175bf15fb..60639ff49 100644 --- a/src/alloc-posix.c +++ b/src/alloc-posix.c @@ -61,14 +61,14 @@ int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept if (alignment==0 || !_mi_is_power_of_two(alignment)) return EINVAL; // not a power of 2 void* q = mi_malloc_aligned(size, alignment); if (q==NULL && size != 0) return ENOMEM; - mi_assert_internal(((uintptr_t)q % alignment) == 0); + mi_assert_internal(_mi_is_aligned(q,alignment)); *p = q; return 0; } mi_decl_nodiscard mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept { void* p = mi_malloc_aligned(size, alignment); - mi_assert_internal(((uintptr_t)p % alignment) == 0); + mi_assert_internal(_mi_is_aligned(p,alignment)); return p; } @@ -95,7 +95,7 @@ mi_decl_nodiscard mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size */ // C11 also requires alignment to be a power-of-two (and > 0) which is checked in mi_malloc_aligned void* p = mi_malloc_aligned(size, alignment); - mi_assert_internal(((uintptr_t)p % alignment) == 0); + mi_assert_internal(_mi_is_aligned(p,alignment)); return p; } diff --git a/src/alloc.c b/src/alloc.c index 682557293..313efa5a1 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ 
-708,23 +708,22 @@ mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, boo mi_block_t* const block = (mi_block_t*)_mi_malloc_generic(heap, req_size, false /* don't zero */, 0 /* huge_alignment */, NULL); if (block==NULL) return NULL; void* const p = mi_block_ptr_set_guarded(block, obj_size); + if (p == NULL) return NULL; if (zero) { _mi_memzero(p,obj_size); // we have to zero afterwards as padding might have written inside the block (if the `blocksize > reqsize + os_page_size`) } // stats - mi_track_malloc(p, obj_size, zero); - if (p != NULL) { - if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } - #if MI_STAT>1 - // adjust stats to only count the allocated size of the block (and not the guard page) - mi_heap_stat_adjust_decrease(heap, malloc_requested, req_size); - mi_heap_stat_increase(heap, malloc_requested, size); - #endif - _mi_stat_counter_increase(&heap->tld->stats.malloc_guarded_count, 1); - } + mi_track_malloc(p, obj_size, zero); + if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } + _mi_stat_counter_increase(&heap->tld->stats.malloc_guarded_count, 1); + #if MI_STAT>1 + // adjust stats to only count the allocated size of the block (and not the guard page) + mi_heap_stat_adjust_decrease(heap, malloc_requested, req_size); + mi_heap_stat_increase(heap, malloc_requested, size); + #endif #if MI_DEBUG>3 - if (p != NULL && zero) { + if (zero) { mi_assert_expensive(mi_mem_is_zero(p, size)); } #endif diff --git a/src/arena.c b/src/arena.c index b705d9f30..0ba4f9985 100644 --- a/src/arena.c +++ b/src/arena.c @@ -274,6 +274,8 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar const size_t stat_commit_size = commit_size - mi_arena_block_size(already_committed); bool commit_zero = false; if (!_mi_os_commit_ex(p, commit_size, &commit_zero, stat_commit_size)) { + // set all as uncommitted on commit failure + _mi_bitmap_unclaim_across(arena->blocks_committed, 
arena->field_count, needed_bcount, bitmap_index); memid->initially_committed = false; } else { @@ -392,7 +394,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t *a // commit eagerly? bool arena_commit = false; - if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } + if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit() || mi_option_is_enabled(mi_option_allow_large_os_pages); } else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); diff --git a/src/free.c b/src/free.c index cd8396c32..8762ae1e8 100644 --- a/src/free.c +++ b/src/free.c @@ -61,12 +61,18 @@ mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); size_t diff = (uint8_t*)p - page->page_start; - size_t adjust; + size_t adjust = 0; if mi_likely(page->block_size_shift != 0) { adjust = diff & (((size_t)1 << page->block_size_shift) - 1); } else { - adjust = diff % mi_page_block_size(page); + const size_t block_size = page->block_size; + if mi_likely(block_size != 0) { + adjust = diff % block_size; + } + else { + _mi_error_message(EFAULT, "reading from invalid page, possibly corrupted meta-data (address=%p, page=%p)\n", p, page); + } } return (mi_block_t*)((uintptr_t)p - adjust); @@ -144,7 +150,7 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms } } #endif - #if (MI_DEBUG>0 || MI_SECURE>=4) + #if (MI_DEBUG>0 || MI_SECURE>=3) if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) { _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p); return NULL; diff --git a/src/heap.c b/src/heap.c index e18479980..219fb8db4 100644 --- a/src/heap.c +++ b/src/heap.c @@ -589,7 +589,8 @@ bool _mi_heap_area_visit_blocks(const 
mi_heap_area_t* area, mi_page_t* page, mi_ mi_assert(page != NULL); if (page == NULL) return true; - _mi_page_free_collect(page,true); // collect both thread_delayed and local_free + // collect early so the page used count is reported correctly + // _mi_page_free_collect(page,true); // collect both thread_delayed and local_free mi_assert_internal(page->local_free == NULL); if (page->used == 0) return true; @@ -700,9 +701,11 @@ typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) { MI_UNUSED(heap); MI_UNUSED(pq); + if (page==NULL) return true; mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun; mi_heap_area_ex_t xarea; - xarea.page = page; + xarea.page = page; + _mi_page_free_collect(page,true); // collect early so the page->used is accurate _mi_heap_area_init(&xarea.area, page); return fun(heap, &xarea, arg); } diff --git a/src/init.c b/src/init.c index f1882108b..3cc32358b 100644 --- a/src/init.c +++ b/src/init.c @@ -144,7 +144,9 @@ mi_decl_cache_align static const mi_tld_t tld_empty = { }; mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { - return _mi_prim_thread_id(); + mi_threadid_t tid = _mi_prim_thread_id(); + mi_assert_internal( (tid & 0x03) == 0 ); // mimalloc reserves the bottom 2 bits + return tid; } // the thread-local default heap for allocation @@ -273,6 +275,7 @@ mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) { void mi_subproc_delete(mi_subproc_id_t subproc_id) { if (subproc_id == NULL) return; mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); + if (subproc==NULL) return; // check if there are no abandoned segments still.. 
bool safe_to_delete = false; mi_lock(&subproc->abandoned_os_lock) { @@ -481,11 +484,10 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { // Set up handlers so `mi_thread_done` is called automatically static void mi_process_setup_auto_thread_done(void) { - static bool tls_initialized = false; // fine if it races - if (tls_initialized) return; - tls_initialized = true; - _mi_prim_thread_init_auto_done(); - _mi_heap_set_default_direct(&_mi_heap_main); + mi_atomic_do_once { + _mi_prim_thread_init_auto_done(); + _mi_heap_set_default_direct(&_mi_heap_main); + } } diff --git a/src/options.c b/src/options.c index 53803a451..b02e011b7 100644 --- a/src/options.c +++ b/src/options.c @@ -338,33 +338,39 @@ static void mi_cdecl mi_out_stderr(const char* msg, void* arg) { #endif static char out_buf[MI_MAX_DELAY_OUTPUT+1]; static _Atomic(size_t) out_len; +static mi_lock_t out_buf_lock = MI_LOCK_INITIALIZER; static void mi_cdecl mi_out_buf(const char* msg, void* arg) { MI_UNUSED(arg); if (msg==NULL) return; - if (mi_atomic_load_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return; + if (mi_atomic_load_acquire(&out_len)>=MI_MAX_DELAY_OUTPUT) return; size_t n = _mi_strlen(msg); - if (n==0) return; - // claim space - size_t start = mi_atomic_add_acq_rel(&out_len, n); - if (start >= MI_MAX_DELAY_OUTPUT) return; - // check bound - if (start+n >= MI_MAX_DELAY_OUTPUT) { - n = MI_MAX_DELAY_OUTPUT-start-1; + if (n==0 || n >= MI_MAX_DELAY_OUTPUT) return; + // copy msg into the buffer + mi_lock(&out_buf_lock) { + const size_t start = mi_atomic_add_acq_rel(&out_len, n); + if (start < MI_MAX_DELAY_OUTPUT) { + // check bound + if (start+n >= MI_MAX_DELAY_OUTPUT) { + n = MI_MAX_DELAY_OUTPUT-start-1; + } + _mi_memcpy(&out_buf[start], msg, n); + } } - _mi_memcpy(&out_buf[start], msg, n); } static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) { if (out==NULL) return; // claim (if `no_more_buf == true`, no more output will be added after this point) - size_t count = 
mi_atomic_add_acq_rel(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1)); - // and output the current contents - if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT; - out_buf[count] = 0; - out(out_buf,arg); - if (!no_more_buf) { - out_buf[count] = '\n'; // if continue with the buffer, insert a newline + mi_lock(&out_buf_lock) { + size_t count = mi_atomic_add_acq_rel(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1)); + // and output the current contents + if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT; + out_buf[count] = 0; + out(out_buf,arg); + if (!no_more_buf) { + out_buf[count] = '\n'; // if continue with the buffer, insert a newline + } } } @@ -382,28 +388,29 @@ static void mi_cdecl mi_out_buf_stderr(const char* msg, void* arg) { // Default output handler // -------------------------------------------------------- -// Should be atomic but gives errors on many platforms as generally we cannot cast a function pointer to a uintptr_t. -// For now, don't register output from multiple threads. -static mi_output_fun* volatile mi_out_default; // = NULL +// The program should only install a single output handler from a single thread +// since otherwise the argument and output function may not match. +static _Atomic(void*) mi_out_default; // = // is `mi_output_fun*` (but some platforms don't support atomic function pointers) static _Atomic(void*) mi_out_arg; // = NULL static mi_output_fun* mi_out_get_default(void** parg) { + mi_output_fun* const out = (mi_output_fun*)mi_atomic_load_ptr_acquire(void,&mi_out_default); if (parg != NULL) { *parg = mi_atomic_load_ptr_acquire(void,&mi_out_arg); } - mi_output_fun* out = mi_out_default; return (out == NULL ? &mi_out_buf : out); } void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept { - mi_out_default = (out == NULL ? &mi_out_stderr : out); // stop using the delayed output buffer + mi_atomic_store_ptr_release(void,&mi_out_default, (void*)(out == NULL ? 
&mi_out_stderr : out)); // stop using the delayed output buffer mi_atomic_store_ptr_release(void,&mi_out_arg, arg); - if (out!=NULL) mi_out_buf_flush(out,true,arg); // output all the delayed output now + if (out!=NULL) { mi_out_buf_flush(out,true,arg); } // output all the delayed output now } // add stderr to the delayed output after the module is loaded static void mi_add_stderr_output(void) { mi_assert_internal(mi_out_default == NULL); mi_out_buf_flush(&mi_out_stderr, false, NULL); // flush current contents to stderr - mi_out_default = &mi_out_buf_stderr; // and add stderr to the delayed output + mi_atomic_store_ptr_release(void,&mi_out_default,(void*)&mi_out_buf_stderr); // and add stderr to the delayed output + mi_atomic_store_ptr_release(void,&mi_out_arg,NULL); } // -------------------------------------------------------- diff --git a/src/os.c b/src/os.c index a34638cd8..b60080a7f 100644 --- a/src/os.c +++ b/src/os.c @@ -213,6 +213,13 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm if (size == 0) return NULL; if (!commit) { allow_large = false; } if (try_alignment == 0) { try_alignment = 1; } // avoid 0 to ensure there will be no divide by zero when aligning + + // try to align along large OS page size for larger allocations + const size_t large_page_size = mi_os_mem_config.large_page_size; + if (large_page_size > 0 && hint_addr == NULL && size >= 8*large_page_size && _mi_is_power_of_two(try_alignment) && try_alignment < large_page_size) { + try_alignment = large_page_size; + } + *is_zero = false; void* p = NULL; int err = _mi_prim_alloc(hint_addr, size, try_alignment, commit, allow_large, is_large, is_zero, &p); @@ -244,23 +251,26 @@ static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bo // Primitive aligned allocation from the OS. // This function guarantees the allocated memory is aligned. 
-static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** base) { +static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid) { + mi_assert_internal(memid!=NULL); mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0)); mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); - mi_assert_internal(is_large != NULL); - mi_assert_internal(is_zero != NULL); - mi_assert_internal(base != NULL); + _mi_memzero(memid,sizeof(*memid)); if (!commit) allow_large = false; if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; size = _mi_align_up(size, _mi_os_page_size()); // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD) - void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero); + bool os_is_large = false; + bool os_is_zero = false; + void* os_base = NULL; + size_t os_size = size; + void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, &os_is_large, &os_is_zero); if (p == NULL) return NULL; // aligned already? if (((uintptr_t)p % alignment) == 0) { - *base = p; + os_base = p; } else { // if not aligned, free it, overallocate, and unmap around it @@ -273,43 +283,47 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (!mi_os_mem_config.has_partial_free) { // win32 virtualAlloc cannot free parts of an allocated block // over-allocate uncommitted (virtual) memory - p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero); + p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? 
*/, false /* allow_large */, &os_is_large, &os_is_zero); if (p == NULL) return NULL; // set p to the aligned part in the full region // note: this is dangerous on Windows as VirtualFree needs the actual base pointer // this is handled though by having the `base` field in the memid's - *base = p; // remember the base + os_base = p; // remember the base + os_size = over_size; p = _mi_align_up_ptr(p, alignment); // explicitly commit only the aligned part if (commit) { if (!_mi_os_commit(p, size, NULL)) { - mi_os_prim_free(*base, over_size, 0); + mi_os_prim_free(os_base, over_size, 0); return NULL; } } } else { // mmap can free inside an allocation // overallocate... - p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero); + p = mi_os_prim_alloc(over_size, 1, commit, false, &os_is_large, &os_is_zero); if (p == NULL) return NULL; // and selectively unmap parts around the over-allocated area. - void* aligned_p = _mi_align_up_ptr(p, alignment); - size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; - size_t mid_size = _mi_align_up(size, _mi_os_page_size()); - size_t post_size = over_size - pre_size - mid_size; + void* const aligned_p = _mi_align_up_ptr(p, alignment); + const size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; + const size_t mid_size = _mi_align_up(size, _mi_os_page_size()); + const size_t post_size = over_size - pre_size - mid_size; mi_assert_internal(pre_size < over_size&& post_size < over_size&& mid_size >= size); if (pre_size > 0) { mi_os_prim_free(p, pre_size, (commit ? pre_size : 0)); } if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, (commit ? post_size : 0)); } // we can return the aligned pointer on `mmap` systems p = aligned_p; - *base = aligned_p; // since we freed the pre part, `*base == p`. + os_base = aligned_p; // since we freed the pre part, `*base == p`. 
+ os_size = mid_size; } } - mi_assert_internal(p == NULL || (p != NULL && *base != NULL && ((uintptr_t)p % alignment) == 0)); + mi_assert_internal(p != NULL && os_base != NULL && _mi_is_aligned(p,alignment)); + mi_assert_internal(os_base <= p && size <= os_size); + *memid = _mi_memid_create_os(os_base,os_size,commit,os_is_zero,os_is_large); return p; } @@ -341,16 +355,9 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo size = _mi_os_good_alloc_size(size); alignment = _mi_align_up(alignment, _mi_os_page_size()); - bool os_is_large = false; - bool os_is_zero = false; - void* os_base = NULL; - void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base ); + void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, memid ); if (p == NULL) return NULL; - *memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large); - memid->mem.os.base = os_base; - memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned? 
- mi_assert_internal(memid->mem.os.size >= size); mi_assert_internal(_mi_is_aligned(p,alignment)); if (commit) { mi_assert_internal(memid->initially_committed); } @@ -394,7 +401,7 @@ void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offse mi_assert(offset <= size); mi_assert((alignment % _mi_os_page_size()) == 0); *memid = _mi_memid_none(); - if (offset > MI_SEGMENT_SIZE) return NULL; + if (offset > MI_SEGMENT_SIZE || offset > size) return NULL; if (offset == 0) { // regular aligned allocation return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid); @@ -402,6 +409,7 @@ void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offse else { // overallocate to align at an offset const size_t extra = _mi_align_up(offset, alignment) - offset; + if (size >= SIZE_MAX - extra) return NULL; // too large const size_t oversize = size + extra; void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid); if (start == NULL) return NULL; @@ -409,7 +417,7 @@ void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offse void* const p = (uint8_t*)start + extra; mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment)); // decommit the overallocation at the start - if (commit && extra > _mi_os_page_size()) { + if (commit && extra >= _mi_os_page_size()) { _mi_os_decommit(start, extra); } return p; @@ -479,8 +487,7 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, size_t stat_size) { mi_assert_internal(needs_recommit!=NULL); - mi_os_stat_decrease(committed, stat_size); - + // page align size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); @@ -492,6 +499,9 @@ static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, siz if (err != 0) { _mi_warning_message("cannot decommit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, 
err, start, csize); } + else if (*needs_recommit) { + mi_os_stat_decrease(committed, stat_size); + } mi_assert_internal(err == 0); return (err == 0); } diff --git a/src/page-queue.c b/src/page-queue.c index 068d11b2c..bfe9f4fdc 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -246,6 +246,10 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { } +#if MI_DEBUG >= 3 +static bool mi_page_queue_is_consistent(const mi_page_queue_t* queue); +#endif + static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(!mi_page_queue_contains(queue, page)); @@ -272,8 +276,35 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ // update direct mi_heap_queue_first_update(heap, queue); heap->page_count++; + + // [specbot Q-NEW-1] first/last must be both null or both non-null (L1, O(1)) + mi_assert_internal((queue->first == NULL) == (queue->last == NULL)); + // [specbot Q-NEW-2] last element must have no next (L1, O(1)) + mi_assert_internal(queue->last == NULL || queue->last->next == NULL); + mi_assert_expensive(mi_page_queue_is_consistent(queue)); } +#if MI_DEBUG >= 3 +// [specbot Q-NEW-3] Verify doubly-linked list forward/backward consistency (L2, O(n)) +static bool mi_page_queue_is_consistent(const mi_page_queue_t* queue) { + if (queue->first == NULL) return (queue->last == NULL); + if (queue->last == NULL) return false; + // forward: first -> ... 
-> last + const mi_page_t* p = queue->first; + const mi_page_t* prev = NULL; + size_t count = 0; + while (p != NULL) { + mi_assert_internal(p->prev == prev); + prev = p; + p = p->next; + count++; + if (count > 100000) return false; // cycle guard + } + mi_assert_internal(prev == queue->last); + return true; +} +#endif + static void mi_page_queue_move_to_front(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(mi_page_queue_contains(queue, page)); diff --git a/src/page.c b/src/page.c index 9dd4c4afd..9a8aac0e3 100644 --- a/src/page.c +++ b/src/page.c @@ -79,9 +79,17 @@ static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(mi_page_block_size(page) > 0); + mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); + // [specbot P-NEW-1] block_size_shift must be consistent with block_size (L1, O(1)) + mi_assert_internal(page->block_size_shift == 0 || (mi_page_block_size(page) == ((size_t)1 << page->block_size_shift))); + // [specbot P-NEW-2] capacity must be nonzero when blocks are in use (L1, O(1)) + mi_assert_internal(page->used == 0 || page->capacity > 0); + // [specbot P-NEW-4] page_start must be non-null when capacity > 0 (L1, O(1)) + mi_assert_internal(page->capacity == 0 || mi_page_start(page) != NULL); + uint8_t* start = mi_page_start(page); mi_assert_internal(start == _mi_segment_page_start(_mi_page_segment(page), page, NULL)); mi_assert_internal(page->is_huge == (_mi_page_segment(page)->kind == MI_SEGMENT_HUGE)); @@ -90,6 +98,18 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(mi_page_list_is_valid(page,page->free)); mi_assert_internal(mi_page_list_is_valid(page,page->local_free)); + // [specbot P-NEW-7] All free list blocks are aligned to block_size (L2, O(n)) + { + size_t bsize = mi_page_block_size(page); + uint8_t* pstart = 
mi_page_start(page); + for (mi_block_t* b = page->free; b != NULL; b = mi_block_next(page, b)) { + mi_assert_internal(((uint8_t*)b - pstart) % bsize == 0); + } + for (mi_block_t* b = page->local_free; b != NULL; b = mi_block_next(page, b)) { + mi_assert_internal(((uint8_t*)b - pstart) % bsize == 0); + } + } + #if MI_DEBUG>3 // generally too expensive to check this if (page->free_is_zero) { const size_t ubsize = mi_page_usable_block_size(page); @@ -122,6 +142,9 @@ bool _mi_page_is_valid(mi_page_t* page) { if (mi_page_heap(page)!=NULL) { mi_segment_t* segment = _mi_page_segment(page); + // [specbot P-NEW-6] heap_tag must match owning heap's tag (L1, O(1)) + mi_assert_internal(page->heap_tag == mi_page_heap(page)->tag); + mi_assert_internal(!_mi_process_is_initialized || segment->thread_id==0 || segment->thread_id == mi_page_heap(page)->thread_id); #if MI_HUGE_PAGE_ABANDON if (segment->kind != MI_SEGMENT_HUGE) @@ -897,21 +920,24 @@ static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) { a certain number of allocations. ----------------------------------------------------------- */ -static mi_deferred_free_fun* volatile deferred_free = NULL; -static _Atomic(void*) deferred_arg; // = NULL +// The program should only install a single deferred free handler before doing allocation. 
+static _Atomic(void*) deferred_free; // is `mi_deferred_free_fun*` (but some platforms don't support atomic function pointers) +static _Atomic(void*) deferred_arg; void _mi_deferred_free(mi_heap_t* heap, bool force) { heap->tld->heartbeat++; - if (deferred_free != NULL && !heap->tld->recurse) { + mi_deferred_free_fun* const fun = (mi_deferred_free_fun*)mi_atomic_load_ptr_acquire(void,&deferred_free); + if (fun != NULL && !heap->tld->recurse) { heap->tld->recurse = true; - deferred_free(force, heap->tld->heartbeat, mi_atomic_load_ptr_relaxed(void,&deferred_arg)); + void* const arg = mi_atomic_load_ptr_acquire(void,&deferred_arg); + fun(force, heap->tld->heartbeat, arg); heap->tld->recurse = false; } } void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noexcept { - deferred_free = fn; mi_atomic_store_ptr_release(void,&deferred_arg, arg); + mi_atomic_store_ptr_release(void,&deferred_free, (void*)fn); } diff --git a/src/prim/emscripten/prim.c b/src/prim/emscripten/prim.c index c4cfc35dd..93c76757d 100644 --- a/src/prim/emscripten/prim.c +++ b/src/prim/emscripten/prim.c @@ -12,6 +12,8 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc/atomic.h" #include "mimalloc/prim.h" +#include <unistd.h> // getentropy + // Design // ====== // @@ -223,7 +225,9 @@ void _mi_prim_thread_init_auto_done(void) { } void _mi_prim_thread_done_auto_done(void) { - // nothing to do + if (_mi_heap_default_key != (pthread_key_t)(-1)) { // do not leak the key, see issue #809 + pthread_key_delete(_mi_heap_default_key); + } } void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index fd98a090d..3bffd2912 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -229,15 +229,17 @@ int _mi_prim_free(void* addr, size_t size ) { // mmap //--------------------------------------------- +// return errno on failure static int unix_madvise(void* addr, size_t size, int advice) { #if defined(__sun) - int res = madvise((caddr_t)addr, size, advice); // Solaris needs cast (issue #520) + const int res = madvise((caddr_t)addr, size, advice); // Solaris needs cast (issue #520) + return (res==0 ? 0 : errno); #elif defined(__QNX__) - int res = posix_madvise(addr, size, advice); + return posix_madvise(addr, size, advice); // posix returns errno #else - int res = madvise(addr, size, advice); - #endif + const int res = madvise(addr, size, advice); // linux returns -1 on failure and sets errno return (res==0 ?
0 : errno); + #endif } static void* unix_mmap_prim(void* addr, size_t size, int protect_flags, int flags, int fd) { @@ -417,10 +419,6 @@ int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool comm mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(commit || !allow_large); mi_assert_internal(try_alignment > 0); - if (hint_addr == NULL && size >= 8*MI_UNIX_LARGE_PAGE_SIZE && try_alignment > 1 && _mi_is_power_of_two(try_alignment) && try_alignment < MI_UNIX_LARGE_PAGE_SIZE) { - try_alignment = MI_UNIX_LARGE_PAGE_SIZE; // try to align along large page size for larger allocations - } - *is_zero = true; int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE); *addr = unix_mmap(hint_addr, size, try_alignment, protect_flags, false, allow_large, is_large); @@ -510,8 +508,8 @@ int _mi_prim_reset(void* start, size_t size) { // default `MADV_DONTNEED` is used though. static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE); int oadvice = (int)mi_atomic_load_relaxed(&advice); - while ((err = unix_madvise(start, size, oadvice)) != 0 && errno == EAGAIN) { errno = 0; }; - if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) { + while ((err = unix_madvise(start, size, oadvice)) != 0 && err == EAGAIN) { /* try again */ }; + if (err == EINVAL && oadvice == MADV_FREE) { // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED); err = unix_madvise(start, size, MADV_DONTNEED); diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index cd3bbeca0..0d9bb2ac9 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -613,7 +613,7 @@ void _mi_prim_out_stderr( const char* msg ) // Note: on windows, environment names are not case sensitive. 
bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { result[0] = 0; - size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size); + const size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size); return (len > 0 && len < result_size); } diff --git a/src/random.c b/src/random.c index 59983e937..d5d68e7f6 100644 --- a/src/random.c +++ b/src/random.c @@ -99,7 +99,8 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no // since we only use chacha for randomness (and not encryption) we // do not _need_ to read 32-bit values as little endian but we do anyways // just for being compatible :-) - memset(ctx, 0, sizeof(*ctx)); + ctx->output_available = 0; + _mi_memzero(ctx->output,sizeof(ctx->output)); for (size_t i = 0; i < 4; i++) { const uint8_t* sigma = (uint8_t*)"expand 32-byte k"; ctx->input[i] = read32(sigma,i); @@ -114,7 +115,8 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no } static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) { - memset(ctx_new, 0, sizeof(*ctx_new)); + _mi_memzero(ctx_new, sizeof(*ctx_new)); + ctx_new->weak = ctx->weak; _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input)); ctx_new->input[12] = 0; ctx_new->input[13] = 0; diff --git a/src/segment-map.c b/src/segment-map.c index bbcea28aa..e3cbfc166 100644 --- a/src/segment-map.c +++ b/src/segment-map.c @@ -113,11 +113,16 @@ static mi_segment_t* _mi_segment_of(const void* p) { size_t bitidx; mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* dont alloc if not present */, &index, &bitidx); if (part == NULL) return NULL; - const uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); + const uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) { - bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(cookie_ok); MI_UNUSED(cookie_ok); - 
return segment; // yes, allocated by us + // yes, allocated by us + const bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie); + if mi_likely(cookie_ok) { + return segment; // yes, allocated by us and valid + } + else { + _mi_error_message(EFAULT, "segment map found an invalid segment, possibly corrupted meta-data (address=%p, segment=%p)\n", p, segment); + } } return NULL; } diff --git a/src/segment.c b/src/segment.c index f440dc01a..7ae84616f 100644 --- a/src/segment.c +++ b/src/segment.c @@ -275,13 +275,23 @@ static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(segment->abandoned <= segment->used); mi_assert_internal(segment->thread_id == 0 || segment->thread_id == _mi_thread_id()); mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); // can only decommit committed blocks + + // [specbot S-NEW-1] segment must have at least one slice (L1, O(1)) + mi_assert_internal(segment->segment_slices > 0); + // [specbot S-NEW-2] info slices must leave room for at least one data slice (L1, O(1)) + mi_assert_internal(segment->segment_info_slices < segment->segment_slices); + // [specbot S-NEW-3] slice_entries must not exceed array bounds (L1, O(1)) + mi_assert_internal(segment->slice_entries <= MI_SLICES_PER_SEGMENT); + //mi_assert_internal(segment->segment_info_size % MI_SEGMENT_SLICE_SIZE == 0); mi_slice_t* slice = &segment->slices[0]; const mi_slice_t* end = mi_segment_slices_end(segment); size_t used_count = 0; + size_t total_slice_count = 0; mi_span_queue_t* sq; while(slice < end) { mi_assert_internal(slice->slice_count > 0); + total_slice_count += slice->slice_count; mi_assert_internal(slice->slice_offset == 0); size_t index = mi_slice_index(slice); size_t maxindex = (index + slice->slice_count >= segment->slice_entries ? 
segment->slice_entries : index + slice->slice_count) - 1; @@ -316,6 +326,8 @@ static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) { slice = &segment->slices[maxindex+1]; } mi_assert_internal(slice == end); + // [specbot S-NEW-5] Total slice counts must sum to segment_slices (L2, O(n)) + mi_assert_internal(total_slice_count == segment->segment_slices); mi_assert_internal(used_count == segment->used + 1); return true; } @@ -693,7 +705,7 @@ static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_ } // otherwise coalesce the span and add to the free span queues - const bool is_abandoned = (segment->thread_id == 0); // mi_segment_is_abandoned(segment); + const bool is_abandoned = mi_segment_is_abandoned(segment); size_t slice_count = slice->slice_count; mi_slice_t* next = slice + slice->slice_count; mi_assert_internal(next <= mi_segment_slices_end(segment)); @@ -1683,6 +1695,7 @@ static bool mi_segment_visit_page(mi_page_t* page, bool visit_blocks, mi_block_v _mi_heap_area_init(&area, page); if (!visitor(NULL, &area, NULL, area.block_size, arg)) return false; if (visit_blocks) { + _mi_page_free_collect(page,true); // collect so the used count is accurate return _mi_heap_area_visit_blocks(&area, page, visitor, arg); } else {