diff --git a/config.yaml b/config.yaml
index 6d275ca..2e639ed 100644
--- a/config.yaml
+++ b/config.yaml
@@ -93,6 +93,17 @@ patches:
   lower: '6.17'
 - patch: 0003-x86-amd_node-fix-null-pointer-dereference-if-amd_smn.patch
   lower: '6.17'
+- patches:
+  - 0001-xen-events-add-_on_node-variants-of-the-lateeoi-bind.patch
+  - 0002-xen-xenbus-expose-host-NUMA-node-of-a-mapped-ring.patch
+  - 0003-xen-netback-place-per-queue-kthreads-and-IRQs-near-t.patch
+  - 0004-xen-blkback-place-per-ring-kthread-and-IRQ-near-the-.patch
+  - 0005-xen-make-xen_alloc_unpopulated_pages-NUMA-aware.patch
+  - 0006-xen-xenbus-collapse-xenbus_ring_host_node-to-a-page_.patch
+  - 0007-xen-xenbus-add-xenbus_setup_ring_node-for-per-node-r.patch
+  - 0008-xen-netfront-place-per-queue-rings-on-per-queue-node.patch
+  - 0009-xen-blkfront-place-per-ring-buffers-on-per-hctx-node.patch
+  lower: '6.18'
 images:
 - target: kernelsrc
   name: kernel-src
diff --git a/configs/x86_64/host.config b/configs/x86_64/host.config
index 3d28f1d..ce59dfc 100644
--- a/configs/x86_64/host.config
+++ b/configs/x86_64/host.config
@@ -4912,6 +4912,7 @@ CONFIG_XEN_BALLOON=y
 CONFIG_XEN_SCRUB_PAGES_DEFAULT=y
 CONFIG_XEN_DEV_EVTCHN=m
 CONFIG_XEN_BACKEND=y
+CONFIG_XEN_BACKEND_NUMA_AFFINITY=y
 CONFIG_XENFS=y
 CONFIG_XEN_COMPAT_XENFS=y
 CONFIG_XEN_SYS_HYPERVISOR=y
@@ -5991,3 +5992,11 @@ CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO=y
 CONFIG_MHP_MEMMAP_ON_MEMORY=y
 CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y
 CONFIG_XEN_MEMORY_HOTPLUG_LIMIT=512
+
+# Enable device memory hotplug support (dependency for Xen PV driver backend
+# NUMA binding)
+CONFIG_ZONE_DEVICE=y
+
+# NUMA balancing support
+CONFIG_NUMA_BALANCING=y
+CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y
diff --git a/configs/x86_64/zone.config b/configs/x86_64/zone.config
index 7880eed..41606d5 100644
--- a/configs/x86_64/zone.config
+++ b/configs/x86_64/zone.config
@@ -1726,6 +1726,7 @@ CONFIG_XEN_MEMORY_HOTPLUG_LIMIT=512
 CONFIG_XEN_SCRUB_PAGES_DEFAULT=y
 CONFIG_XEN_DEV_EVTCHN=y
 CONFIG_XEN_BACKEND=y
+CONFIG_XEN_BACKEND_NUMA_AFFINITY=y
 CONFIG_XENFS=y
 CONFIG_XEN_COMPAT_XENFS=y
 CONFIG_XEN_SYS_HYPERVISOR=y
@@ -2391,3 +2392,11 @@ CONFIG_DEBUG_INFO_NONE=n
 CONFIG_DEBUG_INFO_BTF=y
 CONFIG_DEBUG_INFO_BTF_MODULES=y
 CONFIG_DEBUG_INFO_COMPRESSED_ZSTD=y
+
+# Enable device memory hotplug support (dependency for Xen PV driver backend
+# NUMA binding)
+CONFIG_ZONE_DEVICE=y
+
+# NUMA balancing support
+CONFIG_NUMA_BALANCING=y
+CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y
diff --git a/patches/0001-xen-events-add-_on_node-variants-of-the-lateeoi-bind.patch b/patches/0001-xen-events-add-_on_node-variants-of-the-lateeoi-bind.patch
new file mode 100644
index 0000000..e2b995e
--- /dev/null
+++ b/patches/0001-xen-events-add-_on_node-variants-of-the-lateeoi-bind.patch
@@ -0,0 +1,298 @@
+From bd2dbbb1f3a05a5d77d18548d41cdb25c0b81912 Mon Sep 17 00:00:00 2001
+From: Steven Noonan <steven@edera.dev>
+Date: Tue, 19 May 2026 23:00:20 -0700
+Subject: [PATCH 1/9] xen/events: add _on_node variants of the lateeoi bind
+ helpers
+
+xen_allocate_irq_dynamic() unconditionally calls
+irq_alloc_desc_from(0, -1), so every Xen evtchn IRQ descriptor is
+allocated with NUMA_NO_NODE.  This means /proc/irq/N/node always
+reads -1 even when the caller (netback, blkback, netfront, blkfront)
+already knows the right node for the IRQ.
+
+irqbalance treats node=-1 as "no NUMA preference" and distributes
+the IRQ across all CPUs for load balance, ignoring affinity_hint.
+With irqbalance running, the per-queue NUMA placement we install via
+irq_set_affinity_and_hint() gets overwritten almost immediately.
+
+Add _on_node variants of the four bind helpers Xen front/back ends
+use:
+
+  bind_evtchn_to_irq_lateeoi_on_node(evtchn, node)
+  bind_evtchn_to_irqhandler_lateeoi_on_node(..., node)
+  bind_interdomain_evtchn_to_irq_lateeoi_on_node(..., node)
+  bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node(..., node)
+
+Each passes the caller's node through to the internal chip helpers
+and on to a new xen_allocate_irq_dynamic_node(node).  The existing
+public functions become thin wrappers passing NUMA_NO_NODE, so every
+caller that hasn't been updated keeps today's behaviour.
+
+After this change, /proc/irq/N/node reflects the node the caller
+asked for, and irqbalance respects affinity_hint as a NUMA-local
+subset rather than treating the IRQ as floating.
+
+Signed-off-by: Steven Noonan <steven@edera.dev>
+---
+ drivers/xen/events/events_base.c | 86 ++++++++++++++++++++++++++------
+ include/xen/events.h             | 15 ++++++
+ 2 files changed, 85 insertions(+), 16 deletions(-)
+
+diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
+index 9478fae014e5..6368ff561472 100644
+--- a/drivers/xen/events/events_base.c
++++ b/drivers/xen/events/events_base.c
+@@ -28,6 +28,7 @@
+ #include <linux/interrupt.h>
+ #include <linux/irq.h>
+ #include <linux/moduleparam.h>
++#include <linux/numa.h>
+ #include <linux/string.h>
+ #include <linux/memblock.h>
+ #include <linux/slab.h>
+@@ -729,9 +730,9 @@ static struct irq_info *xen_irq_init(unsigned int irq)
+ 	return info;
+ }
+ 
+-static struct irq_info *xen_allocate_irq_dynamic(void)
++static struct irq_info *xen_allocate_irq_dynamic_node(int node)
+ {
+-	int irq = irq_alloc_desc_from(0, -1);
++	int irq = irq_alloc_desc_from(0, node);
+ 	struct irq_info *info = NULL;
+ 
+ 	if (irq >= 0) {
+@@ -743,6 +744,11 @@ static struct irq_info *xen_allocate_irq_dynamic(void)
+ 	return info;
+ }
+ 
++static struct irq_info *xen_allocate_irq_dynamic(void)
++{
++	return xen_allocate_irq_dynamic_node(NUMA_NO_NODE);
++}
++
+ static struct irq_info *xen_allocate_irq_gsi(unsigned int gsi)
+ {
+ 	int irq;
+@@ -1184,7 +1190,8 @@ int xen_pirq_from_irq(unsigned irq)
+ EXPORT_SYMBOL_GPL(xen_pirq_from_irq);
+ 
+ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip,
+-				   struct xenbus_device *dev, bool shared)
++				   struct xenbus_device *dev, bool shared,
++				   int node)
+ {
+ 	int ret = -ENOMEM;
+ 	struct irq_info *info;
+@@ -1197,7 +1204,7 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip,
+ 	info = evtchn_to_info(evtchn);
+ 
+ 	if (!info) {
+-		info = xen_allocate_irq_dynamic();
++		info = xen_allocate_irq_dynamic_node(node);
+ 		if (!info)
+ 			goto out;
+ 
+@@ -1232,16 +1239,25 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip,
+ 
+ int bind_evtchn_to_irq(evtchn_port_t evtchn)
+ {
+-	return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip, NULL, false);
++	return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip, NULL, false,
++				       NUMA_NO_NODE);
+ }
+ EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
+ 
+ int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn)
+ {
+-	return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip, NULL, false);
++	return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip, NULL, false,
++				       NUMA_NO_NODE);
+ }
+ EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi);
+ 
++int bind_evtchn_to_irq_lateeoi_on_node(evtchn_port_t evtchn, int node)
++{
++	return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip, NULL, false,
++				       node);
++}
++EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi_on_node);
++
+ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+ {
+ 	struct evtchn_bind_ipi bind_ipi;
+@@ -1291,7 +1307,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+ static int bind_interdomain_evtchn_to_irq_chip(struct xenbus_device *dev,
+ 					       evtchn_port_t remote_port,
+ 					       struct irq_chip *chip,
+-					       bool shared)
++					       bool shared, int node)
+ {
+ 	struct evtchn_bind_interdomain bind_interdomain;
+ 	int err;
+@@ -1303,17 +1319,28 @@ static int bind_interdomain_evtchn_to_irq_chip(struct xenbus_device *dev,
+ 					  &bind_interdomain);
+ 
+ 	return err ? : bind_evtchn_to_irq_chip(bind_interdomain.local_port,
+-					       chip, dev, shared);
++					       chip, dev, shared, node);
+ }
+ 
+ int bind_interdomain_evtchn_to_irq_lateeoi(struct xenbus_device *dev,
+ 					   evtchn_port_t remote_port)
+ {
+ 	return bind_interdomain_evtchn_to_irq_chip(dev, remote_port,
+-						   &xen_lateeoi_chip, false);
++						   &xen_lateeoi_chip, false,
++						   NUMA_NO_NODE);
+ }
+ EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi);
+ 
++int bind_interdomain_evtchn_to_irq_lateeoi_on_node(struct xenbus_device *dev,
++						   evtchn_port_t remote_port,
++						   int node)
++{
++	return bind_interdomain_evtchn_to_irq_chip(dev, remote_port,
++						   &xen_lateeoi_chip, false,
++						   node);
++}
++EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi_on_node);
++
+ static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn,
+ 		     bool percpu)
+ {
+@@ -1432,12 +1459,12 @@ static int bind_evtchn_to_irqhandler_chip(evtchn_port_t evtchn,
+ 					  irq_handler_t handler,
+ 					  unsigned long irqflags,
+ 					  const char *devname, void *dev_id,
+-					  struct irq_chip *chip)
++					  struct irq_chip *chip, int node)
+ {
+ 	int irq, retval;
+ 
+ 	irq = bind_evtchn_to_irq_chip(evtchn, chip, NULL,
+-				      irqflags & IRQF_SHARED);
++				      irqflags & IRQF_SHARED, node);
+ 	if (irq < 0)
+ 		return irq;
+ 	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+@@ -1456,7 +1483,8 @@ int bind_evtchn_to_irqhandler(evtchn_port_t evtchn,
+ {
+ 	return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags,
+ 					      devname, dev_id,
+-					      &xen_dynamic_chip);
++					      &xen_dynamic_chip,
++					      NUMA_NO_NODE);
+ }
+ EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+ 
+@@ -1467,19 +1495,34 @@ int bind_evtchn_to_irqhandler_lateeoi(evtchn_port_t evtchn,
+ {
+ 	return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags,
+ 					      devname, dev_id,
+-					      &xen_lateeoi_chip);
++					      &xen_lateeoi_chip,
++					      NUMA_NO_NODE);
+ }
+ EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler_lateeoi);
+ 
++int bind_evtchn_to_irqhandler_lateeoi_on_node(evtchn_port_t evtchn,
++					      irq_handler_t handler,
++					      unsigned long irqflags,
++					      const char *devname,
++					      void *dev_id, int node)
++{
++	return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags,
++					      devname, dev_id,
++					      &xen_lateeoi_chip, node);
++}
++EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler_lateeoi_on_node);
++
+ static int bind_interdomain_evtchn_to_irqhandler_chip(
+ 		struct xenbus_device *dev, evtchn_port_t remote_port,
+ 		irq_handler_t handler, unsigned long irqflags,
+-		const char *devname, void *dev_id, struct irq_chip *chip)
++		const char *devname, void *dev_id, struct irq_chip *chip,
++		int node)
+ {
+ 	int irq, retval;
+ 
+ 	irq = bind_interdomain_evtchn_to_irq_chip(dev, remote_port, chip,
+-						  irqflags & IRQF_SHARED);
++						  irqflags & IRQF_SHARED,
++						  node);
+ 	if (irq < 0)
+ 		return irq;
+ 
+@@ -1501,10 +1544,21 @@ int bind_interdomain_evtchn_to_irqhandler_lateeoi(struct xenbus_device *dev,
+ {
+ 	return bind_interdomain_evtchn_to_irqhandler_chip(dev,
+ 				remote_port, handler, irqflags, devname,
+-				dev_id, &xen_lateeoi_chip);
++				dev_id, &xen_lateeoi_chip, NUMA_NO_NODE);
+ }
+ EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler_lateeoi);
+ 
++int bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node(
++		struct xenbus_device *dev, evtchn_port_t remote_port,
++		irq_handler_t handler, unsigned long irqflags,
++		const char *devname, void *dev_id, int node)
++{
++	return bind_interdomain_evtchn_to_irqhandler_chip(dev,
++				remote_port, handler, irqflags, devname,
++				dev_id, &xen_lateeoi_chip, node);
++}
++EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node);
++
+ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ 			    irq_handler_t handler,
+ 			    unsigned long irqflags, const char *devname, void *dev_id)
+diff --git a/include/xen/events.h b/include/xen/events.h
+index de5da58a0205..1abc068557b4 100644
+--- a/include/xen/events.h
++++ b/include/xen/events.h
+@@ -18,6 +18,7 @@ unsigned xen_evtchn_nr_channels(void);
+ 
+ int bind_evtchn_to_irq(evtchn_port_t evtchn);
+ int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn);
++int bind_evtchn_to_irq_lateeoi_on_node(evtchn_port_t evtchn, int node);
+ int bind_evtchn_to_irqhandler(evtchn_port_t evtchn,
+ 			      irq_handler_t handler,
+ 			      unsigned long irqflags, const char *devname,
+@@ -26,6 +27,10 @@ int bind_evtchn_to_irqhandler_lateeoi(evtchn_port_t evtchn,
+ 			      irq_handler_t handler,
+ 			      unsigned long irqflags, const char *devname,
+ 			      void *dev_id);
++int bind_evtchn_to_irqhandler_lateeoi_on_node(evtchn_port_t evtchn,
++			      irq_handler_t handler,
++			      unsigned long irqflags, const char *devname,
++			      void *dev_id, int node);
+ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu);
+ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ 			    irq_handler_t handler,
+@@ -39,12 +44,22 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+ 			   void *dev_id);
+ int bind_interdomain_evtchn_to_irq_lateeoi(struct xenbus_device *dev,
+ 					   evtchn_port_t remote_port);
++int bind_interdomain_evtchn_to_irq_lateeoi_on_node(struct xenbus_device *dev,
++						   evtchn_port_t remote_port,
++						   int node);
+ int bind_interdomain_evtchn_to_irqhandler_lateeoi(struct xenbus_device *dev,
+ 						  evtchn_port_t remote_port,
+ 						  irq_handler_t handler,
+ 						  unsigned long irqflags,
+ 						  const char *devname,
+ 						  void *dev_id);
++int bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node(
++					struct xenbus_device *dev,
++					evtchn_port_t remote_port,
++					irq_handler_t handler,
++					unsigned long irqflags,
++					const char *devname,
++					void *dev_id, int node);
+ 
+ /*
+  * Common unbind function for all event sources. Takes IRQ to unbind from.
+-- 
+2.54.0
+
diff --git a/patches/0002-xen-xenbus-expose-host-NUMA-node-of-a-mapped-ring.patch b/patches/0002-xen-xenbus-expose-host-NUMA-node-of-a-mapped-ring.patch
new file mode 100644
index 0000000..7482925
--- /dev/null
+++ b/patches/0002-xen-xenbus-expose-host-NUMA-node-of-a-mapped-ring.patch
@@ -0,0 +1,348 @@
+From 0cc05d6a3795ccb1f832afde9920c74c12328368 Mon Sep 17 00:00:00 2001
+From: Steven Noonan <steven@edera.dev>
+Date: Tue, 19 May 2026 18:35:49 -0700
+Subject: [PATCH 2/9] xen/xenbus: expose host NUMA node of a mapped ring
+
+Xenbus backends in PVH dom0 today have no visibility into which host
+NUMA node the foreign frame backing a grant-mapped ring lives on.
+xen_alloc_unpopulated_pages() registers its pool with NUMA_NO_NODE,
+so page_to_nid() of any grant-mapped page reports NUMA_NO_NODE and
+backends have nothing to drive kthread or IRQ placement off of.  The
+result is that every backend kthread for every queue piles onto one
+host node regardless of the guest's vNUMA layout.
+
+Add xenbus_ring_host_node(dev, vaddr) returning the Linux node id of
+the host node hosting the first ring page, or NUMA_NO_NODE when the
+information is unavailable.  The value is resolved once at map time
+using the new XENMEM_get_mfn_pxms hypercall and cached on the
+xenbus_map_node, so the public helper is a cheap list lookup.
+
+Three NUMA identifier namespaces are involved in this code path: host
+PXM (firmware), Xen-internal nid (assigned in SRAT scan order), and
+dom0 Linux node id.  The hypercall returns host PXM, matching what
+dom0's own SRAT already uses; pxm_to_node() converts to a Linux node
+id, which is what backends will feed to cpumask_of_node() and
+kthread_create_on_node().  Keeping the Xen-side ABI in PXM-space lets
+callers translate with one standard lookup instead of maintaining
+their own Xen-nid -> Linux-node table.
+
+Older or non-Edera hypervisors lack the new hypercall.  The first
+call there returns -ENOSYS, which latches a global "unsupported"
+flag; every subsequent map-time query then yields NUMA_NO_NODE
+without issuing further hypercalls.  Backend patches that key off the
+helper see NUMA_NO_NODE and silently fall back to today's
+NUMA-oblivious behaviour.  This makes the kernel-side change safe to
+ship without a lockstep hypervisor update.
+
+Gated by CONFIG_XEN_BACKEND_NUMA_AFFINITY, which depends on
+XEN_BACKEND, NUMA, ACPI_NUMA, and XEN_UNPOPULATED_ALLOC and defaults
+to y.  XEN_UNPOPULATED_ALLOC is a hard dependency rather than a soft
+one because the per-ring placement work the rest of this series
+performs is only meaningful when grant-map placeholders come from the
+per-node pool registered via memremap_pages(pgmap, node).  Without
+XEN_UNPOPULATED_ALLOC, placeholders come from the generic balloon
+allocator and page_to_nid() reflects only where dom0's free RAM
+happened to be, not the host node of the foreign frame.  Per-ring
+NUMA placement under that configuration would commit to wrong nodes
+confidently rather than silently no-op.  Disabling the option
+compiles the query out, so the public helper only ever returns
+NUMA_NO_NODE.
+
+PV dom0 is not a NUMA-affinity target: the PV map path explicitly
+sets host_node = NUMA_NO_NODE and the helper handles the
+pv.area->addr lookup path symmetrically with hvm.addr, returning
+NUMA_NO_NODE in both cases when no match is found.
+
+The foreign MFN is sourced from dev_bus_addr in the gnttab map
+result.  Xen sets that field unconditionally for host_map operations
+(see grant_table.c in the hypervisor), so it is reliable in PVH dom0
+without IOMMU isolation.  Sampling only the first ring page's MFN is
+sufficient for the common case where a ring is contiguous on one
+host node; multi-node huge-page grants are a future concern.
+
+Signed-off-by: Steven Noonan <steven@edera.dev>
+---
+ drivers/xen/Kconfig                |  25 +++++++
+ drivers/xen/xenbus/xenbus_client.c | 111 +++++++++++++++++++++++++++++
+ include/xen/interface/memory.h     |  26 +++++++
+ include/xen/xenbus.h               |  13 ++++
+ 4 files changed, 175 insertions(+)
+
+diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
+index f9a35ed266ec..147e6acb231b 100644
+--- a/drivers/xen/Kconfig
++++ b/drivers/xen/Kconfig
+@@ -96,6 +96,31 @@ config XEN_BACKEND
+ 	  Support for backend device drivers that provide I/O services
+ 	  to other virtual machines.
+ 
++config XEN_BACKEND_NUMA_AFFINITY
++	bool "NUMA affinity for Xen backend drivers"
++	depends on XEN_BACKEND && NUMA && ACPI_NUMA && XEN_UNPOPULATED_ALLOC
++	default y
++	help
++	  Allow Xen backend drivers (netback, blkback, gntdev consumers)
++	  to discover the host NUMA node that hosts a grant-mapped ring
++	  page, and to place their service threads and IRQs on that node.
++
++	  XEN_UNPOPULATED_ALLOC provides the per-node placeholder-page pool
++	  the relocation logic in xenbus_map_ring_valloc() draws from.
++	  Without it, placeholders come from the generic balloon allocator,
++	  whose page_to_nid() reflects only where dom0's free RAM happened
++	  to be -- not the host node of the foreign frame the placeholder
++	  will end up backing.  In that configuration the per-ring
++	  placement decisions would be confidently wrong rather than just
++	  absent, so the Kconfig hard-depends on XEN_UNPOPULATED_ALLOC
++	  rather than silently degrading.
++
++	  Requires hypervisor support for the XENMEM_get_mfn_pxms
++	  hypercall.  Without that support the feature is silently a
++	  no-op, equivalent to NUMA-oblivious behaviour.
++
++	  If unsure, say Y.
++
+ config XENFS
+ 	tristate "Xen filesystem"
+ 	select XEN_PRIVCMD
+diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
+index 2dc874fb5506..8e0695ba39a3 100644
+--- a/drivers/xen/xenbus/xenbus_client.c
++++ b/drivers/xen/xenbus/xenbus_client.c
+@@ -31,15 +31,18 @@
+  */
+ 
+ #include <linux/mm.h>
++#include <linux/numa.h>
+ #include <linux/slab.h>
+ #include <linux/types.h>
+ #include <linux/spinlock.h>
+ #include <linux/vmalloc.h>
+ #include <linux/export.h>
++#include <asm/xen/hypercall.h>
+ #include <asm/xen/hypervisor.h>
+ #include <xen/page.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/event_channel.h>
++#include <xen/interface/memory.h>
+ #include <xen/balloon.h>
+ #include <xen/events.h>
+ #include <xen/grant_table.h>
+@@ -47,6 +50,10 @@
+ #include <xen/xen.h>
+ #include <xen/features.h>
+ 
++#ifdef CONFIG_XEN_BACKEND_NUMA_AFFINITY
++#include <acpi/acpi_numa.h>
++#endif
++
+ #include "xenbus.h"
+ 
+ #define XENBUS_PAGES(_grants)	(DIV_ROUND_UP(_grants, XEN_PFN_PER_PAGE))
+@@ -67,6 +74,7 @@ struct xenbus_map_node {
+ 	};
+ 	grant_handle_t handles[XENBUS_MAX_RING_GRANTS];
+ 	unsigned int   nr_handles;
++	int            host_node;	/* Linux node id of foreign frame, or NUMA_NO_NODE */
+ };
+ 
+ struct map_ring_valloc {
+@@ -85,6 +93,72 @@ struct map_ring_valloc {
+ static DEFINE_SPINLOCK(xenbus_valloc_lock);
+ static LIST_HEAD(xenbus_valloc_pages);
+ 
++#ifdef CONFIG_XEN_BACKEND_NUMA_AFFINITY
++/*
++ * Tri-state cache for XENMEM_get_mfn_pxms availability.  -ENOSYS from
++ * the first attempt latches "unsupported", short-circuiting future
++ * calls.  Any positive ACK (including the legitimate XEN_INVALID_NUMA_ID
++ * answer for an MFN Xen does not know about) latches "supported".
++ *
+ * Lock-free: the state only ever moves away from "unknown", and
+ * "unsupported" is a stable terminal state once entered.  A racing
+ * reader might issue one redundant hypercall before observing the
+ * cached state, which is harmless.
++ */
++#define XEN_MFN_PXM_UNKNOWN     0
++#define XEN_MFN_PXM_SUPPORTED   1
++#define XEN_MFN_PXM_UNSUPPORTED 2
++
++static int xen_mfn_pxm_state = XEN_MFN_PXM_UNKNOWN;
++
++/*
++ * Resolve one foreign MFN to a Linux node id.  Returns NUMA_NO_NODE
++ * for any failure mode: hypercall unsupported, MFN unknown to Xen,
++ * PXM not registered with the dom0 ACPI namespace.
++ *
++ * Three NUMA identifier namespaces are involved here.  Xen returns
++ * host PXM (firmware-supplied).  pxm_to_node() translates to a Linux
++ * dom0 node id.  Callers then use the result against Linux helpers
++ * like cpumask_of_node() and kthread_create_on_node().
++ */
++static int xenbus_query_mfn_node(unsigned long mfn)
++{
++	struct xen_get_mfn_pxms req;
++	xen_pfn_t mfn_arg = mfn;
++	uint32_t pxm = XEN_INVALID_NUMA_ID;
++	int rc;
++
++	if (READ_ONCE(xen_mfn_pxm_state) == XEN_MFN_PXM_UNSUPPORTED)
++		return NUMA_NO_NODE;
++
++	memset(&req, 0, sizeof(req));
++	set_xen_guest_handle(req.mfns, &mfn_arg);
++	set_xen_guest_handle(req.pxms, &pxm);
++	req.nr_mfns = 1;
++
++	rc = HYPERVISOR_memory_op(XENMEM_get_mfn_pxms, &req);
++	if (rc < 0) {
++		if (rc == -ENOSYS) {
++			WRITE_ONCE(xen_mfn_pxm_state, XEN_MFN_PXM_UNSUPPORTED);
++			pr_info("xenbus: hypervisor lacks XENMEM_get_mfn_pxms, backend NUMA affinity disabled\n");
++		}
++		return NUMA_NO_NODE;
++	}
++
++	WRITE_ONCE(xen_mfn_pxm_state, XEN_MFN_PXM_SUPPORTED);
++
++	if (pxm == XEN_INVALID_NUMA_ID)
++		return NUMA_NO_NODE;
++
++	return pxm_to_node(pxm);
++}
++#else
++static int xenbus_query_mfn_node(unsigned long mfn)
++{
++	return NUMA_NO_NODE;
++}
++#endif /* CONFIG_XEN_BACKEND_NUMA_AFFINITY */
++
+ struct xenbus_ring_ops {
+ 	int (*map)(struct xenbus_device *dev, struct map_ring_valloc *info,
+ 		   grant_ref_t *gnt_refs, unsigned int nr_grefs,
+@@ -678,6 +752,8 @@ static int xenbus_map_ring_hvm(struct xenbus_device *dev,
+ 	bool leaked = false;
+ 	unsigned int nr_pages = XENBUS_PAGES(nr_grefs);
+ 
++	node->host_node = NUMA_NO_NODE;
++
+ 	err = xen_alloc_unpopulated_pages(nr_pages, node->hvm.pages);
+ 	if (err)
+ 		goto out_err;
+@@ -693,6 +769,17 @@ static int xenbus_map_ring_hvm(struct xenbus_device *dev,
+ 	if (err)
+ 		goto out_free_ballooned_pages;
+ 
++	/*
++	 * Xen unconditionally fills dev_bus_addr with the foreign frame's
++	 * machine address on a successful host_map (see grant_table.c in
++	 * the hypervisor).  Pick up the first ring page's MFN and resolve
++	 * it now while we still have the map info; the result is cached on
++	 * the xenbus_map_node so backends can look it up cheaply later.
++	 */
++	if (nr_grefs > 0)
++		node->host_node = xenbus_query_mfn_node(
++			PFN_DOWN(info->map[0].dev_bus_addr));
++
+ 	addr = vmap(node->hvm.pages, nr_pages, VM_MAP | VM_IOREMAP,
+ 		    PAGE_KERNEL);
+ 	if (!addr) {
+@@ -743,6 +830,27 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
+ }
+ EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
+ 
++int xenbus_ring_host_node(struct xenbus_device *dev, void *vaddr)
++{
++	struct xenbus_map_node *node;
++	int node_id = NUMA_NO_NODE;
++
++	spin_lock(&xenbus_valloc_lock);
++	list_for_each_entry(node, &xenbus_valloc_pages, next) {
++		void *addr = xen_pv_domain() ? node->pv.area->addr
++					     : node->hvm.addr;
++
++		if (addr == vaddr) {
++			node_id = node->host_node;
++			break;
++		}
++	}
++	spin_unlock(&xenbus_valloc_lock);
++
++	return node_id;
++}
++EXPORT_SYMBOL_GPL(xenbus_ring_host_node);
++
+ #ifdef CONFIG_XEN_PV
+ static int map_ring_apply(pte_t *pte, unsigned long addr, void *data)
+ {
+@@ -763,6 +871,9 @@ static int xenbus_map_ring_pv(struct xenbus_device *dev,
+ 	bool leaked = false;
+ 	int err = -ENOMEM;
+ 
+	/* PV dom0 is not a NUMA-affinity target; mark the node unknown. */
++	node->host_node = NUMA_NO_NODE;
++
+ 	area = get_vm_area(XEN_PAGE_SIZE * nr_grefs, VM_IOREMAP);
+ 	if (!area)
+ 		return -ENOMEM;
+diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
+index 1a371a825c55..1998a12a9465 100644
+--- a/include/xen/interface/memory.h
++++ b/include/xen/interface/memory.h
+@@ -325,4 +325,30 @@ struct xen_mem_acquire_resource {
+ };
+ DEFINE_GUEST_HANDLE_STRUCT(xen_mem_acquire_resource);
+ 
++/*
++ * XENMEM_get_mfn_pxms: resolve a batch of host MFNs to their firmware
++ * proximity-domain identifiers (host PXM on x86 ACPI).
++ *
++ * Returned values are in the host PXM namespace (the same value space
++ * dom0's own SRAT uses), not Xen's internal node id.  Callers convert
++ * to a Linux node id with pxm_to_node().  Slots Xen has no node info
++ * for receive XEN_INVALID_NUMA_ID rather than failing the whole batch.
++ *
++ * Restricted to the hardware domain.  On hypervisors that do not
++ * provide this op (older or non-Edera Xen, or builds without
++ * CONFIG_NUMA), the hypercall returns -ENOSYS; callers treat that as
++ * "feature unavailable" and fall back to NUMA-oblivious behaviour.
++ */
++#define XENMEM_get_mfn_pxms 40
++
++#define XEN_INVALID_NUMA_ID (~(uint32_t)0)
++
++struct xen_get_mfn_pxms {
++	GUEST_HANDLE(xen_pfn_t) mfns;
++	GUEST_HANDLE(uint32_t) pxms;
++	uint32_t nr_mfns;
++	uint32_t flags;
++};
++DEFINE_GUEST_HANDLE_STRUCT(xen_get_mfn_pxms);
++
+ #endif /* __XEN_PUBLIC_MEMORY_H__ */
+diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
+index 7dab04cf4a36..18b902bf79ef 100644
+--- a/include/xen/xenbus.h
++++ b/include/xen/xenbus.h
+@@ -225,6 +225,19 @@ int xenbus_map_ring_valloc(struct xenbus_device *dev, grant_ref_t *gnt_refs,
+ 
+ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr);
+ 
++/*
++ * Return the host NUMA node (Linux node id) of the foreign frame
++ * backing the first page of a mapping previously established by
++ * xenbus_map_ring_valloc().  Returns NUMA_NO_NODE if the hypervisor
++ * cannot provide the information, the mapping is not found, or the
++ * kernel was built without CONFIG_XEN_BACKEND_NUMA_AFFINITY.
++ *
++ * Intended for backends placing their service threads and IRQs on
++ * the node hosting the ring.  The value is resolved at map time and
++ * cached on the mapping, so this call is a cheap lookup.
++ */
++int xenbus_ring_host_node(struct xenbus_device *dev, void *vaddr);
++
+ int xenbus_alloc_evtchn(struct xenbus_device *dev, evtchn_port_t *port);
+ int xenbus_free_evtchn(struct xenbus_device *dev, evtchn_port_t port);
+ 
+-- 
+2.54.0
+
diff --git a/patches/0003-xen-netback-place-per-queue-kthreads-and-IRQs-near-t.patch b/patches/0003-xen-netback-place-per-queue-kthreads-and-IRQs-near-t.patch
new file mode 100644
index 0000000..f5ceaeb
--- /dev/null
+++ b/patches/0003-xen-netback-place-per-queue-kthreads-and-IRQs-near-t.patch
@@ -0,0 +1,258 @@
+From d9b521a2e9bba0940078bb2a8d8052ef6871e57f Mon Sep 17 00:00:00 2001
+From: Steven Noonan <steven@edera.dev>
+Date: Tue, 19 May 2026 18:40:35 -0700
+Subject: [PATCH 3/9] xen-netback: place per-queue kthreads and IRQs near the
+ ring
+
+Today both xenvif kthreads (guest-rx and dealloc) and the per-queue
+event-channel IRQs run wherever the scheduler happened to place them,
+which in PVH dom0 is typically all on one CPU regardless of the
+guest's vNUMA layout.  When a guest's per-queue ring lives on a
+different host node from the kthread, every packet pays cross-node
+interconnect cost to walk the ring and to grant-copy payload pages.
+
+Use xenbus_ring_host_node() to find the host node hosting the
+queue's tx ring, then:
+
+ - Create the guest-rx and dealloc kthreads with
+   kthread_create_on_node() so the task_struct (and kernel stack) is
+   allocated on the target node from the start.  A bare
+   set_cpus_allowed_ptr() after kthread_run leaves the stack on the
+   caller's node and continues to pay cross-node cost on every
+   context switch.
+ - Pin each kthread to the node's cpumask with
+   set_cpus_allowed_ptr().  This is a hard pin; an operator can
+   still override with taskset.  No-op when no node info is
+   available.
+ - Bind each evtchn IRQ on the ring's node using
+   bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node() (and the
+   non-handler variant for the ctrl ring), so the underlying desc is
+   allocated with the right node attribute and irqbalance treats the
+   IRQ as NUMA-local rather than floating.  For the ctrl ring this
+   means resolving host_node before the bind instead of after.
+ - Steer the tx, rx, and ctrl IRQs with irq_set_affinity_and_hint().
+   This writes both the actual affinity and the affinity_hint, so a
+   fresh boot without irqbalance routes IRQs correctly and irqbalance
+   agrees if it is running.  Operator writes to
+   /proc/irq/N/smp_affinity still win.
+ - Clear the hint with irq_update_affinity_hint(irq, NULL) right
+   before each unbind_from_irqhandler() in xenvif_disconnect_queue
+   and xenvif_disconnect_ctrl.  free_irq() warns if the hint is still
+   set at teardown (kernel/irq/manage.c:1865); destroying a domU
+   would otherwise WARN in xenwatch context.
+
+When xenbus_ring_host_node() returns NUMA_NO_NODE (older Xen without
+the underlying hypercall, kernel built without
+CONFIG_XEN_BACKEND_NUMA_AFFINITY, or a PV mapping), every NUMA-aware
+step is skipped and the behaviour is byte-for-byte identical to the
+previous code path.
+
+Signed-off-by: Steven Noonan <steven@edera.dev>
+---
+ drivers/net/xen-netback/interface.c | 102 ++++++++++++++++++++++------
+ 1 file changed, 82 insertions(+), 20 deletions(-)
+
+diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
+index a0a438881388..798f77595529 100644
+--- a/drivers/net/xen-netback/interface.c
++++ b/drivers/net/xen-netback/interface.c
+@@ -31,6 +31,7 @@
+ #include "common.h"
+ 
+ #include <linux/kthread.h>
++#include <linux/numa.h>
+ #include <linux/sched/task.h>
+ #include <linux/ethtool.h>
+ #include <linux/rtnetlink.h>
+@@ -638,19 +639,41 @@ int xenvif_connect_ctrl(struct xenvif *vif, grant_ref_t ring_ref,
+ 	if (req_prod - rsp_prod > RING_SIZE(&vif->ctrl))
+ 		goto err_unmap;
+ 
+-	err = bind_interdomain_evtchn_to_irq_lateeoi(xendev, evtchn);
+-	if (err < 0)
+-		goto err_unmap;
++	{
++		/*
++		 * Resolve the host node before binding the IRQ so the
++		 * desc itself is allocated on the right node (which is
++		 * what irqbalance reads from /proc/irq/N/node when
++		 * deciding affinity).  Steer the threaded IRQ handler
++		 * toward the same node with irq_set_affinity_and_hint so
++		 * a fresh boot without irqbalance also routes correctly.
++		 * Operator writes to /proc/irq/N/smp_affinity still win.
++		 */
++		int node = xenbus_ring_host_node(xendev, vif->ctrl.sring);
+ 
+-	vif->ctrl_irq = err;
++		err = bind_interdomain_evtchn_to_irq_lateeoi_on_node(xendev,
++								     evtchn,
++								     node);
++		if (err < 0)
++			goto err_unmap;
+ 
+-	xenvif_init_hash(vif);
++		vif->ctrl_irq = err;
+ 
+-	err = request_threaded_irq(vif->ctrl_irq, NULL, xenvif_ctrl_irq_fn,
+-				   IRQF_ONESHOT, "xen-netback-ctrl", vif);
+-	if (err) {
+-		pr_warn("Could not setup irq handler for %s\n", dev->name);
+-		goto err_deinit;
++		xenvif_init_hash(vif);
++
++		err = request_threaded_irq(vif->ctrl_irq, NULL,
++					   xenvif_ctrl_irq_fn,
++					   IRQF_ONESHOT,
++					   "xen-netback-ctrl", vif);
++		if (err) {
++			pr_warn("Could not setup irq handler for %s\n",
++				dev->name);
++			goto err_deinit;
++		}
++
++		if (node != NUMA_NO_NODE)
++			irq_set_affinity_and_hint(vif->ctrl_irq,
++						  cpumask_of_node(node));
+ 	}
+ 
+ 	return 0;
+@@ -686,6 +709,11 @@ static void xenvif_disconnect_queue(struct xenvif_queue *queue)
+ 	}
+ 
+ 	if (queue->tx_irq) {
++		/*
++		 * free_irq() warns if affinity_hint is still set.  Drop the
++		 * hint installed at connect time before tearing the IRQ down.
++		 */
++		irq_update_affinity_hint(queue->tx_irq, NULL);
+ 		unbind_from_irqhandler(queue->tx_irq, queue);
+ 		if (queue->tx_irq == queue->rx_irq)
+ 			queue->rx_irq = 0;
+@@ -693,6 +721,7 @@ static void xenvif_disconnect_queue(struct xenvif_queue *queue)
+ 	}
+ 
+ 	if (queue->rx_irq) {
++		irq_update_affinity_hint(queue->rx_irq, NULL);
+ 		unbind_from_irqhandler(queue->rx_irq, queue);
+ 		queue->rx_irq = 0;
+ 	}
+@@ -708,6 +737,7 @@ int xenvif_connect_data(struct xenvif_queue *queue,
+ {
+ 	struct xenbus_device *dev = xenvif_to_xenbus_device(queue->vif);
+ 	struct task_struct *task;
++	int ring_node;
+ 	int err;
+ 
+ 	BUG_ON(queue->tx_irq);
+@@ -719,6 +749,16 @@ int xenvif_connect_data(struct xenvif_queue *queue,
+ 	if (err < 0)
+ 		goto err;
+ 
++	/*
++	 * Place the per-queue kthreads and IRQs on the host node hosting
++	 * the ring page.  Most of the per-packet work touches the ring;
++	 * keeping the worker local cuts cross-node interconnect traffic.
++	 * Returns NUMA_NO_NODE on hypervisors without XENMEM_get_mfn_pxms
++	 * or on a kernel built without CONFIG_XEN_BACKEND_NUMA_AFFINITY,
++	 * in which case the code below falls back to today's behaviour.
++	 */
++	ring_node = xenbus_ring_host_node(dev, queue->tx.sring);
++
+ 	init_waitqueue_head(&queue->wq);
+ 	init_waitqueue_head(&queue->dealloc_wq);
+ 	atomic_set(&queue->inflight_packets, 0);
+@@ -727,8 +767,14 @@ int xenvif_connect_data(struct xenvif_queue *queue,
+ 
+ 	queue->stalled = true;
+ 
+-	task = kthread_run(xenvif_kthread_guest_rx, queue,
+-			   "%s-guest-rx", queue->name);
++	/*
++	 * Split kthread create + wake so the task_struct (and its kernel
++	 * stack) is allocated on the target node from the start.  A bare
++	 * set_cpus_allowed_ptr after kthread_run leaves the stack on the
++	 * caller's node.
++	 */
++	task = kthread_create_on_node(xenvif_kthread_guest_rx, queue,
++				      ring_node, "%s-guest-rx", queue->name);
+ 	if (IS_ERR(task))
+ 		goto kthread_err;
+ 	queue->task = task;
+@@ -737,43 +783,58 @@ int xenvif_connect_data(struct xenvif_queue *queue,
+ 	 * if the thread function returns before kthread_stop is called.
+ 	 */
+ 	get_task_struct(task);
++	if (ring_node != NUMA_NO_NODE)
++		set_cpus_allowed_ptr(task, cpumask_of_node(ring_node));
++	wake_up_process(task);
+ 
+-	task = kthread_run(xenvif_dealloc_kthread, queue,
+-			   "%s-dealloc", queue->name);
++	task = kthread_create_on_node(xenvif_dealloc_kthread, queue,
++				      ring_node, "%s-dealloc", queue->name);
+ 	if (IS_ERR(task))
+ 		goto kthread_err;
+ 	queue->dealloc_task = task;
++	if (ring_node != NUMA_NO_NODE)
++		set_cpus_allowed_ptr(task, cpumask_of_node(ring_node));
++	wake_up_process(task);
+ 
+ 	if (tx_evtchn == rx_evtchn) {
+ 		/* feature-split-event-channels == 0 */
+-		err = bind_interdomain_evtchn_to_irqhandler_lateeoi(
++		err = bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node(
+ 			dev, tx_evtchn, xenvif_interrupt, 0,
+-			queue->name, queue);
++			queue->name, queue, ring_node);
+ 		if (err < 0)
+ 			goto err;
+ 		queue->tx_irq = queue->rx_irq = err;
+ 		disable_irq(queue->tx_irq);
++		if (ring_node != NUMA_NO_NODE)
++			irq_set_affinity_and_hint(queue->tx_irq,
++						  cpumask_of_node(ring_node));
+ 	} else {
+ 		/* feature-split-event-channels == 1 */
+ 		snprintf(queue->tx_irq_name, sizeof(queue->tx_irq_name),
+ 			 "%s-tx", queue->name);
+-		err = bind_interdomain_evtchn_to_irqhandler_lateeoi(
++		err = bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node(
+ 			dev, tx_evtchn, xenvif_tx_interrupt, 0,
+-			queue->tx_irq_name, queue);
++			queue->tx_irq_name, queue, ring_node);
+ 		if (err < 0)
+ 			goto err;
+ 		queue->tx_irq = err;
+ 		disable_irq(queue->tx_irq);
++		if (ring_node != NUMA_NO_NODE)
++			irq_set_affinity_and_hint(queue->tx_irq,
++						  cpumask_of_node(ring_node));
+ 
+ 		snprintf(queue->rx_irq_name, sizeof(queue->rx_irq_name),
+ 			 "%s-rx", queue->name);
+-		err = bind_interdomain_evtchn_to_irqhandler_lateeoi(
++		err = bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node(
+ 			dev, rx_evtchn, xenvif_rx_interrupt, 0,
+-			queue->rx_irq_name, queue);
++			queue->rx_irq_name, queue, ring_node);
+ 		if (err < 0)
+ 			goto err;
+ 		queue->rx_irq = err;
+ 		disable_irq(queue->rx_irq);
++		if (ring_node != NUMA_NO_NODE)
++			irq_set_affinity_and_hint(queue->rx_irq,
++						  cpumask_of_node(ring_node));
+ 	}
+ 
+ 	return 0;
+@@ -820,6 +881,7 @@ void xenvif_disconnect_ctrl(struct xenvif *vif)
+ {
+ 	if (vif->ctrl_irq) {
+ 		xenvif_deinit_hash(vif);
++		irq_update_affinity_hint(vif->ctrl_irq, NULL);
+ 		unbind_from_irqhandler(vif->ctrl_irq, vif);
+ 		vif->ctrl_irq = 0;
+ 	}
+-- 
+2.54.0
+
diff --git a/patches/0004-xen-blkback-place-per-ring-kthread-and-IRQ-near-the-.patch b/patches/0004-xen-blkback-place-per-ring-kthread-and-IRQ-near-the-.patch
new file mode 100644
index 0000000..38ddf46
--- /dev/null
+++ b/patches/0004-xen-blkback-place-per-ring-kthread-and-IRQ-near-the-.patch
@@ -0,0 +1,182 @@
+From dfc8f5b06ce2855798e26f1221a7c176b6ae9cd1 Mon Sep 17 00:00:00 2001
+From: Steven Noonan <steven@edera.dev>
+Date: Tue, 19 May 2026 18:50:03 -0700
+Subject: [PATCH 4/9] xen-blkback: place per-ring kthread and IRQ near the ring
+
+The xenblkd kthread and per-ring event-channel IRQ today run wherever
+the scheduler happens to place them, which in PVH dom0 is typically
+all on one CPU regardless of the guest's vNUMA layout.  When the
+guest's per-ring buffer lives on a different host node from the
+worker, every request pays cross-node interconnect cost to walk the
+ring and to grant-copy payload pages.
+
+The blkback flow is split across two call sites: xen_blkif_map() maps
+the ring and binds the event channel, while xen_update_blkif_status()
+spawns the xenblkd kthread later.  Stash the ring's host node on the
+xen_blkif_ring at map time so the kthread creation site can pick it
+up without re-querying.
+
+At map time:
+ - Call xenbus_ring_host_node() right after xenbus_map_ring_valloc()
+   succeeds.  Store the result in ring->host_node.
+ - Bind the IRQ on the ring's node using
+   bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node() so the
+   underlying desc is allocated with the right node attribute and
+   irqbalance treats the IRQ as NUMA-local.
+ - Steer the bound IRQ with irq_set_affinity_and_hint() so a fresh
+   boot without irqbalance routes correctly and irqbalance agrees if
+   it is running.  Operator writes still win.
+
+At kthread creation time:
+ - Use kthread_create_on_node() with ring->host_node so the
+   task_struct (and kernel stack) is allocated on the target node
+   from the start.  A bare set_cpus_allowed_ptr() after kthread_run
+   would leave the stack on the caller's node and pay cross-node
+   cost on every context switch.
+ - Hard-pin the kthread to the node's cpumask with
+   set_cpus_allowed_ptr().  An operator can still override with
+   taskset.
+
+At disconnect time:
+ - Clear the hint with irq_update_affinity_hint(ring->irq, NULL)
+   right before each unbind_from_irqhandler().  free_irq() warns if
+   the hint is still set at teardown (kernel/irq/manage.c:1865);
+   destroying a domU would otherwise WARN in xenwatch context.
+
+ring->host_node is initialised to NUMA_NO_NODE in xen_blkif_alloc_rings
+so any path that misses the map step (or runs on a hypervisor without
+XENMEM_get_mfn_pxms, or a kernel without CONFIG_XEN_BACKEND_NUMA_AFFINITY)
+falls back to today's NUMA-oblivious behaviour without further checks.
+
+Signed-off-by: Steven Noonan <steven@edera.dev>
+---
+ drivers/block/xen-blkback/common.h |  7 +++++
+ drivers/block/xen-blkback/xenbus.c | 48 ++++++++++++++++++++++++++++--
+ 2 files changed, 52 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
+index b427d54bc120..d670536ebb9e 100644
+--- a/drivers/block/xen-blkback/common.h
++++ b/drivers/block/xen-blkback/common.h
+@@ -297,6 +297,13 @@ struct xen_blkif_ring {
+ 	/* Thread shutdown wait queue. */
+ 	wait_queue_head_t	shutdown_wq;
+ 	struct xen_blkif	*blkif;
++
++	/*
++	 * Linux node id of the host NUMA node hosting blk_ring, or
++	 * NUMA_NO_NODE if unknown.  Resolved at xen_blkif_map() time,
++	 * consumed when starting the per-ring xenblkd kthread.
++	 */
++	int			host_node;
+ };
+ 
+ struct xen_blkif {
+diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
+index 0621878940ae..7ded88e97472 100644
+--- a/drivers/block/xen-blkback/xenbus.c
++++ b/drivers/block/xen-blkback/xenbus.c
+@@ -10,6 +10,7 @@
+ 
+ #include <linux/module.h>
+ #include <linux/kthread.h>
++#include <linux/numa.h>
+ #include <linux/pagemap.h>
+ #include <xen/events.h>
+ #include <xen/grant_table.h>
+@@ -108,7 +109,17 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
+ 
+ 	for (i = 0; i < blkif->nr_rings; i++) {
+ 		ring = &blkif->rings[i];
+-		ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i);
++		/*
++		 * Allocate the task_struct (including its kernel stack) on
++		 * the node hosting the ring so that subsequent kernel-mode
++		 * accesses on this thread stay local.  set_cpus_allowed_ptr
++		 * after kthread_create_on_node hard-pins to that node's CPUs;
++		 * an operator may still override with taskset.  NUMA_NO_NODE
++		 * leaves placement to the default scheduler.
++		 */
++		ring->xenblkd = kthread_create_on_node(xen_blkif_schedule, ring,
++						       ring->host_node,
++						       "%s-%d", name, i);
+ 		if (IS_ERR(ring->xenblkd)) {
+ 			err = PTR_ERR(ring->xenblkd);
+ 			ring->xenblkd = NULL;
+@@ -116,6 +127,10 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
+ 					"start %s-%d xenblkd", name, i);
+ 			goto out;
+ 		}
++		if (ring->host_node != NUMA_NO_NODE)
++			set_cpus_allowed_ptr(ring->xenblkd,
++					     cpumask_of_node(ring->host_node));
++		wake_up_process(ring->xenblkd);
+ 	}
+ 	return;
+ 
+@@ -150,6 +165,7 @@ static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
+ 		init_waitqueue_head(&ring->pending_free_wq);
+ 		init_waitqueue_head(&ring->shutdown_wq);
+ 		ring->blkif = blkif;
++		ring->host_node = NUMA_NO_NODE;
+ 		ring->st_print = jiffies;
+ 		ring->active = true;
+ 	}
+@@ -207,6 +223,15 @@ static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref,
+ 	if (err < 0)
+ 		return err;
+ 
++	/*
++	 * Stash the host node now while the mapping is fresh.  The
++	 * xenblkd kthread is created later from xen_update_blkif_status()
++	 * and consumes this value to place the worker on the node that
++	 * owns the ring.  NUMA_NO_NODE leaves placement to the default
++	 * scheduler.
++	 */
++	ring->host_node = xenbus_ring_host_node(blkif->be->dev, ring->blk_ring);
++
+ 	sring_common = (struct blkif_common_sring *)ring->blk_ring;
+ 	rsp_prod = READ_ONCE(sring_common->rsp_prod);
+ 	req_prod = READ_ONCE(sring_common->req_prod);
+@@ -250,12 +275,23 @@ static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref,
+ 	if (req_prod - rsp_prod > size)
+ 		goto fail;
+ 
+-	err = bind_interdomain_evtchn_to_irqhandler_lateeoi(blkif->be->dev,
+-			evtchn, xen_blkif_be_int, 0, "blkif-backend", ring);
++	err = bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node(
++			blkif->be->dev, evtchn, xen_blkif_be_int, 0,
++			"blkif-backend", ring, ring->host_node);
+ 	if (err < 0)
+ 		goto fail;
+ 	ring->irq = err;
+ 
++	/*
++	 * Route the event channel toward the ring's host node.  Writes
++	 * both the actual affinity (relied on at boot when irqbalance is
++	 * absent) and the hint (so irqbalance agrees if it is running).
++	 * Operator writes to /proc/irq/N/smp_affinity still win.
++	 */
++	if (ring->host_node != NUMA_NO_NODE)
++		irq_set_affinity_and_hint(ring->irq,
++					  cpumask_of_node(ring->host_node));
++
+ 	return 0;
+ 
+ fail:
+@@ -293,6 +329,12 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
+ 		}
+ 
+ 		if (ring->irq) {
++			/*
++			 * free_irq() warns if affinity_hint is still set.
++			 * Drop the hint installed at map time before tearing
++			 * the IRQ down.
++			 */
++			irq_update_affinity_hint(ring->irq, NULL);
+ 			unbind_from_irqhandler(ring->irq, ring);
+ 			ring->irq = 0;
+ 		}
+-- 
+2.54.0
+
diff --git a/patches/0005-xen-make-xen_alloc_unpopulated_pages-NUMA-aware.patch b/patches/0005-xen-make-xen_alloc_unpopulated_pages-NUMA-aware.patch
new file mode 100644
index 0000000..d179c44
--- /dev/null
+++ b/patches/0005-xen-make-xen_alloc_unpopulated_pages-NUMA-aware.patch
@@ -0,0 +1,358 @@
+From aa8e71b14c040931222a36998a789e4ce3484c70 Mon Sep 17 00:00:00 2001
+From: Steven Noonan <steven@edera.dev>
+Date: Tue, 19 May 2026 18:56:34 -0700
+Subject: [PATCH 5/9] xen: make xen_alloc_unpopulated_pages NUMA-aware
+
+xen_alloc_unpopulated_pages today hands out ZONE_DEVICE placeholder
+pages from a single global free list, with the underlying section
+registered via memremap_pages(pgmap, NUMA_NO_NODE).  memremap_pages()
+falls back to the filling CPU's node, so page_to_nid() of such a page
+is arbitrary and every consumer of foreign-mapped grant pages -- xenbus
+rings, gntdev mmaps, xlate_mmu, privcmd, xen-drm -- has no useful node
+information for kernel code that consults page_to_nid (slab placement
+decisions, autonuma, numastat, various softirq routing heuristics).
+
+Partition the free list by Linux node id and register each
+section-aligned IOMEM allocation against a specific node.  Pages
+drawn from page_list[N] then report page_to_nid() == N by
+construction.  Callers that know the host node of the foreign frame
+they intend to grant-map can request from the matching pool; the
+foreign frame's node and the placeholder page's nid then agree.
+
+Two exported entry points:
+
+  xen_alloc_unpopulated_pages_node(nr, pages, node) -- new, takes an
+    explicit node preference.  NUMA_NO_NODE and out-of-range values
+    are clamped to numa_node_id() at the boundary; internal code only
+    ever indexes into a valid slot.
+
+  xen_alloc_unpopulated_pages(nr, pages) -- existing API, now a
+    wrapper that passes numa_node_id().  Strictly an improvement over
+    NUMA_NO_NODE for non-xenbus callers (gntdev, xlate_mmu, privcmd,
+    drm) that have no node hint of their own: the caller's local node
+    is at least as good a guess as none.
+
+xen_free_unpopulated_pages routes each page back to the pool of the
+node it came from via page_to_nid().  No new arguments; existing
+callers do not change.
+
+A single mutex still covers all per-node lists.  Allocation and free
+are connect-time events on I/O backends and on map_grant_ref ioctls,
+not contention-sensitive paths.
+
+MAX_NUMNODES sizes the two arrays; on CONFIG_NUMA=n that is 1 and the
+behaviour is byte-for-byte equivalent to the previous single-list
+implementation.
+
+Wire xenbus_map_ring_hvm() through the new API.  Placeholders are
+first drawn from numa_node_id() because the foreign MFN is not known
+before the grant_map.  After grant_map succeeds and the first page's
+host node is known via XENMEM_get_mfn_pxms, the placeholders are
+relocated: unmap, free, re-allocate on the target node, re-map.  When
+the initial allocation already landed on the right node the relocate
+is skipped.
+
+Signed-off-by: Steven Noonan <steven@edera.dev>
+---
+ drivers/xen/unpopulated-alloc.c    | 103 +++++++++++++++++++++++------
+ drivers/xen/xenbus/xenbus_client.c |  60 +++++++++++++++++
+ include/xen/xen.h                  |   7 ++
+ 3 files changed, 149 insertions(+), 21 deletions(-)
+
+diff --git a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c
+index 1dc0b495c8e5..ea9c76895459 100644
+--- a/drivers/xen/unpopulated-alloc.c
++++ b/drivers/xen/unpopulated-alloc.c
+@@ -4,6 +4,7 @@
+ #include <linux/kernel.h>
+ #include <linux/mm.h>
+ #include <linux/memremap.h>
++#include <linux/numa.h>
+ #include <linux/slab.h>
+ 
+ #include <asm/page.h>
+@@ -12,12 +13,31 @@
+ #include <xen/page.h>
+ #include <xen/xen.h>
+ 
++/*
++ * Free pages are kept on per-node lists indexed by Linux node id.  Each
++ * fill_list() call grabs a fresh section-aligned IOMEM region and
++ * registers it with memremap_pages() against a specific node, so all
++ * struct pages in that section report that node via page_to_nid().
++ * Backends that learn the host node of a foreign frame (via
++ * xenbus_ring_host_node) can request placeholder pages from the
++ * matching pool so page_to_nid agrees with the actual host placement.
++ *
++ * A single mutex covers all per-node lists.  Alloc/free are
++ * connect-time events on the I/O backends and not contention-sensitive.
++ */
+ static DEFINE_MUTEX(list_lock);
+-static struct page *page_list;
+-static unsigned int list_count;
++static struct page *page_list[MAX_NUMNODES];
++static unsigned int list_count[MAX_NUMNODES];
+ 
+ static struct resource *target_resource;
+ 
++static int xen_unpopulated_clamp_node(int node)
++{
++	if (node == NUMA_NO_NODE || node < 0 || node >= MAX_NUMNODES)
++		return numa_node_id();
++	return node;
++}
++
+ /* Pages to subtract from the memory count when setting balloon target. */
+ unsigned long xen_unpopulated_pages __initdata;
+ 
+@@ -34,7 +54,7 @@ int __weak __init arch_xen_unpopulated_init(struct resource **res)
+ 	return 0;
+ }
+ 
+-static int fill_list(unsigned int nr_pages)
++static int fill_list(unsigned int nr_pages, int node)
+ {
+ 	struct dev_pagemap *pgmap;
+ 	struct resource *res, *tmp_res = NULL;
+@@ -121,7 +141,15 @@ static int fill_list(unsigned int nr_pages)
+ 	}
+ #endif
+ 
+-	vaddr = memremap_pages(pgmap, NUMA_NO_NODE);
++	/*
++	 * Register the section against @node so page_to_nid() of any
++	 * page in this section reports that value.  Grant-map operations
++	 * later install foreign MFNs into these slots; as long as the
++	 * caller picks the section matching the foreign MFN's host node
++	 * (which is the contract callers of xen_alloc_unpopulated_pages_node
++	 * are expected to honour), page_to_nid is correct by construction.
++	 */
++	vaddr = memremap_pages(pgmap, node);
+ 	if (IS_ERR(vaddr)) {
+ 		pr_err("Cannot remap memory range\n");
+ 		ret = PTR_ERR(vaddr);
+@@ -131,9 +159,9 @@ static int fill_list(unsigned int nr_pages)
+ 	for (i = 0; i < alloc_pages; i++) {
+ 		struct page *pg = virt_to_page(vaddr + PAGE_SIZE * i);
+ 
+-		pg->zone_device_data = page_list;
+-		page_list = pg;
+-		list_count++;
++		pg->zone_device_data = page_list[node];
++		page_list[node] = pg;
++		list_count[node]++;
+ 	}
+ 
+ 	return 0;
+@@ -153,12 +181,21 @@ static int fill_list(unsigned int nr_pages)
+ }
+ 
+ /**
+- * xen_alloc_unpopulated_pages - alloc unpopulated pages
++ * xen_alloc_unpopulated_pages_node - alloc unpopulated pages on a node
+  * @nr_pages: Number of pages
+  * @pages: pages returned
+- * @return 0 on success, error otherwise
++ * @node: Preferred Linux node id, or NUMA_NO_NODE for current CPU's node
++ *
++ * When the unpopulated pool is in use, pages are drawn from a per-node
++ * list registered with memremap_pages() against @node, so page_to_nid()
++ * reports @node for each.  Without a target resource the call falls back
++ * to ballooned pages and @node is ignored.  Callers that know a foreign
++ * frame's host node should pass it so page_to_nid matches after grant-map.
++ *
++ * Returns 0 on success, error otherwise.
+  */
+-int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages)
++int xen_alloc_unpopulated_pages_node(unsigned int nr_pages, struct page **pages,
++				     int node)
+ {
+ 	unsigned int i;
+ 	int ret = 0;
+@@ -171,19 +208,21 @@ int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages)
+ 	if (!target_resource)
+ 		return xen_alloc_ballooned_pages(nr_pages, pages);
+ 
++	node = xen_unpopulated_clamp_node(node);
++
+ 	mutex_lock(&list_lock);
+-	if (list_count < nr_pages) {
+-		ret = fill_list(nr_pages - list_count);
++	if (list_count[node] < nr_pages) {
++		ret = fill_list(nr_pages - list_count[node], node);
+ 		if (ret)
+ 			goto out;
+ 	}
+ 
+ 	for (i = 0; i < nr_pages; i++) {
+-		struct page *pg = page_list;
++		struct page *pg = page_list[node];
+ 
+ 		BUG_ON(!pg);
+-		page_list = pg->zone_device_data;
+-		list_count--;
++		page_list[node] = pg->zone_device_data;
++		list_count[node]--;
+ 		pages[i] = pg;
+ 
+ #ifdef CONFIG_XEN_HAVE_PVMMU
+@@ -193,9 +232,9 @@ int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages)
+ 				unsigned int j;
+ 
+ 				for (j = 0; j <= i; j++) {
+-					pages[j]->zone_device_data = page_list;
+-					page_list = pages[j];
+-					list_count++;
++					pages[j]->zone_device_data = page_list[node];
++					page_list[node] = pages[j];
++					list_count[node]++;
+ 				}
+ 				goto out;
+ 			}
+@@ -207,12 +246,32 @@ int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages)
+ 	mutex_unlock(&list_lock);
+ 	return ret;
+ }
++EXPORT_SYMBOL(xen_alloc_unpopulated_pages_node);
++
++/**
++ * xen_alloc_unpopulated_pages - alloc unpopulated pages
++ * @nr_pages: Number of pages
++ * @pages: pages returned
++ * @return 0 on success, error otherwise
++ *
++ * Equivalent to xen_alloc_unpopulated_pages_node() with the current
++ * CPU's node as the preference.
++ */
++int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages)
++{
++	return xen_alloc_unpopulated_pages_node(nr_pages, pages, numa_node_id());
++}
+ EXPORT_SYMBOL(xen_alloc_unpopulated_pages);
+ 
+ /**
+  * xen_free_unpopulated_pages - return unpopulated pages
+  * @nr_pages: Number of pages
+  * @pages: pages to return
++ *
++ * Each page returns to the pool of the node it was originally allocated
++ * from, identified via page_to_nid().  Sections registered to a
++ * specific node yield pages whose nid reflects that node, so freed
++ * pages naturally land back in the matching list.
+  */
+ void xen_free_unpopulated_pages(unsigned int nr_pages, struct page **pages)
+ {
+@@ -225,9 +284,11 @@ void xen_free_unpopulated_pages(unsigned int nr_pages, struct page **pages)
+ 
+ 	mutex_lock(&list_lock);
+ 	for (i = 0; i < nr_pages; i++) {
+-		pages[i]->zone_device_data = page_list;
+-		page_list = pages[i];
+-		list_count++;
++		int node = xen_unpopulated_clamp_node(page_to_nid(pages[i]));
++
++		pages[i]->zone_device_data = page_list[node];
++		page_list[node] = pages[i];
++		list_count[node]++;
+ 	}
+ 	mutex_unlock(&list_lock);
+ }
+diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
+index 8e0695ba39a3..7f94b135727a 100644
+--- a/drivers/xen/xenbus/xenbus_client.c
++++ b/drivers/xen/xenbus/xenbus_client.c
+@@ -780,6 +780,66 @@ static int xenbus_map_ring_hvm(struct xenbus_device *dev,
+ 		node->host_node = xenbus_query_mfn_node(
+ 			PFN_DOWN(info->map[0].dev_bus_addr));
+ 
++	/*
++	 * Placeholder pages came from numa_node_id()'s pool, which only
++	 * matches the foreign frame's node by coincidence.  If they
++	 * disagree, drop the mapping, return the placeholders, and redo
++	 * the map with placeholders drawn from the correct pool.  After
++	 * this, page_to_nid() of every ring page equals the host node of
++	 * its foreign MFN by construction, which keeps grant-mapped pages
++	 * truthful to every NUMA-aware code path that consults page_to_nid.
++	 *
++	 * The cost is one extra grant unmap + map pair per backend
++	 * connect (a rare event) and is paid only when the placeholder
++	 * pool's node disagrees with the foreign frame.  PV mappings and
++	 * cases where Xen cannot supply node info skip the dance entirely
++	 * (host_node stays NUMA_NO_NODE).
++	 */
++	if (node->host_node != NUMA_NO_NODE &&
++	    page_to_nid(node->hvm.pages[0]) != node->host_node) {
++		int relocate_err;
++
++		relocate_err = xenbus_unmap_ring(dev, node->handles, nr_grefs,
++						 info->addrs);
++		if (relocate_err != GNTST_okay) {
++			/*
++			 * Partial unmap: at least one grant may still be
++			 * live against a placeholder we can no longer
++			 * reach safely.  Mark the pages leaked and fail
++			 * the whole map.
++			 */
++			leaked = true;
++			err = -EIO;
++			goto out_free_ballooned_pages;
++		}
++
++		xen_free_unpopulated_pages(nr_pages, node->hvm.pages);
++
++		err = xen_alloc_unpopulated_pages_node(nr_pages,
++						       node->hvm.pages,
++						       node->host_node);
++		if (err) {
++			/*
++			 * Pages already gone; clear the array so the
++			 * cleanup path does not try to free them again.
++			 */
++			memset(node->hvm.pages, 0,
++			       nr_pages * sizeof(*node->hvm.pages));
++			node->nr_handles = 0;
++			goto out_err;
++		}
++
++		info->idx = 0;
++		gnttab_foreach_grant(node->hvm.pages, nr_grefs,
++				     xenbus_map_ring_setup_grant_hvm,
++				     info);
++
++		err = __xenbus_map_ring(dev, gnt_ref, nr_grefs, node->handles,
++					info, GNTMAP_host_map, &leaked);
++		if (err)
++			goto out_free_ballooned_pages;
++	}
++
+ 	addr = vmap(node->hvm.pages, nr_pages, VM_MAP | VM_IOREMAP,
+ 		    PAGE_KERNEL);
+ 	if (!addr) {
+diff --git a/include/xen/xen.h b/include/xen/xen.h
+index f280c5dcf923..f38cb138d837 100644
+--- a/include/xen/xen.h
++++ b/include/xen/xen.h
+@@ -70,6 +70,8 @@ extern u64 xen_saved_max_mem_size;
+ 
+ #ifdef CONFIG_XEN_UNPOPULATED_ALLOC
+ extern unsigned long xen_unpopulated_pages;
++int xen_alloc_unpopulated_pages_node(unsigned int nr_pages, struct page **pages,
++				     int node);
+ int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages);
+ void xen_free_unpopulated_pages(unsigned int nr_pages, struct page **pages);
+ #include <linux/ioport.h>
+@@ -77,6 +79,11 @@ int arch_xen_unpopulated_init(struct resource **res);
+ #else
+ #define xen_unpopulated_pages 0UL
+ #include <xen/balloon.h>
++static inline int xen_alloc_unpopulated_pages_node(unsigned int nr_pages,
++		struct page **pages, int node)
++{
++	return xen_alloc_ballooned_pages(nr_pages, pages);
++}
+ static inline int xen_alloc_unpopulated_pages(unsigned int nr_pages,
+ 		struct page **pages)
+ {
+-- 
+2.54.0
+
diff --git a/patches/0006-xen-xenbus-collapse-xenbus_ring_host_node-to-a-page_.patch b/patches/0006-xen-xenbus-collapse-xenbus_ring_host_node-to-a-page_.patch
new file mode 100644
index 0000000..d872882
--- /dev/null
+++ b/patches/0006-xen-xenbus-collapse-xenbus_ring_host_node-to-a-page_.patch
@@ -0,0 +1,158 @@
+From 14a4b2b5f12fa8359b84e48ec01586285d1f71e9 Mon Sep 17 00:00:00 2001
+From: Steven Noonan <steven@edera.dev>
+Date: Tue, 19 May 2026 19:09:31 -0700
+Subject: [PATCH 6/9] xen/xenbus: collapse xenbus_ring_host_node to a
+ page_to_nid wrapper
+
+Now that xen_alloc_unpopulated_pages places ring placeholders on the
+foreign frame's host node and xenbus_map_ring_hvm relocates the
+placeholders post-map when they land in the wrong pool, page_to_nid
+of any vmap'd ring page in PVH dom0 already reports the host node of
+its foreign MFN.  xenbus_ring_host_node can therefore drop the
+list-walk + cached integer + hypercall plumbing and become a thin
+wrapper around vmalloc_to_page() + page_to_nid().
+
+The XENMEM_get_mfn_pxms hypercall is still issued at map time inside
+xenbus_map_ring_hvm to learn the foreign frame's host node before
+deciding whether to relocate.  The decision now uses a local variable
+rather than caching the value on struct xenbus_map_node, so the
+host_node field is removed entirely.
+
+PV dom0 keeps the early NUMA_NO_NODE return: PV mappings install
+foreign MFNs directly in the PTEs and have no struct page in dom0's
+mem_map for the foreign frame, so vmalloc_to_page on a PV ring vaddr
+returns the placeholder for the wrong frame (or NULL).  PV dom0 is
+not a NUMA-affinity target on Edera in any case.
+
+Backend call sites do not change: netback and blkback continue to
+call xenbus_ring_host_node exactly as before.  The work moves from
+"walk a list keyed on vaddr, return a cached integer set at map time"
+to "look up the struct page for vaddr, return its nid".  Cheaper at
+runtime; no xenbus_valloc_lock; same value when Xen reports a node.
+
+Signed-off-by: Steven Noonan <steven@edera.dev>
+---
+ drivers/xen/xenbus/xenbus_client.c | 55 ++++++++++++++----------------
+ 1 file changed, 25 insertions(+), 30 deletions(-)
+
+diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
+index 7f94b135727a..f357c4a0372f 100644
+--- a/drivers/xen/xenbus/xenbus_client.c
++++ b/drivers/xen/xenbus/xenbus_client.c
+@@ -74,7 +74,6 @@ struct xenbus_map_node {
+ 	};
+ 	grant_handle_t handles[XENBUS_MAX_RING_GRANTS];
+ 	unsigned int   nr_handles;
+-	int            host_node;	/* Linux node id of foreign frame, or NUMA_NO_NODE */
+ };
+ 
+ struct map_ring_valloc {
+@@ -748,12 +747,11 @@ static int xenbus_map_ring_hvm(struct xenbus_device *dev,
+ {
+ 	struct xenbus_map_node *node = info->node;
+ 	int err;
++	int host_node = NUMA_NO_NODE;
+ 	void *addr;
+ 	bool leaked = false;
+ 	unsigned int nr_pages = XENBUS_PAGES(nr_grefs);
+ 
+-	node->host_node = NUMA_NO_NODE;
+-
+ 	err = xen_alloc_unpopulated_pages(nr_pages, node->hvm.pages);
+ 	if (err)
+ 		goto out_err;
+@@ -770,14 +768,13 @@ static int xenbus_map_ring_hvm(struct xenbus_device *dev,
+ 		goto out_free_ballooned_pages;
+ 
+ 	/*
+-	 * Xen unconditionally fills dev_bus_addr with the foreign frame's
+-	 * machine address on a successful host_map (see grant_table.c in
+-	 * the hypervisor).  Pick up the first ring page's MFN and resolve
+-	 * it now while we still have the map info; the result is cached on
+-	 * the xenbus_map_node so backends can look it up cheaply later.
++	 * Xen fills dev_bus_addr with the foreign frame's machine
++	 * address on a successful host_map (see grant_table.c in the
++	 * hypervisor).  Resolve the host node now so we know whether the
++	 * placeholders need to be relocated below.
+ 	 */
+ 	if (nr_grefs > 0)
+-		node->host_node = xenbus_query_mfn_node(
++		host_node = xenbus_query_mfn_node(
+ 			PFN_DOWN(info->map[0].dev_bus_addr));
+ 
+ 	/*
+@@ -792,11 +789,10 @@ static int xenbus_map_ring_hvm(struct xenbus_device *dev,
+ 	 * The cost is one extra grant unmap + map pair per backend
+ 	 * connect (a rare event) and is paid only when the placeholder
+ 	 * pool's node disagrees with the foreign frame.  PV mappings and
+-	 * cases where Xen cannot supply node info skip the dance entirely
+-	 * (host_node stays NUMA_NO_NODE).
++	 * cases where Xen cannot supply node info skip the dance entirely.
+ 	 */
+-	if (node->host_node != NUMA_NO_NODE &&
+-	    page_to_nid(node->hvm.pages[0]) != node->host_node) {
++	if (host_node != NUMA_NO_NODE &&
++	    page_to_nid(node->hvm.pages[0]) != host_node) {
+ 		int relocate_err;
+ 
+ 		relocate_err = xenbus_unmap_ring(dev, node->handles, nr_grefs,
+@@ -817,7 +813,7 @@ static int xenbus_map_ring_hvm(struct xenbus_device *dev,
+ 
+ 		err = xen_alloc_unpopulated_pages_node(nr_pages,
+ 						       node->hvm.pages,
+-						       node->host_node);
++						       host_node);
+ 		if (err) {
+ 			/*
+ 			 * Pages already gone; clear the array so the
+@@ -892,22 +888,24 @@ EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
+ 
+ int xenbus_ring_host_node(struct xenbus_device *dev, void *vaddr)
+ {
+-	struct xenbus_map_node *node;
+-	int node_id = NUMA_NO_NODE;
++	struct page *page;
+ 
+-	spin_lock(&xenbus_valloc_lock);
+-	list_for_each_entry(node, &xenbus_valloc_pages, next) {
+-		void *addr = xen_pv_domain() ? node->pv.area->addr
+-					     : node->hvm.addr;
++	/*
++	 * PV mappings install foreign MFNs directly in the PTEs and have
++	 * no struct page in dom0's mem_map for the foreign frame.  PVH
++	 * dom0 keeps a placeholder struct page (allocated from the
++	 * matching per-node pool of xen_alloc_unpopulated_pages_node)
++	 * whose page_to_nid() reports the host node of the foreign frame
++	 * by construction.
++	 */
++	if (xen_pv_domain())
++		return NUMA_NO_NODE;
+ 
+-		if (addr == vaddr) {
+-			node_id = node->host_node;
+-			break;
+-		}
+-	}
+-	spin_unlock(&xenbus_valloc_lock);
++	page = vmalloc_to_page(vaddr);
++	if (!page)
++		return NUMA_NO_NODE;
+ 
+-	return node_id;
++	return page_to_nid(page);
+ }
+ EXPORT_SYMBOL_GPL(xenbus_ring_host_node);
+ 
+@@ -931,9 +929,6 @@ static int xenbus_map_ring_pv(struct xenbus_device *dev,
+ 	bool leaked = false;
+ 	int err = -ENOMEM;
+ 
+-	/* PV dom0 is not a NUMA-affinity target; leave the value unset. */
+-	node->host_node = NUMA_NO_NODE;
+-
+ 	area = get_vm_area(XEN_PAGE_SIZE * nr_grefs, VM_IOREMAP);
+ 	if (!area)
+ 		return -ENOMEM;
+-- 
+2.54.0
+
diff --git a/patches/0007-xen-xenbus-add-xenbus_setup_ring_node-for-per-node-r.patch b/patches/0007-xen-xenbus-add-xenbus_setup_ring_node-for-per-node-r.patch
new file mode 100644
index 0000000..188df71
--- /dev/null
+++ b/patches/0007-xen-xenbus-add-xenbus_setup_ring_node-for-per-node-r.patch
@@ -0,0 +1,217 @@
+From 150666b04dd7ebdef6641d9a09f0d90bfbf801aa Mon Sep 17 00:00:00 2001
+From: Steven Noonan <steven@edera.dev>
+Date: Tue, 19 May 2026 19:17:39 -0700
+Subject: [PATCH 7/9] xen/xenbus: add xenbus_setup_ring_node for per-node ring
+ allocation
+
+Frontend drivers today route every ring allocation through
+xenbus_setup_ring, which calls alloc_pages_exact with no node
+preference.  The result is that every PV ring -- across all queues
+of all multi-queue devices on a guest -- ends up on whichever node
+the xenbus watch handler happens to run on, typically a single fixed
+value at boot.  Multi-queue devices on multi-vnode guests therefore
+defeat the dom0 backend's per-queue NUMA affinity work: all of dom0's
+backend kthreads cluster on one host node because all of the guest's
+rings live on one guest node, which maps to one host node.
+
+Add a node-aware variant.  xenbus_setup_ring_node(dev, gfp, node, ...)
+takes a Linux node id and draws the ring pages from that node's buddy
+list.  xenbus_setup_ring() is now a thin wrapper that passes
+NUMA_NO_NODE, preserving existing behaviour for every caller until
+they opt in.
+
+The same-ring locality property is preserved by construction.  A
+single buddy allocation comes from a single node's free list, so all
+pages of one ring remain on one node regardless of which node was
+requested.  The new variant only changes which node that is.
+
+alloc_pages_exact_nid is __meminit-restricted and not exported, so
+the body cannot just delegate to it.  Use alloc_pages_node() (which
+is exported and runtime-safe) to get an order-N block on the target
+node, then split_page() it so every subpage carries an independent
+refcount, and free any tail pages beyond ring_size back to the
+allocator.
+
+Also add xenbus_node_for_queue(index), a helper that rotates over the
+set of nodes with online CPUs.  Frontend callers feed it a per-queue
+or per-ring index to pick the node they pass to
+xenbus_setup_ring_node.  The natural shape is
+cpumask_local_spread(i, NUMA_NO_NODE), but with a NUMA_NO_NODE node
+argument that falls back to a linear walk of cpu_online_mask (see
+sched_numa_find_nth_cpu) and collapses every queue onto the first
+node's CPUs.  Going through num_node_state(N_CPU) and
+for_each_node_state(node, N_CPU) actually rotates.  Living in
+xenbus_client.c rather than open-coded in each driver lets future
+scsifront / pvcalls-front style frontends pick it up for free.
+
+Signed-off-by: Steven Noonan <steven@edera.dev>
+---
+ drivers/xen/xenbus/xenbus_client.c | 105 ++++++++++++++++++++++++++---
+ include/xen/xenbus.h               |   4 ++
+ 2 files changed, 99 insertions(+), 10 deletions(-)
+
+diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
+index f357c4a0372f..7419a8183903 100644
+--- a/drivers/xen/xenbus/xenbus_client.c
++++ b/drivers/xen/xenbus/xenbus_client.c
+@@ -31,6 +31,7 @@
+  */
+ 
+ #include <linux/mm.h>
++#include <linux/nodemask.h>
+ #include <linux/numa.h>
+ #include <linux/slab.h>
+ #include <linux/types.h>
+@@ -448,33 +449,103 @@ static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
+ }
+ 
+ /*
+- * xenbus_setup_ring
++ * xenbus_setup_ring_node
+  * @dev: xenbus device
++ * @gfp: GFP flags for the allocation
++ * @node: preferred Linux node id for the ring pages, or NUMA_NO_NODE
+  * @vaddr: pointer to starting virtual address of the ring
+  * @nr_pages: number of pages to be granted
+  * @grefs: grant reference array to be filled in
+  *
+- * Allocate physically contiguous pages for a shared ring buffer and grant it
+- * to the peer of the given device. The ring buffer is initially filled with
+- * zeroes. The virtual address of the ring is stored at @vaddr and the
+- * grant references are stored in the @grefs array. In case of error @vaddr
+- * will be set to NULL and @grefs will be filled with INVALID_GRANT_REF.
++ * Same contract as xenbus_setup_ring(), but the ring pages are drawn
++ * from @node's buddy free list when possible (subject to fallback when
++ * @node has no available memory).  All pages of a single ring come
++ * from one buddy allocation so they remain on a single node by
++ * construction, which is the property frontends rely on to keep
++ * per-queue rings on per-queue nodes.
++ *
++ * The ring buffer is initially filled with zeroes.  The virtual address
++ * of the ring is stored at @vaddr and the grant references are stored
++ * in the @grefs array.  In case of error @vaddr will be set to NULL and
++ * @grefs will be filled with INVALID_GRANT_REF.
+  */
+-int xenbus_setup_ring(struct xenbus_device *dev, gfp_t gfp, void **vaddr,
+-		      unsigned int nr_pages, grant_ref_t *grefs)
++/*
++ * Pick a Linux node id from the set of nodes with online CPUs, cycling
++ * by @index.  Frontends use this to distribute per-queue rings across
++ * guest NUMA nodes so the dom0 backend's per-ring placement lands them
++ * on distinct host nodes.
++ *
++ * cpumask_local_spread(i, NUMA_NO_NODE) is the natural shape this code
++ * wants, but with a NUMA_NO_NODE node argument it falls back to a
++ * straight linear walk of cpu_online_mask (see sched_numa_find_nth_cpu)
++ * which collapses every queue onto the first node's CPUs.  This helper
++ * actually rotates over nodes.
++ */
++int xenbus_node_for_queue(unsigned int index)
++{
++	unsigned int idx = 0;
++	unsigned int n;
++	int node;
++
++	n = num_node_state(N_CPU);
++	if (n == 0)
++		return NUMA_NO_NODE;
++
++	index %= n;
++	for_each_node_state(node, N_CPU) {
++		if (idx == index)
++			return node;
++		idx++;
++	}
++	return NUMA_NO_NODE;
++}
++EXPORT_SYMBOL_GPL(xenbus_node_for_queue);
++
++int xenbus_setup_ring_node(struct xenbus_device *dev, gfp_t gfp, int node,
++			   void **vaddr, unsigned int nr_pages,
++			   grant_ref_t *grefs)
+ {
+ 	unsigned long ring_size = nr_pages * XEN_PAGE_SIZE;
++	unsigned int order;
++	unsigned long nr_alloc;
++	struct page *page;
+ 	grant_ref_t gref_head;
+ 	unsigned int i;
+ 	void *addr;
+ 	int ret;
+ 
+-	addr = *vaddr = alloc_pages_exact(ring_size, gfp | __GFP_ZERO);
+-	if (!*vaddr) {
++	*vaddr = NULL;
++
++	/*
++	 * Mirror the GFP filtering that alloc_pages_exact() does
++	 * internally: split_page() below requires a non-compound page
++	 * and HIGHMEM is incompatible with the direct virt mapping used
++	 * by the grant code.
++	 */
++	gfp &= ~(__GFP_COMP | __GFP_HIGHMEM);
++
++	order = get_order(ring_size);
++	page = alloc_pages_node(node, gfp | __GFP_ZERO, order);
++	if (!page) {
+ 		ret = -ENOMEM;
+ 		goto err;
+ 	}
+ 
++	/*
++	 * alloc_pages_node returns a single order-N block where only
++	 * the head is refcounted.  split_page makes every subpage
++	 * individually refcounted so free_pages_exact() can release the
++	 * ring page-by-page.  Return any tail pages beyond ring_size to
++	 * the allocator immediately.
++	 */
++	split_page(page, order);
++	nr_alloc = 1UL << order;
++	for (i = DIV_ROUND_UP(ring_size, PAGE_SIZE); i < nr_alloc; i++)
++		__free_page(page + i);
++
++	addr = page_address(page);
++	*vaddr = addr;
++
+ 	ret = gnttab_alloc_grant_references(nr_pages, &gref_head);
+ 	if (ret) {
+ 		xenbus_dev_fatal(dev, ret, "granting access to %u ring pages",
+@@ -508,6 +579,20 @@ int xenbus_setup_ring(struct xenbus_device *dev, gfp_t gfp, void **vaddr,
+ 
+ 	return ret;
+ }
++EXPORT_SYMBOL_GPL(xenbus_setup_ring_node);
++
++/*
++ * xenbus_setup_ring
++ *
++ * Equivalent to xenbus_setup_ring_node() with no node preference; the
++ * pages come from the current CPU's local node by default GFP policy.
++ */
++int xenbus_setup_ring(struct xenbus_device *dev, gfp_t gfp, void **vaddr,
++		      unsigned int nr_pages, grant_ref_t *grefs)
++{
++	return xenbus_setup_ring_node(dev, gfp, NUMA_NO_NODE, vaddr, nr_pages,
++				      grefs);
++}
+ EXPORT_SYMBOL_GPL(xenbus_setup_ring);
+ 
+ /*
+diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
+index 18b902bf79ef..8ce096797b86 100644
+--- a/include/xen/xenbus.h
++++ b/include/xen/xenbus.h
+@@ -216,6 +216,10 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch,
+ 			 const char *pathfmt, ...);
+ 
+ int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
++int xenbus_node_for_queue(unsigned int index);
++int xenbus_setup_ring_node(struct xenbus_device *dev, gfp_t gfp, int node,
++			   void **vaddr, unsigned int nr_pages,
++			   grant_ref_t *grefs);
+ int xenbus_setup_ring(struct xenbus_device *dev, gfp_t gfp, void **vaddr,
+ 		      unsigned int nr_pages, grant_ref_t *grefs);
+ void xenbus_teardown_ring(void **vaddr, unsigned int nr_pages,
+-- 
+2.54.0
+
diff --git a/patches/0008-xen-netfront-place-per-queue-rings-on-per-queue-node.patch b/patches/0008-xen-netfront-place-per-queue-rings-on-per-queue-node.patch
new file mode 100644
index 0000000..7d63b90
--- /dev/null
+++ b/patches/0008-xen-netfront-place-per-queue-rings-on-per-queue-node.patch
@@ -0,0 +1,270 @@
+From 2dd13fab395903f26e7f37eaeacec05c8fdd0233 Mon Sep 17 00:00:00 2001
+From: Steven Noonan <steven@edera.dev>
+Date: Tue, 19 May 2026 19:21:52 -0700
+Subject: [PATCH 8/9] xen-netfront: place per-queue rings on per-queue nodes,
+ with XPS
+
+Today every netfront queue allocates its tx and rx rings from
+xenbus_setup_ring(), which has no node preference and pulls from the
+buddy free list of whichever node the xenbus watch handler is on.
+On a multi-queue device, every queue's rings end up on the same
+guest node.  Combined with vNUMA->host-node mapping that puts each
+guest node on its own host node, this funnels all of dom0's
+per-queue backend kthreads onto a single host node and defeats the
+multi-queue parallelism the dom0 backend is otherwise prepared to
+deliver.
+
+Pick a per-queue node with xenbus_node_for_queue(queue->id) and pass
+it to xenbus_setup_ring_node() for both tx and rx rings.  Same-ring
+locality is preserved by construction (one buddy allocation comes
+from one node); different rings of different queues now land on
+different nodes on multi-vnode guests.  Single-vnode guests
+degenerate to node 0 for every queue, identical to the previous
+behaviour.
+
+Thread the same node through setup_netfront_split() and
+setup_netfront_single() so the per-queue evtchn IRQs are bound with
+bind_evtchn_to_irqhandler_lateeoi_on_node().  The underlying desc is
+then allocated with the right node attribute and irqbalance treats
+each IRQ as NUMA-local rather than floating.
+
+Apply irq_set_affinity_and_hint() to each queue's tx/rx IRQ at
+connect time using the same per-queue node.  NAPI runs in softirq on
+the CPU that took the IRQ; landing IRQ + NAPI + ring on one node
+keeps the receive path NUMA-local.  Sets both actual affinity and
+hint so behaviour is correct on guests without irqbalance; operator
+writes to /proc/irq/N/smp_affinity still win.
+
+Install an XPS map for each queue mapping the node's cpumask to that
+queue's index.  Without this, __netdev_pick_tx falls back to
+hash-based queue selection and a sender on any CPU can land on any
+queue regardless of where its data wants to live; the ring-placement
+work above is then wasted because the actual sender-to-queue
+pairing is random.  With XPS in place, a sender on a CPU in node N
+selects queue N, whose rings are on node N, whose dom0 backend
+kthread is on the host node hosting those rings: an end-to-end
+NUMA-local TX path with no cross-node payload movement up to the
+hypervisor boundary.
+
+In xennet_disconnect_backend(), clear the hint with
+irq_update_affinity_hint(irq, NULL) before each unbind_from_irqhandler
+in both the shared- and split-evtchn paths.  free_irq() warns if the
+hint is still set at teardown (kernel/irq/manage.c:1865); reconnects
+would otherwise WARN.
+
+Operator writes to /sys/class/net/.../xps_cpus continue to win on
+subsequent writes -- this only provides a sensible default.  XPS is
+also conditional: CONFIG_XPS off makes netif_set_xps_queue a stub,
+and an empty cpumask (memory-only NUMA node, possible if unusual on
+guests) skips the install to avoid programming an effectively-unusable
+map.  On reconnect, the new setup_netfront calls overwrite the XPS
+map with fresh values; the netdev-scoped map is freed when the
+netdev unregisters.
+
+Signed-off-by: Steven Noonan <steven@edera.dev>
+---
+ drivers/net/xen-netfront.c | 107 ++++++++++++++++++++++++++++++-------
+ 1 file changed, 88 insertions(+), 19 deletions(-)
+
+diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
+index a11a0e949400..7c18db3de8d4 100644
+--- a/drivers/net/xen-netfront.c
++++ b/drivers/net/xen-netfront.c
+@@ -31,10 +31,12 @@
+ 
+ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+ 
++#include <linux/cpumask.h>
+ #include <linux/module.h>
+ #include <linux/kernel.h>
+ #include <linux/netdevice.h>
+ #include <linux/etherdevice.h>
++#include <linux/numa.h>
+ #include <linux/skbuff.h>
+ #include <linux/ethtool.h>
+ #include <linux/if_ether.h>
+@@ -1824,9 +1826,17 @@ static void xennet_disconnect_backend(struct netfront_info *info)
+ 
+ 		timer_delete_sync(&queue->rx_refill_timer);
+ 
+-		if (queue->tx_irq && (queue->tx_irq == queue->rx_irq))
++		/*
++		 * free_irq() warns if affinity_hint is still set.  Drop the
++		 * hint installed at connect time before tearing the IRQ down.
++		 */
++		if (queue->tx_irq && (queue->tx_irq == queue->rx_irq)) {
++			irq_update_affinity_hint(queue->tx_irq, NULL);
+ 			unbind_from_irqhandler(queue->tx_irq, queue);
++		}
+ 		if (queue->tx_irq && (queue->tx_irq != queue->rx_irq)) {
++			irq_update_affinity_hint(queue->tx_irq, NULL);
++			irq_update_affinity_hint(queue->rx_irq, NULL);
+ 			unbind_from_irqhandler(queue->tx_irq, queue);
+ 			unbind_from_irqhandler(queue->rx_irq, queue);
+ 		}
+@@ -1902,7 +1912,7 @@ static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
+ 	return 0;
+ }
+ 
+-static int setup_netfront_single(struct netfront_queue *queue)
++static int setup_netfront_single(struct netfront_queue *queue, int node)
+ {
+ 	int err;
+ 
+@@ -1910,10 +1920,10 @@ static int setup_netfront_single(struct netfront_queue *queue)
+ 	if (err < 0)
+ 		goto fail;
+ 
+-	err = bind_evtchn_to_irqhandler_lateeoi(queue->tx_evtchn,
+-						xennet_interrupt, 0,
+-						queue->info->netdev->name,
+-						queue);
++	err = bind_evtchn_to_irqhandler_lateeoi_on_node(queue->tx_evtchn,
++							xennet_interrupt, 0,
++							queue->info->netdev->name,
++							queue, node);
+ 	if (err < 0)
+ 		goto bind_fail;
+ 	queue->rx_evtchn = queue->tx_evtchn;
+@@ -1928,7 +1938,7 @@ static int setup_netfront_single(struct netfront_queue *queue)
+ 	return err;
+ }
+ 
+-static int setup_netfront_split(struct netfront_queue *queue)
++static int setup_netfront_split(struct netfront_queue *queue, int node)
+ {
+ 	int err;
+ 
+@@ -1941,18 +1951,20 @@ static int setup_netfront_split(struct netfront_queue *queue)
+ 
+ 	snprintf(queue->tx_irq_name, sizeof(queue->tx_irq_name),
+ 		 "%s-tx", queue->name);
+-	err = bind_evtchn_to_irqhandler_lateeoi(queue->tx_evtchn,
+-						xennet_tx_interrupt, 0,
+-						queue->tx_irq_name, queue);
++	err = bind_evtchn_to_irqhandler_lateeoi_on_node(queue->tx_evtchn,
++							xennet_tx_interrupt, 0,
++							queue->tx_irq_name,
++							queue, node);
+ 	if (err < 0)
+ 		goto bind_tx_fail;
+ 	queue->tx_irq = err;
+ 
+ 	snprintf(queue->rx_irq_name, sizeof(queue->rx_irq_name),
+ 		 "%s-rx", queue->name);
+-	err = bind_evtchn_to_irqhandler_lateeoi(queue->rx_evtchn,
+-						xennet_rx_interrupt, 0,
+-						queue->rx_irq_name, queue);
++	err = bind_evtchn_to_irqhandler_lateeoi_on_node(queue->rx_evtchn,
++							xennet_rx_interrupt, 0,
++							queue->rx_irq_name,
++							queue, node);
+ 	if (err < 0)
+ 		goto bind_rx_fail;
+ 	queue->rx_irq = err;
+@@ -1977,6 +1989,7 @@ static int setup_netfront(struct xenbus_device *dev,
+ {
+ 	struct xen_netif_tx_sring *txs;
+ 	struct xen_netif_rx_sring *rxs;
++	int node;
+ 	int err;
+ 
+ 	queue->tx_ring_ref = INVALID_GRANT_REF;
+@@ -1984,32 +1997,88 @@ static int setup_netfront(struct xenbus_device *dev,
+ 	queue->rx.sring = NULL;
+ 	queue->tx.sring = NULL;
+ 
+-	err = xenbus_setup_ring(dev, GFP_NOIO | __GFP_HIGH, (void **)&txs,
+-				1, &queue->tx_ring_ref);
++	/*
++	 * Distribute queues across guest NUMA nodes by rotating over
++	 * nodes-with-CPUs.  On a single-vnode guest every queue lands
++	 * on node 0 and behaviour matches the legacy default.  On a
++	 * multi-vnode guest, queues spread across nodes and pair up
++	 * naturally with the dom0 backend's per-queue node-affinity
++	 * placement.
++	 */
++	node = xenbus_node_for_queue(queue->id);
++
++	err = xenbus_setup_ring_node(dev, GFP_NOIO | __GFP_HIGH, node,
++				     (void **)&txs, 1, &queue->tx_ring_ref);
+ 	if (err)
+ 		goto fail;
+ 
+ 	XEN_FRONT_RING_INIT(&queue->tx, txs, XEN_PAGE_SIZE);
+ 
+-	err = xenbus_setup_ring(dev, GFP_NOIO | __GFP_HIGH, (void **)&rxs,
+-				1, &queue->rx_ring_ref);
++	err = xenbus_setup_ring_node(dev, GFP_NOIO | __GFP_HIGH, node,
++				     (void **)&rxs, 1, &queue->rx_ring_ref);
+ 	if (err)
+ 		goto fail;
+ 
+ 	XEN_FRONT_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE);
+ 
+ 	if (feature_split_evtchn)
+-		err = setup_netfront_split(queue);
++		err = setup_netfront_split(queue, node);
+ 	/* setup single event channel if
+ 	 *  a) feature-split-event-channels == 0
+ 	 *  b) feature-split-event-channels == 1 but failed to setup
+ 	 */
+ 	if (!feature_split_evtchn || err)
+-		err = setup_netfront_single(queue);
++		err = setup_netfront_single(queue, node);
+ 
+ 	if (err)
+ 		goto fail;
+ 
++	/*
++	 * Route each per-queue evtchn IRQ toward the same node the ring
++	 * lives on.  NAPI runs in softirq on the CPU that took the IRQ;
++	 * landing IRQ + NAPI + ring on one node keeps the receive path
++	 * NUMA-local.  Sets both actual affinity and hint so behaviour
++	 * is correct on guests without irqbalance.  Operator writes to
++	 * /proc/irq/N/smp_affinity continue to win.
++	 */
++	if (node != NUMA_NO_NODE) {
++		const struct cpumask *mask = cpumask_of_node(node);
++
++		if (!cpumask_empty(mask)) {
++			irq_set_affinity_and_hint(queue->tx_irq, mask);
++			if (queue->rx_irq != queue->tx_irq)
++				irq_set_affinity_and_hint(queue->rx_irq, mask);
++		}
++	}
++
++	/*
++	 * Steer senders toward this queue based on the same node the
++	 * rings live on.  __netdev_pick_tx consults the XPS map first;
++	 * a sender on a CPU in `node` will pick `queue->id`, whose
++	 * rings are on `node`, and dom0's matching backend kthread is
++	 * pinned to the host node that hosts those rings.  Without
++	 * this, the kernel's default hash-based queue selection lets a
++	 * sender on any node land on any queue, defeating the per-queue
++	 * NUMA-locality story.  An operator write to xps_cpus
++	 * overrides on subsequent writes.
++	 *
++	 * netif_set_xps_queue silently returns 0 if CONFIG_XPS is off;
++	 * an empty cpumask (e.g. memory-only NUMA node) is skipped to
++	 * avoid programming an effectively-unusable map.
++	 */
++	if (node != NUMA_NO_NODE) {
++		const struct cpumask *mask = cpumask_of_node(node);
++
++		if (!cpumask_empty(mask)) {
++			int xps_err = netif_set_xps_queue(queue->info->netdev,
++							  mask, queue->id);
++			if (xps_err)
++				netdev_warn(queue->info->netdev,
++					    "XPS setup failed for queue %u: %d\n",
++					    queue->id, xps_err);
++		}
++	}
++
+ 	return 0;
+ 
+  fail:
+-- 
+2.54.0
+
diff --git a/patches/0009-xen-blkfront-place-per-ring-buffers-on-per-hctx-node.patch b/patches/0009-xen-blkfront-place-per-ring-buffers-on-per-hctx-node.patch
new file mode 100644
index 0000000..59637c1
--- /dev/null
+++ b/patches/0009-xen-blkfront-place-per-ring-buffers-on-per-hctx-node.patch
@@ -0,0 +1,158 @@
+From 9f2eced2a6cb89bf0f8fcfc4889389880b595e39 Mon Sep 17 00:00:00 2001
+From: Steven Noonan <steven@edera.dev>
+Date: Tue, 19 May 2026 19:25:21 -0700
+Subject: [PATCH 9/9] xen-blkfront: place per-ring buffers on per-hctx nodes
+
+Today every blkfront ring (one per hctx in multi-queue mode) is
+allocated from xenbus_setup_ring() with no node preference, so all
+of a multi-queue blkfront's rings end up on whichever node the
+xenbus watch handler runs on.  Combined with vNUMA->host-node
+mapping, this funnels every dom0 backend xenblkd kthread onto a
+single host node and defeats multi-queue parallelism.
+
+Pick a per-ring node with xenbus_node_for_queue(ring_idx) and pass it
+to xenbus_setup_ring_node().  Different rings now land on different
+guest nodes on multi-vnode guests; same-ring locality is preserved
+by the underlying buddy allocation.
+
+The ring's index is recovered from its byte offset within
+info->rinfo because struct blkfront_ring_info has a flex array
+trailer and sizeof() is not the stride; the same arithmetic appears
+in get_rinfo() and for_each_rinfo().
+
+Bind the ring's evtchn IRQ on the ring's node using
+bind_evtchn_to_irqhandler_lateeoi_on_node().  The desc is then
+allocated with the right node attribute and irqbalance treats the
+IRQ as NUMA-local.
+
+Apply irq_set_affinity_and_hint() to the per-ring IRQ using the same
+node.  The IRQ fires on a CPU in the ring's node; blk-mq then routes
+the completion onward to the original submitter from
+blk_mq_complete_request(), so the wake-up edge stays on the right node
+before that handoff.  This sets both the actual affinity and the hint;
+operator writes to /proc/irq/N/smp_affinity continue to win.
+
+No XPS-equivalent steering is needed on the block side.  blk-mq's
+existing hctx-to-CPU map already routes submissions: a process
+running on CPU C submits via the hctx blk-mq mapped to C, and our
+ring node for that hctx is derived from the same node-rotation
+helper.  The submitter is on the same node as the ring by
+construction.
+
+In blkif_free_ring(), clear the hint with
+irq_update_affinity_hint(rinfo->irq, NULL) before
+unbind_from_irqhandler().  free_irq() warns if the hint is still set
+at teardown (kernel/irq/manage.c:1865); reconnects would otherwise
+WARN.
+
+Single-vnode guests are unaffected: every ring's node resolves to
+node 0 and behaviour matches the previous default.
+
+Signed-off-by: Steven Noonan <steven@edera.dev>
+---
+ drivers/block/xen-blkfront.c | 53 ++++++++++++++++++++++++++++++++----
+ 1 file changed, 48 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
+index 04fc6b552c04..8414dc737157 100644
+--- a/drivers/block/xen-blkfront.c
++++ b/drivers/block/xen-blkfront.c
+@@ -35,12 +35,14 @@
+  * IN THE SOFTWARE.
+  */
+ 
++#include <linux/cpumask.h>
+ #include <linux/interrupt.h>
+ #include <linux/blkdev.h>
+ #include <linux/blk-mq.h>
+ #include <linux/hdreg.h>
+ #include <linux/cdrom.h>
+ #include <linux/module.h>
++#include <linux/numa.h>
+ #include <linux/slab.h>
+ #include <linux/major.h>
+ #include <linux/mutex.h>
+@@ -1293,8 +1295,14 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo)
+ 	xenbus_teardown_ring((void **)&rinfo->ring.sring, info->nr_ring_pages,
+ 			     rinfo->ring_ref);
+ 
+-	if (rinfo->irq)
++	if (rinfo->irq) {
++		/*
++		 * free_irq() warns if affinity_hint is still set.  Drop the
++		 * hint installed at setup time before tearing the IRQ down.
++		 */
++		irq_update_affinity_hint(rinfo->irq, NULL);
+ 		unbind_from_irqhandler(rinfo->irq, rinfo);
++	}
+ 	rinfo->evtchn = rinfo->irq = 0;
+ }
+ 
+@@ -1684,9 +1692,29 @@ static int setup_blkring(struct xenbus_device *dev,
+ 	int err;
+ 	struct blkfront_info *info = rinfo->dev_info;
+ 	unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE;
++	unsigned int ring_idx;
++	int node;
++
++	/*
++	 * Recover the ring index from its slot in info->rinfo, inverting
++	 * the stride arithmetic of get_rinfo() and for_each_rinfo() (the
++	 * struct has a flex array, so info->rinfo_size is the stride).
++	 */
++	ring_idx = ((unsigned long)rinfo - (unsigned long)info->rinfo) /
++		   info->rinfo_size;
++
++	/*
++	 * Distribute rings across guest NUMA nodes by rotating over
++	 * nodes-with-CPUs.  blk-mq's default hctx-to-CPU map is also
++	 * NUMA-balanced (blk_mq_map_queues uses NUMA-aware distribution
++	 * when topology is present), so a submitter on a node-N CPU
++	 * lands on the hctx whose ring is on node N.  No XPS-equivalent
++	 * steering needed on the block side -- blk-mq already does it.
++	 */
++	node = xenbus_node_for_queue(ring_idx);
+ 
+-	err = xenbus_setup_ring(dev, GFP_NOIO, (void **)&sring,
+-				info->nr_ring_pages, rinfo->ring_ref);
++	err = xenbus_setup_ring_node(dev, GFP_NOIO, node, (void **)&sring,
++				     info->nr_ring_pages, rinfo->ring_ref);
+ 	if (err)
+ 		goto fail;
+ 
+@@ -1696,8 +1724,9 @@ static int setup_blkring(struct xenbus_device *dev,
+ 	if (err)
+ 		goto fail;
+ 
+-	err = bind_evtchn_to_irqhandler_lateeoi(rinfo->evtchn, blkif_interrupt,
+-						0, "blkif", rinfo);
++	err = bind_evtchn_to_irqhandler_lateeoi_on_node(rinfo->evtchn,
++							blkif_interrupt, 0,
++							"blkif", rinfo, node);
+ 	if (err <= 0) {
+ 		xenbus_dev_fatal(dev, err,
+ 				 "bind_evtchn_to_irqhandler failed");
+@@ -1705,6 +1734,20 @@ static int setup_blkring(struct xenbus_device *dev,
+ 	}
+ 	rinfo->irq = err;
+ 
++	/*
++	 * Route the ring's evtchn IRQ toward the same node the ring
++	 * lives on.  blk-mq may redirect the completion to the
++	 * submitting CPU from blk_mq_complete_request(); firing the IRQ
++	 * on the ring's node keeps the wake-up local until that handoff.
++	 * Sets both the actual affinity and the hint.
++	 */
++	if (node != NUMA_NO_NODE) {
++		const struct cpumask *mask = cpumask_of_node(node);
++
++		if (!cpumask_empty(mask))
++			irq_set_affinity_and_hint(rinfo->irq, mask);
++	}
++
+ 	return 0;
+ fail:
+ 	blkif_free(info, 0);
+-- 
+2.54.0
+