diff --git a/config.yaml b/config.yaml index 6d275ca..2e639ed 100644 --- a/config.yaml +++ b/config.yaml @@ -93,6 +93,17 @@ patches: lower: '6.17' - patch: 0003-x86-amd_node-fix-null-pointer-dereference-if-amd_smn.patch lower: '6.17' +- patches: + - 0001-xen-events-add-_on_node-variants-of-the-lateeoi-bind.patch + - 0002-xen-xenbus-expose-host-NUMA-node-of-a-mapped-ring.patch + - 0003-xen-netback-place-per-queue-kthreads-and-IRQs-near-t.patch + - 0004-xen-blkback-place-per-ring-kthread-and-IRQ-near-the-.patch + - 0005-xen-make-xen_alloc_unpopulated_pages-NUMA-aware.patch + - 0006-xen-xenbus-collapse-xenbus_ring_host_node-to-a-page_.patch + - 0007-xen-xenbus-add-xenbus_setup_ring_node-for-per-node-r.patch + - 0008-xen-netfront-place-per-queue-rings-on-per-queue-node.patch + - 0009-xen-blkfront-place-per-ring-buffers-on-per-hctx-node.patch + lower: '6.18' images: - target: kernelsrc name: kernel-src diff --git a/configs/x86_64/host.config b/configs/x86_64/host.config index 3d28f1d..ce59dfc 100644 --- a/configs/x86_64/host.config +++ b/configs/x86_64/host.config @@ -4912,6 +4912,7 @@ CONFIG_XEN_BALLOON=y CONFIG_XEN_SCRUB_PAGES_DEFAULT=y CONFIG_XEN_DEV_EVTCHN=m CONFIG_XEN_BACKEND=y +CONFIG_XEN_BACKEND_NUMA_AFFINITY=y CONFIG_XENFS=y CONFIG_XEN_COMPAT_XENFS=y CONFIG_XEN_SYS_HYPERVISOR=y @@ -5991,3 +5992,11 @@ CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO=y CONFIG_MHP_MEMMAP_ON_MEMORY=y CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y CONFIG_XEN_MEMORY_HOTPLUG_LIMIT=512 + +# Enable device memory hotplug support (dependency for Xen PV driver backend +# NUMA binding) +CONFIG_ZONE_DEVICE=y + +# NUMA balancing support +CONFIG_NUMA_BALANCING=y +CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y diff --git a/configs/x86_64/zone.config b/configs/x86_64/zone.config index 7880eed..41606d5 100644 --- a/configs/x86_64/zone.config +++ b/configs/x86_64/zone.config @@ -1726,6 +1726,7 @@ CONFIG_XEN_MEMORY_HOTPLUG_LIMIT=512 CONFIG_XEN_SCRUB_PAGES_DEFAULT=y CONFIG_XEN_DEV_EVTCHN=y CONFIG_XEN_BACKEND=y 
+CONFIG_XEN_BACKEND_NUMA_AFFINITY=y CONFIG_XENFS=y CONFIG_XEN_COMPAT_XENFS=y CONFIG_XEN_SYS_HYPERVISOR=y @@ -2391,3 +2392,11 @@ CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_BTF=y CONFIG_DEBUG_INFO_BTF_MODULES=y CONFIG_DEBUG_INFO_COMPRESSED_ZSTD=y + +# Enable device memory hotplug support (dependency for Xen PV driver backend +# NUMA binding) +CONFIG_ZONE_DEVICE=y + +# NUMA balancing support +CONFIG_NUMA_BALANCING=y +CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y diff --git a/patches/0001-xen-events-add-_on_node-variants-of-the-lateeoi-bind.patch b/patches/0001-xen-events-add-_on_node-variants-of-the-lateeoi-bind.patch new file mode 100644 index 0000000..e2b995e --- /dev/null +++ b/patches/0001-xen-events-add-_on_node-variants-of-the-lateeoi-bind.patch @@ -0,0 +1,298 @@ +From bd2dbbb1f3a05a5d77d18548d41cdb25c0b81912 Mon Sep 17 00:00:00 2001 +From: Steven Noonan +Date: Tue, 19 May 2026 23:00:20 -0700 +Subject: [PATCH 1/9] xen/events: add _on_node variants of the lateeoi bind + helpers + +xen_allocate_irq_dynamic() unconditionally calls +irq_alloc_desc_from(0, -1), so every Xen evtchn IRQ descriptor is +allocated with NUMA_NO_NODE. This means /proc/irq/N/node always +reads -1 even when the caller (netback, blkback, netfront, blkfront) +already knows the right node for the IRQ. + +irqbalance treats node=-1 as "no NUMA preference" and distributes +the IRQ across all CPUs for load balance, ignoring affinity_hint. +With irqbalance running, the per-queue NUMA placement we install via +irq_set_affinity_and_hint() gets overwritten almost immediately. + +Add _on_node variants of the four bind helpers Xen front/back ends +use: + + bind_evtchn_to_irq_lateeoi_on_node(evtchn, node) + bind_evtchn_to_irqhandler_lateeoi_on_node(..., node) + bind_interdomain_evtchn_to_irq_lateeoi_on_node(..., node) + bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node(..., node) + +Each passes the caller's node through to the internal chip helpers +and on to a new xen_allocate_irq_dynamic_node(node). 
The existing +public functions become thin wrappers passing NUMA_NO_NODE, so every +caller that hasn't been updated keeps today's behaviour. + +After this change, /proc/irq/N/node reflects the node the caller +asked for, and irqbalance respects affinity_hint as a NUMA-local +subset rather than treating the IRQ as floating. + +Signed-off-by: Steven Noonan +--- + drivers/xen/events/events_base.c | 86 ++++++++++++++++++++++++++------ + include/xen/events.h | 15 ++++++ + 2 files changed, 85 insertions(+), 16 deletions(-) + +diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c +index 9478fae014e5..6368ff561472 100644 +--- a/drivers/xen/events/events_base.c ++++ b/drivers/xen/events/events_base.c +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -729,9 +730,9 @@ static struct irq_info *xen_irq_init(unsigned int irq) + return info; + } + +-static struct irq_info *xen_allocate_irq_dynamic(void) ++static struct irq_info *xen_allocate_irq_dynamic_node(int node) + { +- int irq = irq_alloc_desc_from(0, -1); ++ int irq = irq_alloc_desc_from(0, node); + struct irq_info *info = NULL; + + if (irq >= 0) { +@@ -743,6 +744,11 @@ static struct irq_info *xen_allocate_irq_dynamic(void) + return info; + } + ++static struct irq_info *xen_allocate_irq_dynamic(void) ++{ ++ return xen_allocate_irq_dynamic_node(NUMA_NO_NODE); ++} ++ + static struct irq_info *xen_allocate_irq_gsi(unsigned int gsi) + { + int irq; +@@ -1184,7 +1190,8 @@ int xen_pirq_from_irq(unsigned irq) + EXPORT_SYMBOL_GPL(xen_pirq_from_irq); + + static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, +- struct xenbus_device *dev, bool shared) ++ struct xenbus_device *dev, bool shared, ++ int node) + { + int ret = -ENOMEM; + struct irq_info *info; +@@ -1197,7 +1204,7 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, + info = evtchn_to_info(evtchn); + + if (!info) { +- info = 
xen_allocate_irq_dynamic(); ++ info = xen_allocate_irq_dynamic_node(node); + if (!info) + goto out; + +@@ -1232,16 +1239,25 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, + + int bind_evtchn_to_irq(evtchn_port_t evtchn) + { +- return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip, NULL, false); ++ return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip, NULL, false, ++ NUMA_NO_NODE); + } + EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); + + int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn) + { +- return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip, NULL, false); ++ return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip, NULL, false, ++ NUMA_NO_NODE); + } + EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi); + ++int bind_evtchn_to_irq_lateeoi_on_node(evtchn_port_t evtchn, int node) ++{ ++ return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip, NULL, false, ++ node); ++} ++EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi_on_node); ++ + static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) + { + struct evtchn_bind_ipi bind_ipi; +@@ -1291,7 +1307,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) + static int bind_interdomain_evtchn_to_irq_chip(struct xenbus_device *dev, + evtchn_port_t remote_port, + struct irq_chip *chip, +- bool shared) ++ bool shared, int node) + { + struct evtchn_bind_interdomain bind_interdomain; + int err; +@@ -1303,17 +1319,28 @@ static int bind_interdomain_evtchn_to_irq_chip(struct xenbus_device *dev, + &bind_interdomain); + + return err ? 
: bind_evtchn_to_irq_chip(bind_interdomain.local_port, +- chip, dev, shared); ++ chip, dev, shared, node); + } + + int bind_interdomain_evtchn_to_irq_lateeoi(struct xenbus_device *dev, + evtchn_port_t remote_port) + { + return bind_interdomain_evtchn_to_irq_chip(dev, remote_port, +- &xen_lateeoi_chip, false); ++ &xen_lateeoi_chip, false, ++ NUMA_NO_NODE); + } + EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi); + ++int bind_interdomain_evtchn_to_irq_lateeoi_on_node(struct xenbus_device *dev, ++ evtchn_port_t remote_port, ++ int node) ++{ ++ return bind_interdomain_evtchn_to_irq_chip(dev, remote_port, ++ &xen_lateeoi_chip, false, ++ node); ++} ++EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi_on_node); ++ + static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn, + bool percpu) + { +@@ -1432,12 +1459,12 @@ static int bind_evtchn_to_irqhandler_chip(evtchn_port_t evtchn, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id, +- struct irq_chip *chip) ++ struct irq_chip *chip, int node) + { + int irq, retval; + + irq = bind_evtchn_to_irq_chip(evtchn, chip, NULL, +- irqflags & IRQF_SHARED); ++ irqflags & IRQF_SHARED, node); + if (irq < 0) + return irq; + retval = request_irq(irq, handler, irqflags, devname, dev_id); +@@ -1456,7 +1483,8 @@ int bind_evtchn_to_irqhandler(evtchn_port_t evtchn, + { + return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags, + devname, dev_id, +- &xen_dynamic_chip); ++ &xen_dynamic_chip, ++ NUMA_NO_NODE); + } + EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); + +@@ -1467,19 +1495,34 @@ int bind_evtchn_to_irqhandler_lateeoi(evtchn_port_t evtchn, + { + return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags, + devname, dev_id, +- &xen_lateeoi_chip); ++ &xen_lateeoi_chip, ++ NUMA_NO_NODE); + } + EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler_lateeoi); + ++int bind_evtchn_to_irqhandler_lateeoi_on_node(evtchn_port_t evtchn, ++ irq_handler_t handler, ++ unsigned long 
irqflags, ++ const char *devname, ++ void *dev_id, int node) ++{ ++ return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags, ++ devname, dev_id, ++ &xen_lateeoi_chip, node); ++} ++EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler_lateeoi_on_node); ++ + static int bind_interdomain_evtchn_to_irqhandler_chip( + struct xenbus_device *dev, evtchn_port_t remote_port, + irq_handler_t handler, unsigned long irqflags, +- const char *devname, void *dev_id, struct irq_chip *chip) ++ const char *devname, void *dev_id, struct irq_chip *chip, ++ int node) + { + int irq, retval; + + irq = bind_interdomain_evtchn_to_irq_chip(dev, remote_port, chip, +- irqflags & IRQF_SHARED); ++ irqflags & IRQF_SHARED, ++ node); + if (irq < 0) + return irq; + +@@ -1501,10 +1544,21 @@ int bind_interdomain_evtchn_to_irqhandler_lateeoi(struct xenbus_device *dev, + { + return bind_interdomain_evtchn_to_irqhandler_chip(dev, + remote_port, handler, irqflags, devname, +- dev_id, &xen_lateeoi_chip); ++ dev_id, &xen_lateeoi_chip, NUMA_NO_NODE); + } + EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler_lateeoi); + ++int bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node( ++ struct xenbus_device *dev, evtchn_port_t remote_port, ++ irq_handler_t handler, unsigned long irqflags, ++ const char *devname, void *dev_id, int node) ++{ ++ return bind_interdomain_evtchn_to_irqhandler_chip(dev, ++ remote_port, handler, irqflags, devname, ++ dev_id, &xen_lateeoi_chip, node); ++} ++EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node); ++ + int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, const char *devname, void *dev_id) +diff --git a/include/xen/events.h b/include/xen/events.h +index de5da58a0205..1abc068557b4 100644 +--- a/include/xen/events.h ++++ b/include/xen/events.h +@@ -18,6 +18,7 @@ unsigned xen_evtchn_nr_channels(void); + + int bind_evtchn_to_irq(evtchn_port_t evtchn); + int bind_evtchn_to_irq_lateeoi(evtchn_port_t 
evtchn); ++int bind_evtchn_to_irq_lateeoi_on_node(evtchn_port_t evtchn, int node); + int bind_evtchn_to_irqhandler(evtchn_port_t evtchn, + irq_handler_t handler, + unsigned long irqflags, const char *devname, +@@ -26,6 +27,10 @@ int bind_evtchn_to_irqhandler_lateeoi(evtchn_port_t evtchn, + irq_handler_t handler, + unsigned long irqflags, const char *devname, + void *dev_id); ++int bind_evtchn_to_irqhandler_lateeoi_on_node(evtchn_port_t evtchn, ++ irq_handler_t handler, ++ unsigned long irqflags, const char *devname, ++ void *dev_id, int node); + int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu); + int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irq_handler_t handler, +@@ -39,12 +44,22 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, + void *dev_id); + int bind_interdomain_evtchn_to_irq_lateeoi(struct xenbus_device *dev, + evtchn_port_t remote_port); ++int bind_interdomain_evtchn_to_irq_lateeoi_on_node(struct xenbus_device *dev, ++ evtchn_port_t remote_port, ++ int node); + int bind_interdomain_evtchn_to_irqhandler_lateeoi(struct xenbus_device *dev, + evtchn_port_t remote_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id); ++int bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node( ++ struct xenbus_device *dev, ++ evtchn_port_t remote_port, ++ irq_handler_t handler, ++ unsigned long irqflags, ++ const char *devname, ++ void *dev_id, int node); + + /* + * Common unbind function for all event sources. Takes IRQ to unbind from. 
+-- +2.54.0 + diff --git a/patches/0002-xen-xenbus-expose-host-NUMA-node-of-a-mapped-ring.patch b/patches/0002-xen-xenbus-expose-host-NUMA-node-of-a-mapped-ring.patch new file mode 100644 index 0000000..7482925 --- /dev/null +++ b/patches/0002-xen-xenbus-expose-host-NUMA-node-of-a-mapped-ring.patch @@ -0,0 +1,348 @@ +From 0cc05d6a3795ccb1f832afde9920c74c12328368 Mon Sep 17 00:00:00 2001 +From: Steven Noonan +Date: Tue, 19 May 2026 18:35:49 -0700 +Subject: [PATCH 2/9] xen/xenbus: expose host NUMA node of a mapped ring + +Xenbus backends in PVH dom0 today have no visibility into which host +NUMA node the foreign frame backing a grant-mapped ring lives on. +xen_alloc_unpopulated_pages() registers its pool with NUMA_NO_NODE, +so page_to_nid() of any grant-mapped page reports NUMA_NO_NODE and +backends have nothing to drive kthread or IRQ placement off of. The +result is that every backend kthread for every queue piles onto one +host node regardless of the guest's vNUMA layout. + +Add xenbus_ring_host_node(dev, vaddr) returning the Linux node id of +the host node hosting the first ring page, or NUMA_NO_NODE when the +information is unavailable. The value is resolved once at map time +using the new XENMEM_get_mfn_pxms hypercall and cached on the +xenbus_map_node, so the public helper is a cheap list lookup. + +Three NUMA identifier namespaces are involved in this code path: host +PXM (firmware), Xen-internal nid (assigned in SRAT scan order), and +dom0 Linux node id. The hypercall returns host PXM, matching what +dom0's own SRAT already uses; pxm_to_node() converts to a Linux node +id, which is what backends will feed to cpumask_of_node() and +kthread_create_on_node(). Keeping the Xen-side ABI in PXM-space lets +callers translate with one standard lookup instead of maintaining +their own Xen-nid -> Linux-node table. + +Older or non-Edera hypervisors lack the new hypercall. 
The first +call there returns -ENOSYS, which latches a global "unsupported" +flag; every subsequent xenbus_ring_host_node() returns NUMA_NO_NODE +without issuing further hypercalls. Backend patches that key off the +helper see NUMA_NO_NODE and silently fall back to today's +NUMA-oblivious behaviour. This makes the kernel-side change safe to +ship without a lockstep hypervisor update. + +Gated by CONFIG_XEN_BACKEND_NUMA_AFFINITY, which depends on +XEN_BACKEND, NUMA, ACPI_NUMA, and XEN_UNPOPULATED_ALLOC and defaults +to y. XEN_UNPOPULATED_ALLOC is a hard dependency rather than a soft +one because the per-ring placement work the rest of this series +performs is only meaningful when grant-map placeholders come from the +per-node pool registered via memremap_pages(pgmap, node). Without +XEN_UNPOPULATED_ALLOC, placeholders come from the generic balloon +allocator and page_to_nid() reflects only where dom0's free RAM +happened to be, not the host node of the foreign frame. Per-ring +NUMA placement under that configuration would commit to wrong nodes +confidently rather than silently no-op. Disabling the option +compiles the query out and the public helper becomes a stub returning +NUMA_NO_NODE. + +PV dom0 is not a NUMA-affinity target: the PV map path explicitly +sets host_node = NUMA_NO_NODE and the helper handles the +pv.area->addr lookup path symmetrically with hvm.addr, returning +NUMA_NO_NODE in both cases when no match is found. + +The foreign MFN is sourced from dev_bus_addr in the gnttab map +result. Xen sets that field unconditionally for host_map operations +(see grant_table.c in the hypervisor), so it is reliable in PVH dom0 +without IOMMU isolation. Sampling only the first ring page's MFN is +sufficient for the common case where a ring is contiguous on one +host node; multi-node huge-page grants are a future concern. 
+ +Signed-off-by: Steven Noonan +--- + drivers/xen/Kconfig | 25 +++++++ + drivers/xen/xenbus/xenbus_client.c | 111 +++++++++++++++++++++++++++++ + include/xen/interface/memory.h | 26 +++++++ + include/xen/xenbus.h | 13 ++++ + 4 files changed, 175 insertions(+) + +diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig +index f9a35ed266ec..147e6acb231b 100644 +--- a/drivers/xen/Kconfig ++++ b/drivers/xen/Kconfig +@@ -96,6 +96,31 @@ config XEN_BACKEND + Support for backend device drivers that provide I/O services + to other virtual machines. + ++config XEN_BACKEND_NUMA_AFFINITY ++ bool "NUMA affinity for Xen backend drivers" ++ depends on XEN_BACKEND && NUMA && ACPI_NUMA && XEN_UNPOPULATED_ALLOC ++ default y ++ help ++ Allow Xen backend drivers (netback, blkback, gntdev consumers) ++ to discover the host NUMA node that hosts a grant-mapped ring ++ page, and to place their service threads and IRQs on that node. ++ ++ XEN_UNPOPULATED_ALLOC provides the per-node placeholder-page pool ++ the relocation logic in xenbus_map_ring_valloc() draws from. ++ Without it, placeholders come from the generic balloon allocator, ++ whose page_to_nid() reflects only where dom0's free RAM happened ++ to be -- not the host node of the foreign frame the placeholder ++ will end up backing. In that configuration the per-ring ++ placement decisions would be confidently wrong rather than just ++ absent, so the Kconfig hard-depends on XEN_UNPOPULATED_ALLOC ++ rather than silently degrading. ++ ++ Requires hypervisor support for the XENMEM_get_mfn_pxms ++ hypercall. Without that support the feature is silently a ++ no-op, equivalent to NUMA-oblivious behaviour. ++ ++ If unsure, say Y. 
++ + config XENFS + tristate "Xen filesystem" + select XEN_PRIVCMD +diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c +index 2dc874fb5506..8e0695ba39a3 100644 +--- a/drivers/xen/xenbus/xenbus_client.c ++++ b/drivers/xen/xenbus/xenbus_client.c +@@ -31,15 +31,18 @@ + */ + + #include ++#include + #include + #include + #include + #include + #include ++#include + #include + #include + #include + #include ++#include + #include + #include + #include +@@ -47,6 +50,10 @@ + #include + #include + ++#ifdef CONFIG_XEN_BACKEND_NUMA_AFFINITY ++#include ++#endif ++ + #include "xenbus.h" + + #define XENBUS_PAGES(_grants) (DIV_ROUND_UP(_grants, XEN_PFN_PER_PAGE)) +@@ -67,6 +74,7 @@ struct xenbus_map_node { + }; + grant_handle_t handles[XENBUS_MAX_RING_GRANTS]; + unsigned int nr_handles; ++ int host_node; /* Linux node id of foreign frame, or NUMA_NO_NODE */ + }; + + struct map_ring_valloc { +@@ -85,6 +93,72 @@ struct map_ring_valloc { + static DEFINE_SPINLOCK(xenbus_valloc_lock); + static LIST_HEAD(xenbus_valloc_pages); + ++#ifdef CONFIG_XEN_BACKEND_NUMA_AFFINITY ++/* ++ * Tri-state cache for XENMEM_get_mfn_pxms availability. -ENOSYS from ++ * the first attempt latches "unsupported", short-circuiting future ++ * calls. Any positive ACK (including the legitimate XEN_INVALID_NUMA_ID ++ * answer for an MFN Xen does not know about) latches "supported". ++ * ++ * Lock-free: at most one transition each direction, and "unsupported" ++ * is a stable terminal state once entered. A racing reader might ++ * issue one redundant hypercall before observing the cached state, ++ * which is harmless. ++ */ ++#define XEN_MFN_PXM_UNKNOWN 0 ++#define XEN_MFN_PXM_SUPPORTED 1 ++#define XEN_MFN_PXM_UNSUPPORTED 2 ++ ++static int xen_mfn_pxm_state = XEN_MFN_PXM_UNKNOWN; ++ ++/* ++ * Resolve one foreign MFN to a Linux node id. Returns NUMA_NO_NODE ++ * for any failure mode: hypercall unsupported, MFN unknown to Xen, ++ * PXM not registered with the dom0 ACPI namespace. 
++ * ++ * Three NUMA identifier namespaces are involved here. Xen returns ++ * host PXM (firmware-supplied). pxm_to_node() translates to a Linux ++ * dom0 node id. Callers then use the result against Linux helpers ++ * like cpumask_of_node() and kthread_create_on_node(). ++ */ ++static int xenbus_query_mfn_node(unsigned long mfn) ++{ ++ struct xen_get_mfn_pxms req; ++ xen_pfn_t mfn_arg = mfn; ++ uint32_t pxm = XEN_INVALID_NUMA_ID; ++ int rc; ++ ++ if (READ_ONCE(xen_mfn_pxm_state) == XEN_MFN_PXM_UNSUPPORTED) ++ return NUMA_NO_NODE; ++ ++ memset(&req, 0, sizeof(req)); ++ set_xen_guest_handle(req.mfns, &mfn_arg); ++ set_xen_guest_handle(req.pxms, &pxm); ++ req.nr_mfns = 1; ++ ++ rc = HYPERVISOR_memory_op(XENMEM_get_mfn_pxms, &req); ++ if (rc < 0) { ++ if (rc == -ENOSYS) { ++ WRITE_ONCE(xen_mfn_pxm_state, XEN_MFN_PXM_UNSUPPORTED); ++ pr_info_once("xenbus: hypervisor lacks XENMEM_get_mfn_pxms, backend NUMA affinity disabled\n"); ++ } ++ return NUMA_NO_NODE; ++ } ++ ++ WRITE_ONCE(xen_mfn_pxm_state, XEN_MFN_PXM_SUPPORTED); ++ ++ if (pxm == XEN_INVALID_NUMA_ID) ++ return NUMA_NO_NODE; ++ ++ return pxm_to_node(pxm); ++} ++#else ++static int xenbus_query_mfn_node(unsigned long mfn) ++{ ++ return NUMA_NO_NODE; ++} ++#endif /* CONFIG_XEN_BACKEND_NUMA_AFFINITY */ ++ + struct xenbus_ring_ops { + int (*map)(struct xenbus_device *dev, struct map_ring_valloc *info, + grant_ref_t *gnt_refs, unsigned int nr_grefs, +@@ -678,6 +752,8 @@ static int xenbus_map_ring_hvm(struct xenbus_device *dev, + bool leaked = false; + unsigned int nr_pages = XENBUS_PAGES(nr_grefs); + ++ node->host_node = NUMA_NO_NODE; ++ + err = xen_alloc_unpopulated_pages(nr_pages, node->hvm.pages); + if (err) + goto out_err; +@@ -693,6 +769,17 @@ static int xenbus_map_ring_hvm(struct xenbus_device *dev, + if (err) + goto out_free_ballooned_pages; + ++ /* ++ * Xen unconditionally fills dev_bus_addr with the foreign frame's ++ * machine address on a successful host_map (see grant_table.c in ++ * the hypervisor). 
Pick up the first ring page's MFN and resolve ++ * it now; the result is cached on the xenbus_map_node for later ++ * lookup. xen_pfn_t counts 4 KiB Xen frames, hence XEN_PFN_DOWN. ++ */ ++ if (nr_grefs > 0) ++ node->host_node = xenbus_query_mfn_node( ++ XEN_PFN_DOWN(info->map[0].dev_bus_addr)); ++ + addr = vmap(node->hvm.pages, nr_pages, VM_MAP | VM_IOREMAP, + PAGE_KERNEL); + if (!addr) { +@@ -743,6 +830,27 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr) + } + EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); + ++int xenbus_ring_host_node(struct xenbus_device *dev, void *vaddr) ++{ ++ struct xenbus_map_node *node; ++ int node_id = NUMA_NO_NODE; ++ ++ spin_lock(&xenbus_valloc_lock); ++ list_for_each_entry(node, &xenbus_valloc_pages, next) { ++ void *addr = xen_pv_domain() ? node->pv.area->addr ++ : node->hvm.addr; ++ ++ if (addr == vaddr) { ++ node_id = node->host_node; ++ break; ++ } ++ } ++ spin_unlock(&xenbus_valloc_lock); ++ ++ return node_id; ++} ++EXPORT_SYMBOL_GPL(xenbus_ring_host_node); ++ + #ifdef CONFIG_XEN_PV + static int map_ring_apply(pte_t *pte, unsigned long addr, void *data) + { +@@ -763,6 +871,9 @@ static int xenbus_map_ring_pv(struct xenbus_device *dev, + bool leaked = false; + int err = -ENOMEM; + ++ /* PV dom0 is not a NUMA-affinity target; record no host node. */ ++ node->host_node = NUMA_NO_NODE; ++ + area = get_vm_area(XEN_PAGE_SIZE * nr_grefs, VM_IOREMAP); + if (!area) + return -ENOMEM; +diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h +index 1a371a825c55..1998a12a9465 100644 +--- a/include/xen/interface/memory.h ++++ b/include/xen/interface/memory.h +@@ -325,4 +325,30 @@ struct xen_mem_acquire_resource { + }; + DEFINE_GUEST_HANDLE_STRUCT(xen_mem_acquire_resource); + ++/* ++ * XENMEM_get_mfn_pxms: resolve a batch of host MFNs to their firmware ++ * proximity-domain identifiers (host PXM on x86 ACPI). 
++ * ++ * Returned values are in the host PXM namespace (the same value space ++ * dom0's own SRAT uses), not Xen's internal node id. Callers convert ++ * to a Linux node id with pxm_to_node(). Slots Xen has no node info ++ * for receive XEN_INVALID_NUMA_ID rather than failing the whole batch. ++ * ++ * Restricted to the hardware domain. On hypervisors that do not ++ * provide this op (older or non-Edera Xen, or builds without ++ * CONFIG_NUMA), the hypercall returns -ENOSYS; callers treat that as ++ * "feature unavailable" and fall back to NUMA-oblivious behaviour. ++ */ ++#define XENMEM_get_mfn_pxms 40 ++ ++#define XEN_INVALID_NUMA_ID (~(uint32_t)0) ++ ++struct xen_get_mfn_pxms { ++ GUEST_HANDLE(xen_pfn_t) mfns; ++ GUEST_HANDLE(uint32_t) pxms; ++ uint32_t nr_mfns; ++ uint32_t flags; ++}; ++DEFINE_GUEST_HANDLE_STRUCT(xen_get_mfn_pxms); ++ + #endif /* __XEN_PUBLIC_MEMORY_H__ */ +diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h +index 7dab04cf4a36..18b902bf79ef 100644 +--- a/include/xen/xenbus.h ++++ b/include/xen/xenbus.h +@@ -225,6 +225,19 @@ int xenbus_map_ring_valloc(struct xenbus_device *dev, grant_ref_t *gnt_refs, + + int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr); + ++/* ++ * Return the host NUMA node (Linux node id) of the foreign frame ++ * backing the first page of a mapping previously established by ++ * xenbus_map_ring_valloc(). Returns NUMA_NO_NODE if the hypervisor ++ * cannot provide the information, the mapping is not found, or the ++ * kernel was built without CONFIG_XEN_BACKEND_NUMA_AFFINITY. ++ * ++ * Intended for backends placing their service threads and IRQs on ++ * the node hosting the ring. The value is resolved at map time and ++ * cached on the mapping, so this call is a cheap lookup. 
++ */ ++int xenbus_ring_host_node(struct xenbus_device *dev, void *vaddr); ++ + int xenbus_alloc_evtchn(struct xenbus_device *dev, evtchn_port_t *port); + int xenbus_free_evtchn(struct xenbus_device *dev, evtchn_port_t port); + +-- +2.54.0 + diff --git a/patches/0003-xen-netback-place-per-queue-kthreads-and-IRQs-near-t.patch b/patches/0003-xen-netback-place-per-queue-kthreads-and-IRQs-near-t.patch new file mode 100644 index 0000000..f5ceaeb --- /dev/null +++ b/patches/0003-xen-netback-place-per-queue-kthreads-and-IRQs-near-t.patch @@ -0,0 +1,258 @@ +From d9b521a2e9bba0940078bb2a8d8052ef6871e57f Mon Sep 17 00:00:00 2001 +From: Steven Noonan +Date: Tue, 19 May 2026 18:40:35 -0700 +Subject: [PATCH 3/9] xen-netback: place per-queue kthreads and IRQs near the + ring + +Today both xenvif kthreads (guest-rx and dealloc) and the per-queue +event-channel IRQs run wherever the scheduler happened to place them, +which in PVH dom0 is typically all on one CPU regardless of the +guest's vNUMA layout. When a guest's per-queue ring lives on a +different host node from the kthread, every packet pays cross-node +interconnect cost to walk the ring and to grant-copy payload pages. + +Use xenbus_ring_host_node() to find the host node hosting the +queue's tx ring, then: + + - Create the guest-rx and dealloc kthreads with + kthread_create_on_node() so the task_struct (and kernel stack) is + allocated on the target node from the start. A bare + set_cpus_allowed_ptr() after kthread_run leaves the stack on the + caller's node and continues to pay cross-node cost on every + context switch. + - Pin each kthread to the node's cpumask with + set_cpus_allowed_ptr(). This restricts where the thread may run, + but an operator can still override it with taskset. No-op when + no node info is available. 
+ - Bind each evtchn IRQ on the ring's node using + bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node() (and the + non-handler variant for the ctrl ring), so the underlying desc is + allocated with the right node attribute and irqbalance treats the + IRQ as NUMA-local rather than floating. For the ctrl ring this + means resolving host_node before the bind instead of after. + - Steer the tx, rx, and ctrl IRQs with irq_set_affinity_and_hint(). + This writes both the actual affinity and the affinity_hint, so a + fresh boot without irqbalance routes IRQs correctly and irqbalance + agrees if it is running. Operator writes to + /proc/irq/N/smp_affinity still win. + - Clear the hint with irq_update_affinity_hint(irq, NULL) right + before each unbind_from_irqhandler() in xenvif_disconnect_queue + and xenvif_disconnect_ctrl. free_irq() warns if the hint is still + set at teardown (kernel/irq/manage.c:1865); destroying a domU + would otherwise WARN in xenwatch context. + +When xenbus_ring_host_node() returns NUMA_NO_NODE (older Xen without +the underlying hypercall, kernel built without +CONFIG_XEN_BACKEND_NUMA_AFFINITY, or a PV mapping), every NUMA-aware +step is skipped and the behaviour is byte-for-byte identical to the +previous code path. 
+ +Signed-off-by: Steven Noonan +--- + drivers/net/xen-netback/interface.c | 102 ++++++++++++++++++++++------ + 1 file changed, 82 insertions(+), 20 deletions(-) + +diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c +index a0a438881388..798f77595529 100644 +--- a/drivers/net/xen-netback/interface.c ++++ b/drivers/net/xen-netback/interface.c +@@ -31,6 +31,7 @@ + #include "common.h" + + #include ++#include + #include + #include + #include +@@ -638,19 +639,41 @@ int xenvif_connect_ctrl(struct xenvif *vif, grant_ref_t ring_ref, + if (req_prod - rsp_prod > RING_SIZE(&vif->ctrl)) + goto err_unmap; + +- err = bind_interdomain_evtchn_to_irq_lateeoi(xendev, evtchn); +- if (err < 0) +- goto err_unmap; ++ { ++ /* ++ * Resolve the host node before binding the IRQ so the ++ * desc itself is allocated on the right node (which is ++ * what irqbalance reads from /proc/irq/N/node when ++ * deciding affinity). Steer the threaded IRQ handler ++ * toward the same node with irq_set_affinity_and_hint so ++ * a fresh boot without irqbalance also routes correctly. ++ * Operator writes to /proc/irq/N/smp_affinity still win. 
++ */ ++ int node = xenbus_ring_host_node(xendev, vif->ctrl.sring); + +- vif->ctrl_irq = err; ++ err = bind_interdomain_evtchn_to_irq_lateeoi_on_node(xendev, ++ evtchn, ++ node); ++ if (err < 0) ++ goto err_unmap; + +- xenvif_init_hash(vif); ++ vif->ctrl_irq = err; + +- err = request_threaded_irq(vif->ctrl_irq, NULL, xenvif_ctrl_irq_fn, +- IRQF_ONESHOT, "xen-netback-ctrl", vif); +- if (err) { +- pr_warn("Could not setup irq handler for %s\n", dev->name); +- goto err_deinit; ++ xenvif_init_hash(vif); ++ ++ err = request_threaded_irq(vif->ctrl_irq, NULL, ++ xenvif_ctrl_irq_fn, ++ IRQF_ONESHOT, ++ "xen-netback-ctrl", vif); ++ if (err) { ++ pr_warn("Could not setup irq handler for %s\n", ++ dev->name); ++ goto err_deinit; ++ } ++ ++ if (node != NUMA_NO_NODE) ++ irq_set_affinity_and_hint(vif->ctrl_irq, ++ cpumask_of_node(node)); + } + + return 0; +@@ -686,6 +709,11 @@ static void xenvif_disconnect_queue(struct xenvif_queue *queue) + } + + if (queue->tx_irq) { ++ /* ++ * free_irq() warns if affinity_hint is still set. Drop the ++ * hint installed at connect time before tearing the IRQ down. ++ */ ++ irq_update_affinity_hint(queue->tx_irq, NULL); + unbind_from_irqhandler(queue->tx_irq, queue); + if (queue->tx_irq == queue->rx_irq) + queue->rx_irq = 0; +@@ -693,6 +721,7 @@ static void xenvif_disconnect_queue(struct xenvif_queue *queue) + } + + if (queue->rx_irq) { ++ irq_update_affinity_hint(queue->rx_irq, NULL); + unbind_from_irqhandler(queue->rx_irq, queue); + queue->rx_irq = 0; + } +@@ -708,6 +737,7 @@ int xenvif_connect_data(struct xenvif_queue *queue, + { + struct xenbus_device *dev = xenvif_to_xenbus_device(queue->vif); + struct task_struct *task; ++ int ring_node; + int err; + + BUG_ON(queue->tx_irq); +@@ -719,6 +749,16 @@ int xenvif_connect_data(struct xenvif_queue *queue, + if (err < 0) + goto err; + ++ /* ++ * Place the per-queue kthreads and IRQs on the host node hosting ++ * the ring page. 
Most of the per-packet work touches the ring; ++ * keeping the worker local cuts cross-node interconnect traffic. ++ * Returns NUMA_NO_NODE on hypervisors without XENMEM_get_mfn_pxms ++ * or on a kernel built without CONFIG_XEN_BACKEND_NUMA_AFFINITY, ++ * in which case the code below falls back to today's behaviour. ++ */ ++ ring_node = xenbus_ring_host_node(dev, queue->tx.sring); ++ + init_waitqueue_head(&queue->wq); + init_waitqueue_head(&queue->dealloc_wq); + atomic_set(&queue->inflight_packets, 0); +@@ -727,8 +767,14 @@ int xenvif_connect_data(struct xenvif_queue *queue, + + queue->stalled = true; + +- task = kthread_run(xenvif_kthread_guest_rx, queue, +- "%s-guest-rx", queue->name); ++ /* ++ * Split kthread create + wake so the task_struct (and its kernel ++ * stack) is allocated on the target node from the start. A bare ++ * set_cpus_allowed_ptr after kthread_run leaves the stack on the ++ * caller's node. ++ */ ++ task = kthread_create_on_node(xenvif_kthread_guest_rx, queue, ++ ring_node, "%s-guest-rx", queue->name); + if (IS_ERR(task)) + goto kthread_err; + queue->task = task; +@@ -737,43 +783,58 @@ int xenvif_connect_data(struct xenvif_queue *queue, + * if the thread function returns before kthread_stop is called. 
+ */ + get_task_struct(task); ++ if (ring_node != NUMA_NO_NODE) ++ set_cpus_allowed_ptr(task, cpumask_of_node(ring_node)); ++ wake_up_process(task); + +- task = kthread_run(xenvif_dealloc_kthread, queue, +- "%s-dealloc", queue->name); ++ task = kthread_create_on_node(xenvif_dealloc_kthread, queue, ++ ring_node, "%s-dealloc", queue->name); + if (IS_ERR(task)) + goto kthread_err; + queue->dealloc_task = task; ++ if (ring_node != NUMA_NO_NODE) ++ set_cpus_allowed_ptr(task, cpumask_of_node(ring_node)); ++ wake_up_process(task); + + if (tx_evtchn == rx_evtchn) { + /* feature-split-event-channels == 0 */ +- err = bind_interdomain_evtchn_to_irqhandler_lateeoi( ++ err = bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node( + dev, tx_evtchn, xenvif_interrupt, 0, +- queue->name, queue); ++ queue->name, queue, ring_node); + if (err < 0) + goto err; + queue->tx_irq = queue->rx_irq = err; + disable_irq(queue->tx_irq); ++ if (ring_node != NUMA_NO_NODE) ++ irq_set_affinity_and_hint(queue->tx_irq, ++ cpumask_of_node(ring_node)); + } else { + /* feature-split-event-channels == 1 */ + snprintf(queue->tx_irq_name, sizeof(queue->tx_irq_name), + "%s-tx", queue->name); +- err = bind_interdomain_evtchn_to_irqhandler_lateeoi( ++ err = bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node( + dev, tx_evtchn, xenvif_tx_interrupt, 0, +- queue->tx_irq_name, queue); ++ queue->tx_irq_name, queue, ring_node); + if (err < 0) + goto err; + queue->tx_irq = err; + disable_irq(queue->tx_irq); ++ if (ring_node != NUMA_NO_NODE) ++ irq_set_affinity_and_hint(queue->tx_irq, ++ cpumask_of_node(ring_node)); + + snprintf(queue->rx_irq_name, sizeof(queue->rx_irq_name), + "%s-rx", queue->name); +- err = bind_interdomain_evtchn_to_irqhandler_lateeoi( ++ err = bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node( + dev, rx_evtchn, xenvif_rx_interrupt, 0, +- queue->rx_irq_name, queue); ++ queue->rx_irq_name, queue, ring_node); + if (err < 0) + goto err; + queue->rx_irq = err; + disable_irq(queue->rx_irq); ++ if 
(ring_node != NUMA_NO_NODE) ++ irq_set_affinity_and_hint(queue->rx_irq, ++ cpumask_of_node(ring_node)); + } + + return 0; +@@ -820,6 +881,7 @@ void xenvif_disconnect_ctrl(struct xenvif *vif) + { + if (vif->ctrl_irq) { + xenvif_deinit_hash(vif); ++ irq_update_affinity_hint(vif->ctrl_irq, NULL); + unbind_from_irqhandler(vif->ctrl_irq, vif); + vif->ctrl_irq = 0; + } +-- +2.54.0 + diff --git a/patches/0004-xen-blkback-place-per-ring-kthread-and-IRQ-near-the-.patch b/patches/0004-xen-blkback-place-per-ring-kthread-and-IRQ-near-the-.patch new file mode 100644 index 0000000..38ddf46 --- /dev/null +++ b/patches/0004-xen-blkback-place-per-ring-kthread-and-IRQ-near-the-.patch @@ -0,0 +1,182 @@ +From dfc8f5b06ce2855798e26f1221a7c176b6ae9cd1 Mon Sep 17 00:00:00 2001 +From: Steven Noonan +Date: Tue, 19 May 2026 18:50:03 -0700 +Subject: [PATCH 4/9] xen-blkback: place per-ring kthread and IRQ near the ring + +The xenblkd kthread and per-ring event-channel IRQ today run wherever +the scheduler happens to place them, which in PVH dom0 is typically +all on one CPU regardless of the guest's vNUMA layout. When the +guest's per-ring buffer lives on a different host node from the +worker, every request pays cross-node interconnect cost to walk the +ring and to grant-copy payload pages. + +The blkback flow is split across two call sites: xen_blkif_map() maps +the ring and binds the event channel, while xen_update_blkif_status() +spawns the xenblkd kthread later. Stash the ring's host node on the +xen_blkif_ring at map time so the kthread creation site can pick it +up without re-querying. + +At map time: + - Call xenbus_ring_host_node() right after xenbus_map_ring_valloc() + succeeds. Store the result in ring->host_node. + - Bind the IRQ on the ring's node using + bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node() so the + underlying desc is allocated with the right node attribute and + irqbalance treats the IRQ as NUMA-local. 
+ - Steer the bound IRQ with irq_set_affinity_and_hint() so a fresh + boot without irqbalance routes correctly and irqbalance agrees if + it is running. Operator writes still win. + +At kthread creation time: + - Use kthread_create_on_node() with ring->host_node so the + task_struct (and kernel stack) is allocated on the target node + from the start. A bare set_cpus_allowed_ptr() after kthread_run + would leave the stack on the caller's node and pay cross-node + cost on every context switch. + - Hard-pin the kthread to the node's cpumask with + set_cpus_allowed_ptr(). An operator can still override with + taskset. + +At disconnect time: + - Clear the hint with irq_update_affinity_hint(ring->irq, NULL) + right before each unbind_from_irqhandler(). free_irq() warns if + the hint is still set at teardown (kernel/irq/manage.c:1865); + destroying a domU would otherwise WARN in xenwatch context. + +ring->host_node is initialised to NUMA_NO_NODE in xen_blkif_alloc_rings +so any path that misses the map step (or runs on a hypervisor without +XENMEM_get_mfn_pxms, or a kernel without CONFIG_XEN_BACKEND_NUMA_AFFINITY) +falls back to today's NUMA-oblivious behaviour without further checks. + +Signed-off-by: Steven Noonan +--- + drivers/block/xen-blkback/common.h | 7 +++++ + drivers/block/xen-blkback/xenbus.c | 48 ++++++++++++++++++++++++++++-- + 2 files changed, 52 insertions(+), 3 deletions(-) + +diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h +index b427d54bc120..d670536ebb9e 100644 +--- a/drivers/block/xen-blkback/common.h ++++ b/drivers/block/xen-blkback/common.h +@@ -297,6 +297,13 @@ struct xen_blkif_ring { + /* Thread shutdown wait queue. */ + wait_queue_head_t shutdown_wq; + struct xen_blkif *blkif; ++ ++ /* ++ * Linux node id of the host NUMA node hosting blk_ring, or ++ * NUMA_NO_NODE if unknown. Resolved at xen_blkif_map() time, ++ * consumed when starting the per-ring xenblkd kthread. 
++ */ ++ int host_node; + }; + + struct xen_blkif { +diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c +index 0621878940ae..7ded88e97472 100644 +--- a/drivers/block/xen-blkback/xenbus.c ++++ b/drivers/block/xen-blkback/xenbus.c +@@ -10,6 +10,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -108,7 +109,17 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) + + for (i = 0; i < blkif->nr_rings; i++) { + ring = &blkif->rings[i]; +- ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i); ++ /* ++ * Allocate the task_struct (including its kernel stack) on ++ * the node hosting the ring so that subsequent kernel-mode ++ * accesses on this thread stay local. set_cpus_allowed_ptr ++ * after kthread_create_on_node hard-pins to that node's CPUs; ++ * an operator may still override with taskset. NUMA_NO_NODE ++ * leaves placement to the default scheduler. ++ */ ++ ring->xenblkd = kthread_create_on_node(xen_blkif_schedule, ring, ++ ring->host_node, ++ "%s-%d", name, i); + if (IS_ERR(ring->xenblkd)) { + err = PTR_ERR(ring->xenblkd); + ring->xenblkd = NULL; +@@ -116,6 +127,10 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) + "start %s-%d xenblkd", name, i); + goto out; + } ++ if (ring->host_node != NUMA_NO_NODE) ++ set_cpus_allowed_ptr(ring->xenblkd, ++ cpumask_of_node(ring->host_node)); ++ wake_up_process(ring->xenblkd); + } + return; + +@@ -150,6 +165,7 @@ static int xen_blkif_alloc_rings(struct xen_blkif *blkif) + init_waitqueue_head(&ring->pending_free_wq); + init_waitqueue_head(&ring->shutdown_wq); + ring->blkif = blkif; ++ ring->host_node = NUMA_NO_NODE; + ring->st_print = jiffies; + ring->active = true; + } +@@ -207,6 +223,15 @@ static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref, + if (err < 0) + return err; + ++ /* ++ * Stash the host node now while the mapping is fresh. 
The ++ * xenblkd kthread is created later from xen_update_blkif_status() ++ * and consumes this value to place the worker on the node that ++ * owns the ring. NUMA_NO_NODE leaves placement to the default ++ * scheduler. ++ */ ++ ring->host_node = xenbus_ring_host_node(blkif->be->dev, ring->blk_ring); ++ + sring_common = (struct blkif_common_sring *)ring->blk_ring; + rsp_prod = READ_ONCE(sring_common->rsp_prod); + req_prod = READ_ONCE(sring_common->req_prod); +@@ -250,12 +275,23 @@ static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref, + if (req_prod - rsp_prod > size) + goto fail; + +- err = bind_interdomain_evtchn_to_irqhandler_lateeoi(blkif->be->dev, +- evtchn, xen_blkif_be_int, 0, "blkif-backend", ring); ++ err = bind_interdomain_evtchn_to_irqhandler_lateeoi_on_node( ++ blkif->be->dev, evtchn, xen_blkif_be_int, 0, ++ "blkif-backend", ring, ring->host_node); + if (err < 0) + goto fail; + ring->irq = err; + ++ /* ++ * Route the event channel toward the ring's host node. Writes ++ * both the actual affinity (relied on at boot when irqbalance is ++ * absent) and the hint (so irqbalance agrees if it is running). ++ * Operator writes to /proc/irq/N/smp_affinity still win. ++ */ ++ if (ring->host_node != NUMA_NO_NODE) ++ irq_set_affinity_and_hint(ring->irq, ++ cpumask_of_node(ring->host_node)); ++ + return 0; + + fail: +@@ -293,6 +329,12 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif) + } + + if (ring->irq) { ++ /* ++ * free_irq() warns if affinity_hint is still set. ++ * Drop the hint installed at map time before tearing ++ * the IRQ down. 
++ */ ++ irq_update_affinity_hint(ring->irq, NULL); + unbind_from_irqhandler(ring->irq, ring); + ring->irq = 0; + } +-- +2.54.0 + diff --git a/patches/0005-xen-make-xen_alloc_unpopulated_pages-NUMA-aware.patch b/patches/0005-xen-make-xen_alloc_unpopulated_pages-NUMA-aware.patch new file mode 100644 index 0000000..d179c44 --- /dev/null +++ b/patches/0005-xen-make-xen_alloc_unpopulated_pages-NUMA-aware.patch @@ -0,0 +1,358 @@ +From aa8e71b14c040931222a36998a789e4ce3484c70 Mon Sep 17 00:00:00 2001 +From: Steven Noonan +Date: Tue, 19 May 2026 18:56:34 -0700 +Subject: [PATCH 5/9] xen: make xen_alloc_unpopulated_pages NUMA-aware + +xen_alloc_unpopulated_pages today hands out ZONE_DEVICE placeholder +pages from a single global free list, with the underlying section +registered via memremap_pages(pgmap, NUMA_NO_NODE). page_to_nid() +of any such page reports NUMA_NO_NODE, so every consumer of +foreign-mapped grant pages -- xenbus rings, gntdev mmaps, xlate_mmu, +privcmd, xen-drm -- has no useful node information for kernel code +that consults page_to_nid (slab placement decisions, autonuma, +numastat, various softirq routing heuristics). + +Partition the free list by Linux node id and register each +section-aligned IOMEM allocation against a specific node. Pages +drawn from page_list[N] then report page_to_nid() == N by +construction. Callers that know the host node of the foreign frame +they intend to grant-map can request from the matching pool; the +foreign frame's node and the placeholder page's nid then agree. + +Two exported entry points: + + xen_alloc_unpopulated_pages_node(nr, pages, node) -- new, takes an + explicit node preference. NUMA_NO_NODE and out-of-range values + are clamped to numa_node_id() at the boundary; internal code only + ever indexes into a valid slot. + + xen_alloc_unpopulated_pages(nr, pages) -- existing API, now a + wrapper that passes numa_node_id(). 
Strictly an improvement over + NUMA_NO_NODE for non-xenbus callers (gntdev, xlate_mmu, privcmd, + drm) that have no node hint of their own: the caller's local node + is at least as good a guess as none. + +xen_free_unpopulated_pages routes each page back to the pool of the +node it came from via page_to_nid(). No new arguments; existing +callers do not change. + +A single mutex still covers all per-node lists. Allocation and free +are connect-time events on I/O backends and on map_grant_ref ioctls, +not contention-sensitive paths. + +MAX_NUMNODES sizes the two arrays; on CONFIG_NUMA=n that is 1 and the +behaviour is byte-for-byte equivalent to the previous single-list +implementation. + +Wire xenbus_map_ring_hvm() through the new API. Placeholders are +first drawn from numa_node_id() because the foreign MFN is not known +before the grant_map. After grant_map succeeds and the first page's +host node is known via XENMEM_get_mfn_pxms, the placeholders are +relocated: unmap, free, re-allocate on the target node, re-map. When +the initial allocation already landed on the right node the relocate +is skipped. + +Signed-off-by: Steven Noonan +--- + drivers/xen/unpopulated-alloc.c | 103 +++++++++++++++++++++++------ + drivers/xen/xenbus/xenbus_client.c | 60 +++++++++++++++++ + include/xen/xen.h | 7 ++ + 3 files changed, 149 insertions(+), 21 deletions(-) + +diff --git a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c +index 1dc0b495c8e5..ea9c76895459 100644 +--- a/drivers/xen/unpopulated-alloc.c ++++ b/drivers/xen/unpopulated-alloc.c +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -12,12 +13,31 @@ + #include + #include + ++/* ++ * Free pages are kept on per-node lists indexed by Linux node id. Each ++ * fill_list() call grabs a fresh section-aligned IOMEM region and ++ * registers it with memremap_pages() against a specific node, so all ++ * struct pages in that section report that node via page_to_nid(). 
++ * Backends that learn the host node of a foreign frame (via ++ * xenbus_ring_host_node) can request placeholder pages from the ++ * matching pool so page_to_nid agrees with the actual host placement. ++ * ++ * A single mutex covers all per-node lists. Alloc/free are ++ * connect-time events on the I/O backends and not contention-sensitive. ++ */ + static DEFINE_MUTEX(list_lock); +-static struct page *page_list; +-static unsigned int list_count; ++static struct page *page_list[MAX_NUMNODES]; ++static unsigned int list_count[MAX_NUMNODES]; + + static struct resource *target_resource; + ++static int xen_unpopulated_clamp_node(int node) ++{ ++ if (node == NUMA_NO_NODE || node < 0 || node >= MAX_NUMNODES) ++ return numa_node_id(); ++ return node; ++} ++ + /* Pages to subtract from the memory count when setting balloon target. */ + unsigned long xen_unpopulated_pages __initdata; + +@@ -34,7 +54,7 @@ int __weak __init arch_xen_unpopulated_init(struct resource **res) + return 0; + } + +-static int fill_list(unsigned int nr_pages) ++static int fill_list(unsigned int nr_pages, int node) + { + struct dev_pagemap *pgmap; + struct resource *res, *tmp_res = NULL; +@@ -121,7 +141,15 @@ static int fill_list(unsigned int nr_pages) + } + #endif + +- vaddr = memremap_pages(pgmap, NUMA_NO_NODE); ++ /* ++ * Register the section against @node so page_to_nid() of any ++ * page in this section reports that value. Grant-map operations ++ * later install foreign MFNs into these slots; as long as the ++ * caller picks the section matching the foreign MFN's host node ++ * (which is the contract callers of xen_alloc_unpopulated_pages_node ++ * are expected to honour), page_to_nid is correct by construction. 
++ */ ++ vaddr = memremap_pages(pgmap, node); + if (IS_ERR(vaddr)) { + pr_err("Cannot remap memory range\n"); + ret = PTR_ERR(vaddr); +@@ -131,9 +159,9 @@ static int fill_list(unsigned int nr_pages) + for (i = 0; i < alloc_pages; i++) { + struct page *pg = virt_to_page(vaddr + PAGE_SIZE * i); + +- pg->zone_device_data = page_list; +- page_list = pg; +- list_count++; ++ pg->zone_device_data = page_list[node]; ++ page_list[node] = pg; ++ list_count[node]++; + } + + return 0; +@@ -153,12 +181,21 @@ static int fill_list(unsigned int nr_pages) + } + + /** +- * xen_alloc_unpopulated_pages - alloc unpopulated pages ++ * xen_alloc_unpopulated_pages_node - alloc unpopulated pages on a node + * @nr_pages: Number of pages + * @pages: pages returned +- * @return 0 on success, error otherwise ++ * @node: Preferred Linux node id, or NUMA_NO_NODE for current CPU's node ++ * ++ * The returned pages are drawn from a per-node pool registered with ++ * memremap_pages() against @node, so page_to_nid() reports @node for ++ * every returned page. Callers that know the host node of a foreign ++ * frame should pass it here to keep page_to_nid in agreement with the ++ * actual host placement after a subsequent grant-map. ++ * ++ * Returns 0 on success, error otherwise. 
+ */ +-int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages) ++int xen_alloc_unpopulated_pages_node(unsigned int nr_pages, struct page **pages, ++ int node) + { + unsigned int i; + int ret = 0; +@@ -171,19 +208,21 @@ int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages) + if (!target_resource) + return xen_alloc_ballooned_pages(nr_pages, pages); + ++ node = xen_unpopulated_clamp_node(node); ++ + mutex_lock(&list_lock); +- if (list_count < nr_pages) { +- ret = fill_list(nr_pages - list_count); ++ if (list_count[node] < nr_pages) { ++ ret = fill_list(nr_pages - list_count[node], node); + if (ret) + goto out; + } + + for (i = 0; i < nr_pages; i++) { +- struct page *pg = page_list; ++ struct page *pg = page_list[node]; + + BUG_ON(!pg); +- page_list = pg->zone_device_data; +- list_count--; ++ page_list[node] = pg->zone_device_data; ++ list_count[node]--; + pages[i] = pg; + + #ifdef CONFIG_XEN_HAVE_PVMMU +@@ -193,9 +232,9 @@ int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages) + unsigned int j; + + for (j = 0; j <= i; j++) { +- pages[j]->zone_device_data = page_list; +- page_list = pages[j]; +- list_count++; ++ pages[j]->zone_device_data = page_list[node]; ++ page_list[node] = pages[j]; ++ list_count[node]++; + } + goto out; + } +@@ -207,12 +246,32 @@ int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages) + mutex_unlock(&list_lock); + return ret; + } ++EXPORT_SYMBOL(xen_alloc_unpopulated_pages_node); ++ ++/** ++ * xen_alloc_unpopulated_pages - alloc unpopulated pages ++ * @nr_pages: Number of pages ++ * @pages: pages returned ++ * @return 0 on success, error otherwise ++ * ++ * Equivalent to xen_alloc_unpopulated_pages_node() with the current ++ * CPU's node as the preference. 
++ */ ++int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages) ++{ ++ return xen_alloc_unpopulated_pages_node(nr_pages, pages, numa_node_id()); ++} + EXPORT_SYMBOL(xen_alloc_unpopulated_pages); + + /** + * xen_free_unpopulated_pages - return unpopulated pages + * @nr_pages: Number of pages + * @pages: pages to return ++ * ++ * Each page returns to the pool of the node it was originally allocated ++ * from, identified via page_to_nid(). Sections registered to a ++ * specific node yield pages whose nid reflects that node, so freed ++ * pages naturally land back in the matching list. + */ + void xen_free_unpopulated_pages(unsigned int nr_pages, struct page **pages) + { +@@ -225,9 +284,11 @@ void xen_free_unpopulated_pages(unsigned int nr_pages, struct page **pages) + + mutex_lock(&list_lock); + for (i = 0; i < nr_pages; i++) { +- pages[i]->zone_device_data = page_list; +- page_list = pages[i]; +- list_count++; ++ int node = xen_unpopulated_clamp_node(page_to_nid(pages[i])); ++ ++ pages[i]->zone_device_data = page_list[node]; ++ page_list[node] = pages[i]; ++ list_count[node]++; + } + mutex_unlock(&list_lock); + } +diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c +index 8e0695ba39a3..7f94b135727a 100644 +--- a/drivers/xen/xenbus/xenbus_client.c ++++ b/drivers/xen/xenbus/xenbus_client.c +@@ -780,6 +780,66 @@ static int xenbus_map_ring_hvm(struct xenbus_device *dev, + node->host_node = xenbus_query_mfn_node( + PFN_DOWN(info->map[0].dev_bus_addr)); + ++ /* ++ * Placeholder pages came from numa_node_id()'s pool, which only ++ * matches the foreign frame's node by coincidence. If they ++ * disagree, drop the mapping, return the placeholders, and redo ++ * the map with placeholders drawn from the correct pool. 
After ++ * this, page_to_nid() of every ring page equals the host node of ++ * its foreign MFN by construction, which keeps grant-mapped pages ++ * truthful to every NUMA-aware code path that consults page_to_nid. ++ * ++ * The cost is one extra grant unmap + map pair per backend ++ * connect (a rare event) and is paid only when the placeholder ++ * pool's node disagrees with the foreign frame. PV mappings and ++ * cases where Xen cannot supply node info skip the dance entirely ++ * (host_node stays NUMA_NO_NODE). ++ */ ++ if (node->host_node != NUMA_NO_NODE && ++ page_to_nid(node->hvm.pages[0]) != node->host_node) { ++ int relocate_err; ++ ++ relocate_err = xenbus_unmap_ring(dev, node->handles, nr_grefs, ++ info->addrs); ++ if (relocate_err != GNTST_okay) { ++ /* ++ * Partial unmap: at least one grant may still be ++ * live against a placeholder we can no longer ++ * reach safely. Mark the pages leaked and fail ++ * the whole map. ++ */ ++ leaked = true; ++ err = -EIO; ++ goto out_free_ballooned_pages; ++ } ++ ++ xen_free_unpopulated_pages(nr_pages, node->hvm.pages); ++ ++ err = xen_alloc_unpopulated_pages_node(nr_pages, ++ node->hvm.pages, ++ node->host_node); ++ if (err) { ++ /* ++ * Pages already gone; clear the array so the ++ * cleanup path does not try to free them again. 
++ */ ++ memset(node->hvm.pages, 0, ++ nr_pages * sizeof(*node->hvm.pages)); ++ node->nr_handles = 0; ++ goto out_err; ++ } ++ ++ info->idx = 0; ++ gnttab_foreach_grant(node->hvm.pages, nr_grefs, ++ xenbus_map_ring_setup_grant_hvm, ++ info); ++ ++ err = __xenbus_map_ring(dev, gnt_ref, nr_grefs, node->handles, ++ info, GNTMAP_host_map, &leaked); ++ if (err) ++ goto out_free_ballooned_pages; ++ } ++ + addr = vmap(node->hvm.pages, nr_pages, VM_MAP | VM_IOREMAP, + PAGE_KERNEL); + if (!addr) { +diff --git a/include/xen/xen.h b/include/xen/xen.h +index f280c5dcf923..f38cb138d837 100644 +--- a/include/xen/xen.h ++++ b/include/xen/xen.h +@@ -70,6 +70,8 @@ extern u64 xen_saved_max_mem_size; + + #ifdef CONFIG_XEN_UNPOPULATED_ALLOC + extern unsigned long xen_unpopulated_pages; ++int xen_alloc_unpopulated_pages_node(unsigned int nr_pages, struct page **pages, ++ int node); + int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages); + void xen_free_unpopulated_pages(unsigned int nr_pages, struct page **pages); + #include +@@ -77,6 +79,11 @@ int arch_xen_unpopulated_init(struct resource **res); + #else + #define xen_unpopulated_pages 0UL + #include ++static inline int xen_alloc_unpopulated_pages_node(unsigned int nr_pages, ++ struct page **pages, int node) ++{ ++ return xen_alloc_ballooned_pages(nr_pages, pages); ++} + static inline int xen_alloc_unpopulated_pages(unsigned int nr_pages, + struct page **pages) + { +-- +2.54.0 + diff --git a/patches/0006-xen-xenbus-collapse-xenbus_ring_host_node-to-a-page_.patch b/patches/0006-xen-xenbus-collapse-xenbus_ring_host_node-to-a-page_.patch new file mode 100644 index 0000000..d872882 --- /dev/null +++ b/patches/0006-xen-xenbus-collapse-xenbus_ring_host_node-to-a-page_.patch @@ -0,0 +1,158 @@ +From 14a4b2b5f12fa8359b84e48ec01586285d1f71e9 Mon Sep 17 00:00:00 2001 +From: Steven Noonan +Date: Tue, 19 May 2026 19:09:31 -0700 +Subject: [PATCH 6/9] xen/xenbus: collapse xenbus_ring_host_node to a + page_to_nid wrapper + +Now 
that xen_alloc_unpopulated_pages places ring placeholders on the +foreign frame's host node and xenbus_map_ring_hvm relocates the +placeholders post-map when they land in the wrong pool, page_to_nid +of any vmap'd ring page in PVH dom0 already reports the host node of +its foreign MFN. xenbus_ring_host_node can therefore drop the +list-walk + cached integer plumbing and become a thin +wrapper around vmalloc_to_page() + page_to_nid(). + +The XENMEM_get_mfn_pxms hypercall is still issued at map time inside +xenbus_map_ring_hvm to learn the foreign frame's host node before +deciding whether to relocate. The decision now uses a local variable +rather than caching the value on struct xenbus_map_node, so the +host_node field is removed entirely. + +PV dom0 keeps the early NUMA_NO_NODE return: PV mappings install +foreign MFNs directly in the PTEs and have no struct page in dom0's +mem_map for the foreign frame, so vmalloc_to_page on a PV ring vaddr +returns the placeholder for the wrong frame (or NULL). PV dom0 is +not a NUMA-affinity target on Edera in any case. + +Backend call sites do not change: netback and blkback continue to +call xenbus_ring_host_node exactly as before. The work moves from +"walk a list keyed on vaddr, return a cached integer set at map time" +to "look up the struct page for vaddr, return its nid". Cheaper at +runtime; no xenbus_valloc_lock needed; same result when Xen reports a node. 
+ +Signed-off-by: Steven Noonan +--- + drivers/xen/xenbus/xenbus_client.c | 55 ++++++++++++++---------------- + 1 file changed, 25 insertions(+), 30 deletions(-) + +diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c +index 7f94b135727a..f357c4a0372f 100644 +--- a/drivers/xen/xenbus/xenbus_client.c ++++ b/drivers/xen/xenbus/xenbus_client.c +@@ -74,7 +74,6 @@ struct xenbus_map_node { + }; + grant_handle_t handles[XENBUS_MAX_RING_GRANTS]; + unsigned int nr_handles; +- int host_node; /* Linux node id of foreign frame, or NUMA_NO_NODE */ + }; + + struct map_ring_valloc { +@@ -748,12 +747,11 @@ static int xenbus_map_ring_hvm(struct xenbus_device *dev, + { + struct xenbus_map_node *node = info->node; + int err; ++ int host_node = NUMA_NO_NODE; + void *addr; + bool leaked = false; + unsigned int nr_pages = XENBUS_PAGES(nr_grefs); + +- node->host_node = NUMA_NO_NODE; +- + err = xen_alloc_unpopulated_pages(nr_pages, node->hvm.pages); + if (err) + goto out_err; +@@ -770,14 +768,13 @@ static int xenbus_map_ring_hvm(struct xenbus_device *dev, + goto out_free_ballooned_pages; + + /* +- * Xen unconditionally fills dev_bus_addr with the foreign frame's +- * machine address on a successful host_map (see grant_table.c in +- * the hypervisor). Pick up the first ring page's MFN and resolve +- * it now while we still have the map info; the result is cached on +- * the xenbus_map_node so backends can look it up cheaply later. ++ * Xen fills dev_bus_addr with the foreign frame's machine ++ * address on a successful host_map (see grant_table.c in the ++ * hypervisor). Resolve the host node now so we know whether the ++ * placeholders need to be relocated below. 
+ */ + if (nr_grefs > 0) +- node->host_node = xenbus_query_mfn_node( ++ host_node = xenbus_query_mfn_node( + PFN_DOWN(info->map[0].dev_bus_addr)); + + /* +@@ -792,11 +789,10 @@ static int xenbus_map_ring_hvm(struct xenbus_device *dev, + * The cost is one extra grant unmap + map pair per backend + * connect (a rare event) and is paid only when the placeholder + * pool's node disagrees with the foreign frame. PV mappings and +- * cases where Xen cannot supply node info skip the dance entirely +- * (host_node stays NUMA_NO_NODE). ++ * cases where Xen cannot supply node info skip the dance entirely. + */ +- if (node->host_node != NUMA_NO_NODE && +- page_to_nid(node->hvm.pages[0]) != node->host_node) { ++ if (host_node != NUMA_NO_NODE && ++ page_to_nid(node->hvm.pages[0]) != host_node) { + int relocate_err; + + relocate_err = xenbus_unmap_ring(dev, node->handles, nr_grefs, +@@ -817,7 +813,7 @@ static int xenbus_map_ring_hvm(struct xenbus_device *dev, + + err = xen_alloc_unpopulated_pages_node(nr_pages, + node->hvm.pages, +- node->host_node); ++ host_node); + if (err) { + /* + * Pages already gone; clear the array so the +@@ -892,22 +888,24 @@ EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); + + int xenbus_ring_host_node(struct xenbus_device *dev, void *vaddr) + { +- struct xenbus_map_node *node; +- int node_id = NUMA_NO_NODE; ++ struct page *page; + +- spin_lock(&xenbus_valloc_lock); +- list_for_each_entry(node, &xenbus_valloc_pages, next) { +- void *addr = xen_pv_domain() ? node->pv.area->addr +- : node->hvm.addr; ++ /* ++ * PV mappings install foreign MFNs directly in the PTEs and have ++ * no struct page in dom0's mem_map for the foreign frame. PVH ++ * dom0 keeps a placeholder struct page (allocated from the ++ * matching per-node pool of xen_alloc_unpopulated_pages_node) ++ * whose page_to_nid() reports the host node of the foreign frame ++ * by construction. 
++ */ ++ if (xen_pv_domain()) ++ return NUMA_NO_NODE; + +- if (addr == vaddr) { +- node_id = node->host_node; +- break; +- } +- } +- spin_unlock(&xenbus_valloc_lock); ++ page = vmalloc_to_page(vaddr); ++ if (!page) ++ return NUMA_NO_NODE; + +- return node_id; ++ return page_to_nid(page); + } + EXPORT_SYMBOL_GPL(xenbus_ring_host_node); + +@@ -931,9 +929,6 @@ static int xenbus_map_ring_pv(struct xenbus_device *dev, + bool leaked = false; + int err = -ENOMEM; + +- /* PV dom0 is not a NUMA-affinity target; leave the value unset. */ +- node->host_node = NUMA_NO_NODE; +- + area = get_vm_area(XEN_PAGE_SIZE * nr_grefs, VM_IOREMAP); + if (!area) + return -ENOMEM; +-- +2.54.0 + diff --git a/patches/0007-xen-xenbus-add-xenbus_setup_ring_node-for-per-node-r.patch b/patches/0007-xen-xenbus-add-xenbus_setup_ring_node-for-per-node-r.patch new file mode 100644 index 0000000..188df71 --- /dev/null +++ b/patches/0007-xen-xenbus-add-xenbus_setup_ring_node-for-per-node-r.patch @@ -0,0 +1,217 @@ +From 150666b04dd7ebdef6641d9a09f0d90bfbf801aa Mon Sep 17 00:00:00 2001 +From: Steven Noonan +Date: Tue, 19 May 2026 19:17:39 -0700 +Subject: [PATCH 7/9] xen/xenbus: add xenbus_setup_ring_node for per-node ring + allocation + +Frontend drivers today route every ring allocation through +xenbus_setup_ring, which calls alloc_pages_exact with no node +preference. The result is that every PV ring -- across all queues +of all multi-queue devices on a guest -- ends up on whichever node +the xenbus watch handler happens to run on, typically a single fixed +value at boot. Multi-queue devices on multi-vnode guests therefore +defeat the dom0 backend's per-queue NUMA affinity work: all of dom0's +backend kthreads cluster on one host node because all of the guest's +rings live on one guest node, which maps to one host node. + +Add a node-aware variant. xenbus_setup_ring_node(dev, gfp, node, ...) +takes a Linux node id and draws the ring pages from that node's buddy +list. 
xenbus_setup_ring() is now a thin wrapper that passes +NUMA_NO_NODE, preserving existing behaviour for every caller until +they opt in. + +The same-ring locality property is preserved by construction. A +single buddy allocation comes from a single node's free list, so all +pages of one ring remain on one node regardless of which node was +requested. The new variant only changes which node that is. + +alloc_pages_exact_nid is __meminit-restricted and not exported, so +the body cannot just delegate to it. Use alloc_pages_node() (which +is exported and runtime-safe) to get an order-N block on the target +node, then split_page() it so every subpage carries an independent +refcount, and free any tail pages beyond ring_size back to the +allocator. + +Also add xenbus_node_for_queue(index), a helper that rotates over the +set of nodes with online CPUs. Frontend callers feed it a per-queue +or per-ring index to pick the node they pass to +xenbus_setup_ring_node. The natural shape is +cpumask_local_spread(i, NUMA_NO_NODE), but with a NUMA_NO_NODE node +argument it falls back to a linear walk of cpu_online_mask (see +sched_numa_find_nth_cpu) and collapses every queue onto the first +node's CPUs. Going through num_node_state(N_CPU) and +for_each_node_state(node, N_CPU) actually rotates. Living in +xenbus_client.c rather than open-coded in each driver lets future +scsifront / pvcalls-front style frontends pick it up for free. 
+ +Signed-off-by: Steven Noonan +--- + drivers/xen/xenbus/xenbus_client.c | 105 ++++++++++++++++++++++++++--- + include/xen/xenbus.h | 4 ++ + 2 files changed, 99 insertions(+), 10 deletions(-) + +diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c +index f357c4a0372f..7419a8183903 100644 +--- a/drivers/xen/xenbus/xenbus_client.c ++++ b/drivers/xen/xenbus/xenbus_client.c +@@ -31,6 +31,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -448,33 +449,103 @@ static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err, + } + + /* +- * xenbus_setup_ring ++ * xenbus_setup_ring_node + * @dev: xenbus device ++ * @gfp: GFP flags for the allocation ++ * @node: preferred Linux node id for the ring pages, or NUMA_NO_NODE + * @vaddr: pointer to starting virtual address of the ring + * @nr_pages: number of pages to be granted + * @grefs: grant reference array to be filled in + * +- * Allocate physically contiguous pages for a shared ring buffer and grant it +- * to the peer of the given device. The ring buffer is initially filled with +- * zeroes. The virtual address of the ring is stored at @vaddr and the +- * grant references are stored in the @grefs array. In case of error @vaddr +- * will be set to NULL and @grefs will be filled with INVALID_GRANT_REF. ++ * Same contract as xenbus_setup_ring(), but the ring pages are drawn ++ * from @node's buddy free list when possible (subject to fallback when ++ * @node has no available memory). All pages of a single ring come ++ * from one buddy allocation so they remain on a single node by ++ * construction, which is the property frontends rely on to keep ++ * per-queue rings on per-queue nodes. ++ * ++ * The ring buffer is initially filled with zeroes. The virtual address ++ * of the ring is stored at @vaddr and the grant references are stored ++ * in the @grefs array. In case of error @vaddr will be set to NULL and ++ * @grefs will be filled with INVALID_GRANT_REF. 
+ */ +-int xenbus_setup_ring(struct xenbus_device *dev, gfp_t gfp, void **vaddr, +- unsigned int nr_pages, grant_ref_t *grefs) ++/* ++ * Pick a Linux node id from the set of nodes with online CPUs, cycling ++ * by @index. Frontends use this to distribute per-queue rings across ++ * guest NUMA nodes so the dom0 backend's per-ring placement lands them ++ * on distinct host nodes. ++ * ++ * cpumask_local_spread(i, NUMA_NO_NODE) is the natural shape this code ++ * wants, but with a NUMA_NO_NODE node argument it falls back to a ++ * straight linear walk of cpu_online_mask (see sched_numa_find_nth_cpu) ++ * which collapses every queue onto the first node's CPUs. This helper ++ * actually rotates over nodes. ++ */ ++int xenbus_node_for_queue(unsigned int index) ++{ ++ unsigned int idx = 0; ++ unsigned int n; ++ int node; ++ ++ n = num_node_state(N_CPU); ++ if (n == 0) ++ return NUMA_NO_NODE; ++ ++ index %= n; ++ for_each_node_state(node, N_CPU) { ++ if (idx == index) ++ return node; ++ idx++; ++ } ++ return NUMA_NO_NODE; ++} ++EXPORT_SYMBOL_GPL(xenbus_node_for_queue); ++ ++int xenbus_setup_ring_node(struct xenbus_device *dev, gfp_t gfp, int node, ++ void **vaddr, unsigned int nr_pages, ++ grant_ref_t *grefs) + { + unsigned long ring_size = nr_pages * XEN_PAGE_SIZE; ++ unsigned int order; ++ unsigned long nr_alloc; ++ struct page *page; + grant_ref_t gref_head; + unsigned int i; + void *addr; + int ret; + +- addr = *vaddr = alloc_pages_exact(ring_size, gfp | __GFP_ZERO); +- if (!*vaddr) { ++ *vaddr = NULL; ++ ++ /* ++ * Mirror the GFP filtering that alloc_pages_exact() does ++ * internally: split_page() below requires a non-compound page ++ * and HIGHMEM is incompatible with the direct virt mapping used ++ * by the grant code. 
++ */ ++ gfp &= ~(__GFP_COMP | __GFP_HIGHMEM); ++ ++ order = get_order(ring_size); ++ page = alloc_pages_node(node, gfp | __GFP_ZERO, order); ++ if (!page) { + ret = -ENOMEM; + goto err; + } + ++ /* ++ * alloc_pages_node returns a single order-N block where only ++ * the head is refcounted. split_page makes every subpage ++ * individually refcounted so free_pages_exact() can release the ++ * ring page-by-page. Return any tail pages beyond ring_size to ++ * the allocator immediately. ++ */ ++ split_page(page, order); ++ nr_alloc = 1UL << order; ++ for (i = DIV_ROUND_UP(ring_size, PAGE_SIZE); i < nr_alloc; i++) ++ __free_page(page + i); ++ ++ addr = page_address(page); ++ *vaddr = addr; ++ + ret = gnttab_alloc_grant_references(nr_pages, &gref_head); + if (ret) { + xenbus_dev_fatal(dev, ret, "granting access to %u ring pages", +@@ -508,6 +579,20 @@ int xenbus_setup_ring(struct xenbus_device *dev, gfp_t gfp, void **vaddr, + + return ret; + } ++EXPORT_SYMBOL_GPL(xenbus_setup_ring_node); ++ ++/* ++ * xenbus_setup_ring ++ * ++ * Equivalent to xenbus_setup_ring_node() with no node preference; the ++ * pages come from the current CPU's local node by default GFP policy. 
++ */ ++int xenbus_setup_ring(struct xenbus_device *dev, gfp_t gfp, void **vaddr, ++ unsigned int nr_pages, grant_ref_t *grefs) ++{ ++ return xenbus_setup_ring_node(dev, gfp, NUMA_NO_NODE, vaddr, nr_pages, ++ grefs); ++} + EXPORT_SYMBOL_GPL(xenbus_setup_ring); + + /* +diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h +index 18b902bf79ef..8ce096797b86 100644 +--- a/include/xen/xenbus.h ++++ b/include/xen/xenbus.h +@@ -216,6 +216,10 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, + const char *pathfmt, ...); + + int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state); ++int xenbus_node_for_queue(unsigned int index); ++int xenbus_setup_ring_node(struct xenbus_device *dev, gfp_t gfp, int node, ++ void **vaddr, unsigned int nr_pages, ++ grant_ref_t *grefs); + int xenbus_setup_ring(struct xenbus_device *dev, gfp_t gfp, void **vaddr, + unsigned int nr_pages, grant_ref_t *grefs); + void xenbus_teardown_ring(void **vaddr, unsigned int nr_pages, +-- +2.54.0 + diff --git a/patches/0008-xen-netfront-place-per-queue-rings-on-per-queue-node.patch b/patches/0008-xen-netfront-place-per-queue-rings-on-per-queue-node.patch new file mode 100644 index 0000000..7d63b90 --- /dev/null +++ b/patches/0008-xen-netfront-place-per-queue-rings-on-per-queue-node.patch @@ -0,0 +1,270 @@ +From 2dd13fab395903f26e7f37eaeacec05c8fdd0233 Mon Sep 17 00:00:00 2001 +From: Steven Noonan +Date: Tue, 19 May 2026 19:21:52 -0700 +Subject: [PATCH 8/9] xen-netfront: place per-queue rings on per-queue nodes, + with XPS + +Today every netfront queue allocates its tx and rx rings from +xenbus_setup_ring(), which has no node preference and pulls from the +buddy free list of whichever node the xenbus watch handler is on. +On a multi-queue device, every queue's rings end up on the same +guest node. 
Combined with vNUMA->host-node mapping that puts each +guest node on its own host node, this funnels all of dom0's +per-queue backend kthreads onto a single host node and defeats the +multi-queue parallelism the dom0 backend is otherwise prepared to +deliver. + +Pick a per-queue node with xenbus_node_for_queue(queue->id) and pass +it to xenbus_setup_ring_node() for both tx and rx rings. Same-ring +locality is preserved by construction (one buddy allocation comes +from one node); different rings of different queues now land on +different nodes on multi-vnode guests. Single-vnode guests +degenerate to node 0 for every queue, identical to the previous +behaviour. + +Thread the same node through setup_netfront_split() and +setup_netfront_single() so the per-queue evtchn IRQs are bound with +bind_evtchn_to_irqhandler_lateeoi_on_node(). The underlying desc is +then allocated with the right node attribute and irqbalance treats +each IRQ as NUMA-local rather than floating. + +Apply irq_set_affinity_and_hint() to each queue's tx/rx IRQ at +connect time using the same per-queue node. NAPI runs in softirq on +the CPU that took the IRQ; landing IRQ + NAPI + ring on one node +keeps the receive path NUMA-local. Sets both actual affinity and +hint so behaviour is correct on guests without irqbalance; operator +writes to /proc/irq/N/smp_affinity still win. + +Install an XPS map for each queue mapping the node's cpumask to that +queue's index. Without this, __netdev_pick_tx falls back to +hash-based queue selection and a sender on any CPU can land on any +queue regardless of where its data wants to live; the ring-placement +work above is then wasted because the actual sender-to-queue +pairing is random. With XPS in place, a sender on a CPU in node N +selects queue N, whose rings are on node N, whose dom0 backend +kthread is on the host node hosting those rings: an end-to-end +NUMA-local TX path with no cross-node payload movement up to the +hypervisor boundary. 
+ +In xennet_disconnect_backend(), clear the hint with +irq_update_affinity_hint(irq, NULL) before each unbind_from_irqhandler +in both the shared- and split-evtchn paths. free_irq() warns if the +hint is still set at teardown (kernel/irq/manage.c:1865); reconnects +would otherwise WARN. + +Operator writes to /sys/class/net/DEV/queues/tx-N/xps_cpus still win on +subsequent writes -- this only provides a sensible default. XPS is +also conditional: CONFIG_XPS off makes netif_set_xps_queue a stub, +and an empty cpumask (memory-only NUMA node, possible if unusual on +guests) skips the install to avoid programming an effectively-unusable +map. On reconnect, the new setup_netfront calls overwrite the XPS +map with fresh values; the netdev-scoped map is freed when the +netdev unregisters. + +Signed-off-by: Steven Noonan +--- + drivers/net/xen-netfront.c | 107 ++++++++++++++++++++++++++++++------- + 1 file changed, 88 insertions(+), 19 deletions(-) + +diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c +index a11a0e949400..7c18db3de8d4 100644 +--- a/drivers/net/xen-netfront.c ++++ b/drivers/net/xen-netfront.c +@@ -31,10 +31,12 @@ + + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + ++#include + #include + #include + #include + #include ++#include + #include + #include + #include +@@ -1824,9 +1826,17 @@ static void xennet_disconnect_backend(struct netfront_info *info) + + timer_delete_sync(&queue->rx_refill_timer); + +- if (queue->tx_irq && (queue->tx_irq == queue->rx_irq)) ++ /* ++ * free_irq() warns if affinity_hint is still set. Drop the ++ * hint installed at connect time before tearing the IRQ down. 
++ */ ++ if (queue->tx_irq && (queue->tx_irq == queue->rx_irq)) { ++ irq_update_affinity_hint(queue->tx_irq, NULL); + unbind_from_irqhandler(queue->tx_irq, queue); ++ } + if (queue->tx_irq && (queue->tx_irq != queue->rx_irq)) { ++ irq_update_affinity_hint(queue->tx_irq, NULL); ++ irq_update_affinity_hint(queue->rx_irq, NULL); + unbind_from_irqhandler(queue->tx_irq, queue); + unbind_from_irqhandler(queue->rx_irq, queue); + } +@@ -1902,7 +1912,7 @@ static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]) + return 0; + } + +-static int setup_netfront_single(struct netfront_queue *queue) ++static int setup_netfront_single(struct netfront_queue *queue, int node) + { + int err; + +@@ -1910,10 +1920,10 @@ static int setup_netfront_single(struct netfront_queue *queue) + if (err < 0) + goto fail; + +- err = bind_evtchn_to_irqhandler_lateeoi(queue->tx_evtchn, +- xennet_interrupt, 0, +- queue->info->netdev->name, +- queue); ++ err = bind_evtchn_to_irqhandler_lateeoi_on_node(queue->tx_evtchn, ++ xennet_interrupt, 0, ++ queue->info->netdev->name, ++ queue, node); + if (err < 0) + goto bind_fail; + queue->rx_evtchn = queue->tx_evtchn; +@@ -1928,7 +1938,7 @@ static int setup_netfront_single(struct netfront_queue *queue) + return err; + } + +-static int setup_netfront_split(struct netfront_queue *queue) ++static int setup_netfront_split(struct netfront_queue *queue, int node) + { + int err; + +@@ -1941,18 +1951,20 @@ static int setup_netfront_split(struct netfront_queue *queue) + + snprintf(queue->tx_irq_name, sizeof(queue->tx_irq_name), + "%s-tx", queue->name); +- err = bind_evtchn_to_irqhandler_lateeoi(queue->tx_evtchn, +- xennet_tx_interrupt, 0, +- queue->tx_irq_name, queue); ++ err = bind_evtchn_to_irqhandler_lateeoi_on_node(queue->tx_evtchn, ++ xennet_tx_interrupt, 0, ++ queue->tx_irq_name, ++ queue, node); + if (err < 0) + goto bind_tx_fail; + queue->tx_irq = err; + + snprintf(queue->rx_irq_name, sizeof(queue->rx_irq_name), + "%s-rx", queue->name); +- err = 
bind_evtchn_to_irqhandler_lateeoi(queue->rx_evtchn, +- xennet_rx_interrupt, 0, +- queue->rx_irq_name, queue); ++ err = bind_evtchn_to_irqhandler_lateeoi_on_node(queue->rx_evtchn, ++ xennet_rx_interrupt, 0, ++ queue->rx_irq_name, ++ queue, node); + if (err < 0) + goto bind_rx_fail; + queue->rx_irq = err; +@@ -1977,6 +1989,7 @@ static int setup_netfront(struct xenbus_device *dev, + { + struct xen_netif_tx_sring *txs; + struct xen_netif_rx_sring *rxs; ++ int node; + int err; + + queue->tx_ring_ref = INVALID_GRANT_REF; +@@ -1984,32 +1997,88 @@ static int setup_netfront(struct xenbus_device *dev, + queue->rx.sring = NULL; + queue->tx.sring = NULL; + +- err = xenbus_setup_ring(dev, GFP_NOIO | __GFP_HIGH, (void **)&txs, +- 1, &queue->tx_ring_ref); ++ /* ++ * Distribute queues across guest NUMA nodes by rotating over ++ * nodes-with-CPUs. On a single-vnode guest every queue lands ++ * on node 0 and behaviour matches the legacy default. On a ++ * multi-vnode guest, queues spread across nodes and pair up ++ * naturally with the dom0 backend's per-queue node-affinity ++ * placement. 
++ */ ++ node = xenbus_node_for_queue(queue->id); ++ ++ err = xenbus_setup_ring_node(dev, GFP_NOIO | __GFP_HIGH, node, ++ (void **)&txs, 1, &queue->tx_ring_ref); + if (err) + goto fail; + + XEN_FRONT_RING_INIT(&queue->tx, txs, XEN_PAGE_SIZE); + +- err = xenbus_setup_ring(dev, GFP_NOIO | __GFP_HIGH, (void **)&rxs, +- 1, &queue->rx_ring_ref); ++ err = xenbus_setup_ring_node(dev, GFP_NOIO | __GFP_HIGH, node, ++ (void **)&rxs, 1, &queue->rx_ring_ref); + if (err) + goto fail; + + XEN_FRONT_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE); + + if (feature_split_evtchn) +- err = setup_netfront_split(queue); ++ err = setup_netfront_split(queue, node); + /* setup single event channel if + * a) feature-split-event-channels == 0 + * b) feature-split-event-channels == 1 but failed to setup + */ + if (!feature_split_evtchn || err) +- err = setup_netfront_single(queue); ++ err = setup_netfront_single(queue, node); + + if (err) + goto fail; + ++ /* ++ * Route each per-queue evtchn IRQ toward the same node the ring ++ * lives on. NAPI runs in softirq on the CPU that took the IRQ; ++ * landing IRQ + NAPI + ring on one node keeps the receive path ++ * NUMA-local. Sets both actual affinity and hint so behaviour ++ * is correct on guests without irqbalance. Operator writes to ++ * /proc/irq/N/smp_affinity continue to win. ++ */ ++ if (node != NUMA_NO_NODE) { ++ const struct cpumask *mask = cpumask_of_node(node); ++ ++ if (!cpumask_empty(mask)) { ++ irq_set_affinity_and_hint(queue->tx_irq, mask); ++ if (queue->rx_irq != queue->tx_irq) ++ irq_set_affinity_and_hint(queue->rx_irq, mask); ++ } ++ } ++ ++ /* ++ * Steer senders toward this queue based on the same node the ++ * rings live on. __netdev_pick_tx consults the XPS map first; ++ * a sender on a CPU in `node` will pick `queue->id`, whose ++ * rings are on `node`, and dom0's matching backend kthread is ++ * pinned to the host node that hosts those rings. 
Without ++ * this, the kernel's default hash-based queue selection lets a ++ * sender on any node land on any queue, defeating the per-queue ++ * NUMA-locality story. An operator write to xps_cpus ++ * overrides on subsequent writes. ++ * ++ * netif_set_xps_queue silently returns 0 if CONFIG_XPS is off; ++ * an empty cpumask (e.g. memory-only NUMA node) is skipped to ++ * avoid programming an effectively-unusable map. ++ */ ++ if (node != NUMA_NO_NODE) { ++ const struct cpumask *mask = cpumask_of_node(node); ++ ++ if (!cpumask_empty(mask)) { ++ int xps_err = netif_set_xps_queue(queue->info->netdev, ++ mask, queue->id); ++ if (xps_err) ++ netdev_warn(queue->info->netdev, ++ "XPS setup failed for queue %u: %d\n", ++ queue->id, xps_err); ++ } ++ } ++ + return 0; + + fail: +-- +2.54.0 + diff --git a/patches/0009-xen-blkfront-place-per-ring-buffers-on-per-hctx-node.patch b/patches/0009-xen-blkfront-place-per-ring-buffers-on-per-hctx-node.patch new file mode 100644 index 0000000..59637c1 --- /dev/null +++ b/patches/0009-xen-blkfront-place-per-ring-buffers-on-per-hctx-node.patch @@ -0,0 +1,158 @@ +From 9f2eced2a6cb89bf0f8fcfc4889389880b595e39 Mon Sep 17 00:00:00 2001 +From: Steven Noonan +Date: Tue, 19 May 2026 19:25:21 -0700 +Subject: [PATCH 9/9] xen-blkfront: place per-ring buffers on per-hctx nodes + +Today every blkfront ring (one per hctx in multi-queue mode) is +allocated from xenbus_setup_ring() with no node preference, so all +of a multi-queue blkfront's rings end up on whichever node the +xenbus watch handler runs on. Combined with vNUMA->host-node +mapping, this funnels every dom0 backend xenblkd kthread onto a +single host node and defeats multi-queue parallelism. + +Pick a per-ring node with xenbus_node_for_queue(ring_idx) and pass it +to xenbus_setup_ring_node(). Different rings now land on different +guest nodes on multi-vnode guests; same-ring locality is preserved +by the underlying buddy allocation. 
+ +The ring's index is recovered from its byte offset within +info->rinfo because struct blkfront_ring_info has a flex array +trailer and sizeof() is not the stride; the same arithmetic appears +in get_rinfo() and for_each_rinfo(). + +Bind the ring's evtchn IRQ on the ring's node using +bind_evtchn_to_irqhandler_lateeoi_on_node(). The desc is then +allocated with the right node attribute and irqbalance treats the +IRQ as NUMA-local. + +Apply irq_set_affinity_and_hint() to the per-ring IRQ using the same +node. The IRQ fires on a CPU in the ring's node; blk-mq routes the +completion onward to the original submitter via its remote-completion +machinery, so the wake-up edge stays on the right node before that +handoff. Sets both actual affinity and hint; operator writes to +/proc/irq/N/smp_affinity continue to win. + +No XPS-equivalent steering is needed on the block side. blk-mq's +existing hctx-to-CPU map already routes submissions: a process +running on CPU C submits via the hctx blk-mq mapped to C, and our +ring node for that hctx is derived from the same node-rotation +helper. The submitter is on the same node as the ring by +construction. + +In blkif_free_ring(), clear the hint with +irq_update_affinity_hint(rinfo->irq, NULL) before +unbind_from_irqhandler(). free_irq() warns if the hint is still set +at teardown (kernel/irq/manage.c:1865); reconnects would otherwise +WARN. + +Single-vnode guests are unaffected: every ring's node resolves to +node 0 and behaviour matches the previous default. + +Signed-off-by: Steven Noonan +--- + drivers/block/xen-blkfront.c | 53 ++++++++++++++++++++++++++++++++---- + 1 file changed, 48 insertions(+), 5 deletions(-) + +diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c +index 04fc6b552c04..8414dc737157 100644 +--- a/drivers/block/xen-blkfront.c ++++ b/drivers/block/xen-blkfront.c +@@ -35,12 +35,14 @@ + * IN THE SOFTWARE. 
+ */ + ++#include + #include + #include + #include + #include + #include + #include ++#include + #include + #include + #include +@@ -1293,8 +1295,14 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) + xenbus_teardown_ring((void **)&rinfo->ring.sring, info->nr_ring_pages, + rinfo->ring_ref); + +- if (rinfo->irq) ++ if (rinfo->irq) { ++ /* ++ * free_irq() warns if affinity_hint is still set. Drop the ++ * hint installed at setup time before tearing the IRQ down. ++ */ ++ irq_update_affinity_hint(rinfo->irq, NULL); + unbind_from_irqhandler(rinfo->irq, rinfo); ++ } + rinfo->evtchn = rinfo->irq = 0; + } + +@@ -1684,9 +1692,29 @@ static int setup_blkring(struct xenbus_device *dev, + int err; + struct blkfront_info *info = rinfo->dev_info; + unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; ++ unsigned int ring_idx; ++ int node; ++ ++ /* ++ * Recover the ring index from its slot in info->rinfo. The same ++ * arithmetic is used by get_rinfo() and for_each_rinfo() (the ++ * struct has a flex array so sizeof() is not the stride). ++ */ ++ ring_idx = ((unsigned long)rinfo - (unsigned long)info->rinfo) / ++ info->rinfo_size; ++ ++ /* ++ * Distribute rings across guest NUMA nodes by rotating over ++ * nodes-with-CPUs. blk-mq's default hctx-to-CPU map is also ++ * NUMA-balanced (blk_mq_map_queues uses NUMA-aware distribution ++ * when topology is present), so a submitter on a node-N CPU ++ * lands on the hctx whose ring is on node N. No XPS-equivalent ++ * steering needed on the block side -- blk-mq already does it. 
++ */ ++ node = xenbus_node_for_queue(ring_idx); + +- err = xenbus_setup_ring(dev, GFP_NOIO, (void **)&sring, +- info->nr_ring_pages, rinfo->ring_ref); ++ err = xenbus_setup_ring_node(dev, GFP_NOIO, node, (void **)&sring, ++ info->nr_ring_pages, rinfo->ring_ref); + if (err) + goto fail; + +@@ -1696,8 +1724,9 @@ static int setup_blkring(struct xenbus_device *dev, + if (err) + goto fail; + +- err = bind_evtchn_to_irqhandler_lateeoi(rinfo->evtchn, blkif_interrupt, +- 0, "blkif", rinfo); ++ err = bind_evtchn_to_irqhandler_lateeoi_on_node(rinfo->evtchn, ++ blkif_interrupt, 0, ++ "blkif", rinfo, node); + if (err <= 0) { + xenbus_dev_fatal(dev, err, + "bind_evtchn_to_irqhandler failed"); +@@ -1705,6 +1734,20 @@ static int setup_blkring(struct xenbus_device *dev, + } + rinfo->irq = err; + ++ /* ++ * Route the ring's evtchn IRQ toward the same node the ring ++ * lives on. blk-mq completes requests on the submitting CPU ++ * via the request_irq path; firing the IRQ on the ring's node ++ * keeps the wake-up on the right node before blk-mq routes the ++ * completion onward. Sets both actual affinity and hint. ++ */ ++ if (node != NUMA_NO_NODE) { ++ const struct cpumask *mask = cpumask_of_node(node); ++ ++ if (!cpumask_empty(mask)) ++ irq_set_affinity_and_hint(rinfo->irq, mask); ++ } ++ + return 0; + fail: + blkif_free(info, 0); +-- +2.54.0 +