diff --git a/.gitignore b/.gitignore index 261203bea1..1c485b5379 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,4 @@ z_local_saved/ tags # Generated spell check config -.spellcheck-non-draft.yml +.spellcheck-non-draft.yml \ No newline at end of file diff --git a/assets/contributors.csv b/assets/contributors.csv index 6c417f27e0..59299b6a24 100644 --- a/assets/contributors.csv +++ b/assets/contributors.csv @@ -66,7 +66,7 @@ Daniel Nguyen,,,,, Joe Stech,Arm,JoeStech,joestech,, visualSilicon,,,,, Konstantinos Margaritis,VectorCamp,,,, -Kieran Hejmadi,,,,, +Kieran Hejmadi,Arm,kieranhejmadi01,kieran-hejmadi-88920815b,, Alex Su,,,,, Chaodong Gong,,,,, Owen Wu,Arm,,,, diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/_index.md b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/_index.md new file mode 100644 index 0000000000..7d69935ca7 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/_index.md @@ -0,0 +1,66 @@ +--- +title: Get started with memory access analysis with Arm Performix and the Arm MCP Server + +description: Learn how to profile memory access behavior in a C++ particle simulation on Arm Linux using the Arm Performix Memory Access recipe through the Arm MCP Server. + +minutes_to_complete: 45 + +who_is_this_for: This is an introductory topic for C++ developers who want to use Arm Performix and the Arm MCP Server to diagnose cache and translation behavior in applications running on Arm Neoverse systems. + +learning_objectives: + - Explain how L1 cache hits, TLB misses, and page walks affect C++ application runtime. + - Build and visualize the orbiting galaxies example on an Arm Linux target. + - Inspect and optimize particle data structure using insights from the memory access recipe + - Use the Arm MCP Server in combination with performix for an agentic solution. + +prerequisites: + - Access to an Arm Neoverse-based Linux metal instance. 
+ - Basic understanding of memory hierarchy within a CPU + - Basic C++ development experience. + - Familiarity with the Linux command line. + +author: Kieran Hejmadi + +### Tags +skilllevels: Introductory +subjects: Performance and Architecture +armips: + - Neoverse +tools_software_languages: + - Arm Performix + - MCP + - C++ + - CMake + - Python + - Linux perf +operatingsystems: + - Linux + +further_reading: + - resource: + title: Identify code hotspots using Arm Performix through the Arm MCP Server + link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/performix-mcp-agent/ + type: learning-path + - resource: + title: Find Code Hotspots with Arm Performix + link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/cpu_hotspot_performix/ + type: learning-path + - resource: + title: Optimize application performance using Arm Performix CPU microarchitecture analysis + link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/performix-microarchitecture/ + type: learning-path + - resource: + title: Automate x86-to-Arm application migration using Arm MCP Server + link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/arm-mcp-server/ + type: learning-path + - resource: + title: Arm Performix + link: https://developer.arm.com/servers-and-cloud-computing/arm-performix + type: website + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. 
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/_next-steps.md new file mode 100644 index 0000000000..727b395ddd --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # The weight controls the order of the pages. _index.md always has weight 1. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/agent_screen_shot.png b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/agent_screen_shot.png new file mode 100644 index 0000000000..f3fb21f93a Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/agent_screen_shot.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/codex_prompt.png b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/codex_prompt.png new file mode 100644 index 0000000000..b7703c89ee Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/codex_prompt.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/data_layout_comparison_compressed.gif b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/data_layout_comparison_compressed.gif new file mode 100644 index 0000000000..d97b00a3e4 Binary files /dev/null and 
b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/data_layout_comparison_compressed.gif differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/galaxy_compressed.gif b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/galaxy_compressed.gif new file mode 100644 index 0000000000..c62f07b6f5 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/galaxy_compressed.gif differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/how-to-0.md b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/how-to-0.md new file mode 100644 index 0000000000..84ca0e7b19 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/how-to-0.md @@ -0,0 +1,72 @@ +--- +title: Background +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Review of the CPU memory hierarchy + +This Learning Path assumes you already understand memory hierarchy fundamentals. It is a recap, not an exhaustive explanation, and focuses on concepts used in the worked example. + +Modern Arm server CPUs use a hierarchy of memories to reduce the cost of loading and storing data. The fastest storage sits close to each CPU core, while larger memories sit farther away and take more cycles to access. + +You typically see: + +- L1 data cache (`L1d`) and L1 instruction cache (`L1i`) close to each core with each access usually taking up to 10 cycles. +- L2 cache, often private to each core, with each access usually taking 10-20 cycles. +- Last-level cache, often shared across multiple cores, and usually taking 20+ cycles. +- DRAM, which is much larger but much slower than on-chip cache. + +You can inspect cache topology on a Linux system with Arm's [sysreport](https://learn.arm.com/learning-paths/servers-and-cloud-computing/sysreport/) tool or the `lscpu` command. 
Unlike `lscpu`, Sysreport also reports the set associativity for each cache level. For example, you can run the following command on a system with `git` and `python` installed: + +```output +git clone https://github.com/ArmDeveloperEcosystem/sysreport.git +python3 src/sysreport.py | grep -i cache -A 4 + + cache info: size, associativity, sharing + cache line size: 64 + Caches: + 64 x L1D 64K 4-way 64b-line + 64 x L1I 64K 4-way 64b-line + 64 x L2U 1M 8-way 64b-line + 1 x L3U 32M 16-way 64b-line +``` + +For a more visual view, install `hwloc` and generate a topology image: + +```bash +sudo apt update +sudo apt install -y hwloc +hwloc-ls --of png > topology.png +``` + +![Hardware locality topology for an Arm server showing per-core L1 and L2 caches and a shared L3 cache across all cores, which helps you verify cache hierarchy before profiling.#center](./topology.png "Example hardware locality topology") + +The graphic above illustrates cache tiers on an AWS Graviton3 metal instance based on Neoverse V1. Each of the 64 cores has private `L1d`, `L1i`, and `L2` caches, and all cores share one `L3` cache, sometimes referred to as last-level cache (LLC). Cache sizes, especially at later levels, are not fixed by the Neoverse architecture; implementers such as AWS or Google can configure larger or smaller caches based on design goals. + +NUMA, or non-uniform memory access, means memory latency can depend on which processor or socket owns the memory being accessed. On this AWS Graviton3 instance, there is only one NUMA node. + +If you would like a comprehensive system-level understanding of the memory subsystem, review our learning path on the [Arm system characterisation tool](https://learn.arm.com/learning-paths/servers-and-cloud-computing/memory-subsystem/). + +## Definition of terms used in this learning path + +Applications use virtual addresses, which are the addresses a program sees instead of physical DRAM locations. 
Virtual addressing lets the operating system isolate processes, protect memory, and map each program's address space to available physical memory. The processor translates virtual addresses to physical addresses before it accesses memory. + +### Translation lookaside buffer (TLB) + +The translation lookaside buffer (TLB) caches recent virtual-to-physical translations at page granularity to avoid page table walks. A TLB miss occurs when the needed translation is not cached, so the processor performs a page table walk to find the mapping. Page walks add latency before a load or store can complete. Large working sets and irregular access patterns, such as strides larger than the typical 4KB page size, can increase TLB pressure because the program touches many pages with little reuse. + +### Page faults + +A minor page fault is usually harmless: the data is already in RAM, and the kernel only creates the mapping. This commonly happens during anonymous paging when Linux lazily backs newly allocated heap or stack memory on first touch. A major page fault is more expensive because the kernel must fetch the page from disk, such as from a file or swap, so repeated major faults are usually a real performance concern. + +### Working set size + +The working set is the data your program actively touches during a period of execution. It differs from resident set size (RSS), which is the amount of physical memory currently resident for a process. A process can have a large RSS while the hot loop actively uses only a smaller working set. + +### Memory access from a programmer's perspective + +From a programmer's perspective, much of the cache and memory subsystem is a black box defined by processor architecture and implementation. Features such as cache associativity, prefetching, and translation caching are designed to hide latency across many workloads. Your main software levers are data structure layout, allocation patterns, and choices such as page size. 
The layout of your C++ data structures can determine whether the memory hierarchy helps or hurts runtime. The compiler generally cannot reorder structure fields or split objects automatically because that would change program semantics. + diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/how-to-1.md b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/how-to-1.md new file mode 100644 index 0000000000..23f58b6ab1 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/how-to-1.md @@ -0,0 +1,78 @@ +--- +title: Build Example +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Install required packages and clone the repository + +Use your remote Arm Linux target for all build and run steps. This example uses an AWS `c7g.metal` instance running `Ubuntu 24.04 LTS`. + +## Install Arm Performix + +Install and configure Arm Performix using the [install guide](https://learn.arm.com/install-guides/performix/) on both your local machine and the remote Neoverse-based system. + +## Install the required system packages + +Run the following command, replacing `apt` with the package manager for your Linux distribution. + +```bash +sudo apt update +sudo apt install -y git cmake build-essential python3 python3-venv python3-pip +``` +{{% notice Please Note %}} + +If you are running on an **AWS Ubuntu 24.04 LTS image**, you also need to enable SPE with the following commands. If you are running on another platform, see the [enable SPE learning path](https://github.com/ArmDeveloperEcosystem/arm-learning-paths/pull/3186). 
+ +```bash +sudo apt install -y linux-modules-extra-$(uname -r) +sudo modprobe arm_spe_pmu +``` +{{% /notice %}} + +Clone the example: + +```bash +git clone https://github.com/arm-education/Orbiting-Galaxy-Example.git +cd Orbiting-Galaxy-Example +git checkout -b my-work v1.0.3 +``` + +## Build with CMake + +```bash +mkdir -p build +cd build +cmake .. +cmake --build . --parallel +``` + +This produces the workload binaries in `build/`. + +## Set up a Python virtual environment and run visualization + +From the repository root: + +```bash +cd .. +python3 -m venv venv +source venv/bin/activate +pip install --upgrade pip +pip install -r scripts/requirements.txt +``` + +Generate simulation frames and create the GIF: + +```bash +cd build +./baseline --visualize +python3 ../scripts/visualize.py galaxy_baseline.bin +``` + +The script reads simulation data from `galaxy_baseline.bin` and writes a GIF into `assets/`. + +![Animated orbiting galaxy simulation generated by the baseline workload, showing particle motion over time so you can verify that the simulation output looks correct before profiling.#center](galaxy_compressed.gif "Orbiting galaxies workload visualization") + +Use `--visualize` only for understanding the workload behavior. Do not include visualization mode in profiling runs because file I/O alters the measured runtime characteristics. diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/how-to-2.md b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/how-to-2.md new file mode 100644 index 0000000000..5519c5eb88 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/how-to-2.md @@ -0,0 +1,101 @@ +--- +title: Inspect with Performix +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Inspect the particle data structure + +Start by inspecting the baseline particle model in `src/baseline/particle.hpp`. 
+ +{{% notice Tip %}} + +If you are using an IDE or editor with an LLM-based coding assistant, the `AGENT.md` file can improve your learning experience. This file provides repository context and helps guide the agent to give more useful assistance. + +![Screenshot showing the AGENT.md file in the repository, highlighting the context file your coding assistant uses to provide more relevant guidance during this task.#center](./agent_screen_shot.png "Screenshot of GitHub Copilot in VSCode using AGENTS.md as a system prompt to act as a learning assistant.") + +{{% /notice %}} + +The baseline implementation stores every property for one particle in a single structure: + +```cpp +struct Particle { + float x, y, z; // position (12 bytes) + float vx, vy, vz; // velocity (12 bytes) + float mass, charge, temperature; // properties (12 bytes) + float pressure, energy, density; // (12 bytes) + float spin_x, spin_y, spin_z; // (12 bytes) + float pad; // padding (4 bytes) +}; +``` + +The ownership container in the same file is: + +```cpp +class ParticleOwner { + // Stores particle references used by the simulation. + std::vector<Particle*> particles_; +}; +``` + +The update loop in `src/baseline/baseline.cpp` repeatedly updates particle positions: + +```cpp +for (int iter = 0; iter < iters; ++iter) { + update_positions(particles.data(), NUM_PARTICLES, dt); +} +``` + +This baseline design can create avoidable memory overhead: + +- `ParticleOwner` stores pointers to separately allocated `Particle` objects, so the hot loop must follow an extra level of indirection. +- Each `Particle` is 64 bytes, but the position update only uses `x`, `y`, `z`, `vx`, `vy`, and `vz`. +- Loading whole particle objects can waste cache capacity and memory bandwidth when the loop only needs a subset of fields. + +Before you optimize anything, profile and measure. + +## Run the Performix Memory Access Recipe + +Open the Performix GUI on your local machine and select the **Memory Access** recipe. 
+ +Configure the recipe to launch the baseline workload on your remote Arm target: + +- Select the configured remote target. +- Set **Workload type** to **Launch a new process**. +- Set **Workload** to the baseline executable: + +```output +/baseline +``` + +Keep the default profiling duration so Performix records until the workload exits. + +![Performix Memory Access recipe setup showing the selected remote Arm target and the workload path field populated with the baseline binary, which confirms the run configuration before profiling starts.#center](./setup.png "Configure the Performix Memory Access recipe") + +Start the recipe and wait for the results to load. + +## Assess performance + +![Performix Memory Access results for the baseline binary showing update_positions with about 66 percent L1C load hits and around 26-cycle average L1C latency, indicating weak cache locality in the hot path.#center](./performix_before_optimizations.png "Baseline memory access results before optimization") + +Look at the memory access results for the baseline binary. Most samples are associated with the `update_positions()` function. The `L1C % Loads` value shows that only about two thirds of loads hit in L1 cache, and the average L1 cache load latency is about 26 cycles. A cache-friendly hot loop should have a much higher L1 hit rate and lower average latency. + +To investigate further, check the TLB walk data. As described in the background section, the TLB caches virtual-to-physical address translations. As shown in the image below, the `TLB Walk Breakdown` tab reports no significant TLB walks. That means address translation is not the main issue. 
+ +![Performix Memory Access results show 0% TLB walks across all functions in the baseline binary, indicating that TLB pressure and costly address translation misses are not contributing to the performance issue.#center](./no_tlb_walks.png "TLB walk results showing 0 page table walks for all functions in baseline implementation") + +In summary: + +- Average load latency is about 26 cycles, indicating frequent accesses beyond L1 cache. +- SPE samples are concentrated in `update_positions()`, confirming this loop dominates execution. +- TLB misses are not significant, so page walks are not the source of the slowdown. + +Double-click the `update_positions()` row to open the source code view. The source view shows that the samples concentrate on the per-particle position updates. + +![Performix source code view for update_positions showing sample concentration on the x, y, and z update statements, helping you confirm that this loop is the main optimization target.#center](./source_code.png "Baseline source-level samples in update_positions") + +The majority of samples are associated with accessing the `Particle` data structure, and about one third of loads fall back to L2 cache. To improve the execution time of this example, focus on finding more efficient ways, if any, of accessing the `Particle` member variables. For example, there may be an alternative data structure that has better cache utilization. + +In the next section, you use this evidence to guide optimization. 
diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/how-to-3.md b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/how-to-3.md new file mode 100644 index 0000000000..515e1f7ece --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/how-to-3.md @@ -0,0 +1,165 @@ +--- +title: Optimize with MCP +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Optimize manually + +The `src/users_solution/` directory is an editable copy of `src/baseline`. Using the data collected from Performix, refactor the `Particle` data structure and associated function signatures and call sites to improve the L1 cache hit rate. The baseline result showed that `update_positions()` dominated the samples, had a low L1 cache hit rate, and did not show significant TLB walks. + +{{% notice Hint %}} + +Consider how the `Particle` data structure maps to a 64-byte cache line. Also consider which member variables in the `Particle` struct are used in the hot loop. + +{{% /notice %}} + +Once you make changes in `src/users_solution/`, rebuild the binary with the following commands. + +```bash +cd +make clean +cmake --build . --parallel +``` +Use the Performix GUI to assess performance changes for the `/users_solution` binary. A reference solution is available in `src/optimized`. + +## (Optional) Optimize with an AI agent and the Arm MCP server + +If you have access to a code assistant such as Kiro, Gemini, Codex, or GitHub Copilot, you can also use the Arm Model Context Protocol (MCP) server. The MCP server includes direct tool support to invoke Performix on a remote target. It integrates with MCP-compatible coding assistants and can provide performance insights to create a useful feedback loop. 
The code samples below show how to connect to OpenAI Codex; for other tools, see [your preferred coding assistant](https://learn.arm.com/learning-paths/servers-and-cloud-computing/arm-mcp-server/1-overview/). + +{{% notice Please Note %}} + +You need an OpenAI account to use the Codex CLI. + +{{% /notice %}} + +[Install Docker](https://learn.arm.com/install-guides/docker/) and pull the MCP server image. + +```bash +docker pull armlimited/arm-mcp:latest +``` + +To ensure the MCP server can invoke `performix` on remote machines, pass optional Docker arguments for your SSH private key and known hosts file. For example, use this TOML format for the Codex CLI by adding the following to `~/.codex/config.toml`: + +```output +[mcp_servers.arm-mcp] +command = "docker" +args = [ + "run", + "--rm", + "-i", + "-v", "/path/to/your/workspace:/workspace", + "-v", "/path/to/your/ssh/private_key:/run/keys/ssh-key.pem:ro", + "-v", "/path/to/your/ssh/known_hosts:/run/keys/known_hosts:ro", + "armlimited/arm-mcp" +] +``` + +Restart Codex and ask your coding assistant to run the `memory access` recipe, interpret the results, and inspect the relevant source code. Your prompt can include the remote target, workload binary, and source directory: + +![Codex prompt requesting the Arm MCP server to run memory access and code hotspot recipes on the remote baseline workload, showing how to pass target, binary path, and source directory details.#center](./codex_prompt.png "Prompting Codex to analyze the baseline workload with Arm MCP") + +Alternatively, you can use the curated [arm-full-optimization.md](https://github.com/arm/mcp/blob/main/agent-integrations/codex/arm-full-optimization.md) prompt file. + +## Review the optimized solution + +A reference solution is available in the `src/optimized` directory. The baseline stores a vector of `Particle*` values, where each `Particle` is allocated separately and contains all particle fields in one 64-byte structure. 
The hot loop only needs `x`, `y`, `z`, `vx`, `vy`, and `vz`, but the baseline layout still steps through whole particle objects and performs unnecessary pointer chasing. + +The optimized version changes the layout to a Struct of Arrays (SoA). Each field is stored in its own contiguous `std::vector`: + +```cpp +struct ParticlesSoA { + std::vector<float> x, y, z; + std::vector<float> vx, vy, vz; + std::vector<float> mass, charge, temperature; + std::vector<float> pressure, energy, density; + std::vector<float> spin_x, spin_y, spin_z; +}; +``` + +The `update_positions()` function then walks the hot position and velocity arrays directly: + +```cpp +void update_positions(ParticlesSoA& p, int n, float dt) { + for (int i = 0; i < n; ++i) { + p.x[i] += p.vx[i] * dt; + p.y[i] += p.vy[i] * dt; + p.z[i] += p.vz[i] * dt; + } +} +``` + +This removes `Particle*` indirection and improves cache-line utilization because the hot loop streams through only the data it uses. + +As the graphic below shows, the baseline implementation is on the right. Even though each particle is padded to a 64-byte cache line, many struct members are not read or written in the hot loop, so they remain cold. With a structure-of-arrays layout, all particles are still owned together, but cache lines contain more of the data that the loop actually touches. + +![Animation comparing baseline and structure-of-arrays layouts, showing how the optimized layout packs hot fields together so cache lines carry useful data for position updates.#center](./data_layout_comparison_compressed.gif) + +## Confirm with Performix + +After optimization, rerun the Performix Memory Access recipe on the optimized binary. In the Performix GUI, rerun the recipe and change the binary path from `/baseline` to `/optimized`. 
+ +![Performix Memory Access results for the optimized binary showing 100 percent L1C load hits for the selected function and lower average L1C latency, confirming improved memory locality after the data layout change.#center](./performix_after_optimization.png "Memory access results after the Struct of Arrays optimization") + +The optimized result shows much stronger L1 cache behavior. The hot update path now has `100%` L1C loads in the captured result and a lower average L1C latency than the baseline. This confirms that the data layout change improved locality, not just wall-clock time. + +## Measure wall time and memory usage + +Run the binaries directly on the remote machine without Performix to compare both wall time and memory usage: + +```bash +/usr/bin/time -v /baseline +/usr/bin/time -v /optimized +``` + +The hot loop is also instrumented with `scopedTimer`, so you can directly observe the speedup from the change. + +```output +Baseline took 571 milliseconds + Command being timed: "./build/baseline" + User time (seconds): 0.66 + System time (seconds): 0.02 + Percent of CPU this job got: 99% + Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.69 + Average shared text size (kbytes): 0 + Average unshared data size (kbytes): 0 + Average stack size (kbytes): 0 + Average total size (kbytes): 0 + Maximum resident set size (kbytes): 92720 + Average resident set size (kbytes): 0 + Major (requiring I/O) page faults: 0 + Minor (reclaiming a frame) page faults: 22655 +... 
+Optimized took 279 milliseconds + Command being timed: "./build/optimized" + User time (seconds): 0.35 + System time (seconds): 0.02 + Percent of CPU this job got: 100% + Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.37 + Average shared text size (kbytes): 0 + Average unshared data size (kbytes): 0 + Average stack size (kbytes): 0 + Average total size (kbytes): 0 + Maximum resident set size (kbytes): 64044 + Average resident set size (kbytes): 0 + Major (requiring I/O) page faults: 0 + Minor (reclaiming a frame) page faults: 15500 +``` + + +| Metric | Baseline | Optimized | Explanation | +|-----------------------|--------------|--------------|---------------------------------------------------------------------------------------------| +| Wall time (ms) | 571 | 279 | The optimized layout improves cache usage and removes pointer chasing, roughly halving runtime. | +| Max RSS (KB) | 92,720 | 64,044 | Struct of Arrays reduces memory footprint by removing per-object overhead and cold fields. | +| Minor page faults | 22,655 | 15,500 | Fewer pages are touched due to more compact, contiguous storage of only needed data fields. | +| L1 cache hit rate (%) | 66.3 | 99.3 | Hot data is now accessed in a cache-friendly pattern, maximizing L1 cache effectiveness. | +| L1 avg latency (cycles)| 26.2 | 11.7 | Each L1 load takes fewer cycles because pointer chasing is removed. | + + +## Summary + +In this Learning Path, you used Performix and the Arm MCP Server to identify a memory access bottleneck in a C++ particle simulation. You connected the profile data to source code, found that the hot loop suffered from poor data layout and unnecessary pointer chasing, and improved the implementation with a Struct of Arrays layout. You then validated the change with direct wall-time measurements and a second Performix run. + +This approach combines measurement tools, code context, and focused prompts to iterate on real bottlenecks. 
diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/no_tlb_walks.png b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/no_tlb_walks.png new file mode 100644 index 0000000000..046c5cb7b8 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/no_tlb_walks.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/performix_after_optimization.png b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/performix_after_optimization.png new file mode 100644 index 0000000000..e3892b2437 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/performix_after_optimization.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/performix_before_optimizations.png b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/performix_before_optimizations.png new file mode 100644 index 0000000000..470b0ba761 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/performix_before_optimizations.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/setup.png b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/setup.png new file mode 100644 index 0000000000..c3265e2ba5 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/setup.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/source_code.png b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/source_code.png new file mode 100644 index 0000000000..769da76b8d Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/source_code.png differ diff --git 
a/content/learning-paths/servers-and-cloud-computing/performix-memory-access/topology.png b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/topology.png new file mode 100644 index 0000000000..8e20f7f003 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-memory-access/topology.png differ