diff --git a/.github/workflows/test-configs.yml b/.github/workflows/test-configs.yml index 91175ac56b..f6c6d41a30 100644 --- a/.github/workflows/test-configs.yml +++ b/.github/workflows/test-configs.yml @@ -211,6 +211,24 @@ jobs: arch: ppc config-file: ./config/examples/nxp-t2080.config + # Additional T2080 board-specific compile tests for alternate board macros. + nxp_t2080_test_naii_68ppc2: + uses: ./.github/workflows/test-build-powerpc.yml + with: + arch: ppc + config-file: ./config/examples/nxp-t2080.config + make-args: CFLAGS_EXTRA=-DBOARD_NAII_68PPC2 + + # VPX3-152 compile test: validates board-specific code paths build cleanly. + # Uses default config addresses (128MB layout); real hardware needs the + # VPX3-152 address overrides uncommented in .config. + nxp_t2080_vpx3152_test: + uses: ./.github/workflows/test-build-powerpc.yml + with: + arch: ppc + config-file: ./config/examples/nxp-t2080.config + make-args: CFLAGS_EXTRA=-DBOARD_CW_VPX3152 + nxp_lpc54s0xx_test: uses: ./.github/workflows/test-build.yml with: diff --git a/arch.mk b/arch.mk index 261ace49c4..c34880117a 100644 --- a/arch.mk +++ b/arch.mk @@ -1072,6 +1072,18 @@ ifeq ($(TARGET),nxp_t2080) CFLAGS+=$(ARCH_FLAGS) BIG_ENDIAN=1 CFLAGS+=-DMMU -DWOLFBOOT_FDT -DWOLFBOOT_DUALBOOT + # Support U-Boot legacy uImage header: strip 64-byte header before jumping + # to the OS image (e.g. uVxWorks, uImage Linux kernel). + CFLAGS+=-DWOLFBOOT_UBOOT_LEGACY + # 64-bit OS support (VxWorks 7, Linux 64-bit): transitions LAW/TLB to + # 36-bit physical addressing before jumping to the OS. Equivalent to + # CW U-Boot "ossel ostype2". Set OS_64BIT=1 in .config to enable. 
+ ifeq ($(OS_64BIT),1) + CFLAGS+=-DENABLE_OS64BIT + endif + ifneq ($(WOLFBOOT_BOOTARGS),) + CFLAGS+=-DWOLFBOOT_BOOTARGS='"$(WOLFBOOT_BOOTARGS)"' + endif CFLAGS+=-pipe # use pipes instead of temp files CFLAGS+=-feliminate-unused-debug-types LDFLAGS+=$(ARCH_FLAGS) diff --git a/config/examples/nxp-t2080.config b/config/examples/nxp-t2080.config index 6627f5b7bc..3a14e32773 100644 --- a/config/examples/nxp-t2080.config +++ b/config/examples/nxp-t2080.config @@ -1,13 +1,17 @@ # NXP T2080 wolfBoot Configuration # Default board: T2080 RDB (66.66 MHz oscillator, DDR3L SODIMM) # -# Board selection: uncomment exactly one line to override the default. -# Default (no define): T2080 RDB (66.66 MHz oscillator, DDR3L SODIMM) -# BOARD_CW_VPX3152: CW VPX3-152 (66.667 MHz oscillator, DDR3L) -# BOARD_NAII_68PPC2: NAII 68PPC2 (100 MHz oscillator, 8GB DDR3) +# Board selection: +# Default (no define): T2080 RDB (66.66 MHz oscillator, DDR3L SODIMM) +# BOARD_NAII_68PPC2: NAII 68PPC2 (100 MHz oscillator, 8 GB DDR3) +# BOARD_CW_VPX3152: CW VPX3-152 (66.667 MHz oscillator, 4 GB DDR3L) # -#CFLAGS_EXTRA+=-DBOARD_CW_VPX3152 +# For NAII 68PPC2, uncomment the line below (addresses are the same as RDB): #CFLAGS_EXTRA+=-DBOARD_NAII_68PPC2 +# +# For CW VPX3-152 (256 MB NOR flash at 0xF0000000), uncomment the BOARD +# define AND the address override block at the bottom of this file. 
+#CFLAGS_EXTRA+=-DBOARD_CW_VPX3152 ARCH=PPC TARGET=nxp_t2080 @@ -34,24 +38,24 @@ RAM_CODE?=1 DUALBANK_SWAP?=0 WOLFTPM?=0 OPTIMIZATION_LEVEL?=1 +ELF?=1 +DEBUG_ELF=0 -# NOR Base Address -# T2080 RDB: 128MB flash at 0xE8000000, wolfBoot at top (0xEFFE0000) -# CW VPX3-152: 256MB flash at 0xF0000000, wolfBoot at top (0xFFFE0000) +# ----------------------------------------------------------------------------- +# Default addresses: T2080 RDB / NAII 68PPC2 (128 MB NOR flash @ 0xE8000000) +# ----------------------------------------------------------------------------- + +# NOR Base Address: wolfBoot at top of flash ARCH_FLASH_OFFSET?=0xEFFE0000 -#ARCH_FLASH_OFFSET?=0xFFFE0000 # CW VPX3-152 # CPC SRAM address (must match L2SRAM_ADDR in nxp_ppc.h) -# CW VPX3-152: relocated to 0xEE900000 to avoid 256MB flash TLB overlap L2SRAM_ADDR?=0xF8F00000 -#L2SRAM_ADDR?=0xEE900000 # CW VPX3-152 # Flash Sector Size WOLFBOOT_SECTOR_SIZE?=0x10000 # wolfBoot start address WOLFBOOT_ORIGIN?=0xEFFE0000 -#WOLFBOOT_ORIGIN?=0xFFFE0000 # CW VPX3-152 # wolfBoot partition size (custom) BOOTLOADER_PARTITION_SIZE=0x20000 @@ -59,26 +63,36 @@ BOOTLOADER_PARTITION_SIZE=0x20000 WOLFBOOT_PARTITION_SIZE?=0x100000 # Location in Flash for Application Partition WOLFBOOT_PARTITION_BOOT_ADDRESS?=0xEFEE0000 -#WOLFBOOT_PARTITION_BOOT_ADDRESS?=0xFFEE0000 # CW VPX3-152 # Load Partition to RAM Address WOLFBOOT_LOAD_ADDRESS?=0x19000 # Location in Flash for Update Partition WOLFBOOT_PARTITION_UPDATE_ADDRESS?=0xEFDE0000 -#WOLFBOOT_PARTITION_UPDATE_ADDRESS?=0xFFDE0000 # CW VPX3-152 # Location of temporary sector used during updates WOLFBOOT_PARTITION_SWAP_ADDRESS?=0xEFDD0000 -#WOLFBOOT_PARTITION_SWAP_ADDRESS?=0xFFDD0000 # CW VPX3-152 # DTS (Device Tree) WOLFBOOT_DTS_BOOT_ADDRESS?=0xE8040000 -#WOLFBOOT_DTS_BOOT_ADDRESS?=0xF0040000 # CW VPX3-152 WOLFBOOT_DTS_UPDATE_ADDRESS?=0xE8050000 -#WOLFBOOT_DTS_UPDATE_ADDRESS?=0xF0050000 # CW VPX3-152 # DTS Load to RAM Address WOLFBOOT_LOAD_DTS_ADDRESS?=0x200000 +# 
----------------------------------------------------------------------------- +# CW VPX3-152 address overrides (256 MB NOR flash @ 0xF0000000) +# Uncomment ALL lines below when building for VPX3-152. +# Also uncomment CFLAGS_EXTRA+=-DBOARD_CW_VPX3152 at the top of this file. +# ----------------------------------------------------------------------------- +#ARCH_FLASH_OFFSET=0xFFFE0000 +#L2SRAM_ADDR=0xEE900000 +#WOLFBOOT_ORIGIN=0xFFFE0000 +#WOLFBOOT_PARTITION_BOOT_ADDRESS=0xFFEE0000 +#WOLFBOOT_PARTITION_UPDATE_ADDRESS=0xFFDE0000 +#WOLFBOOT_PARTITION_SWAP_ADDRESS=0xFFDD0000 +#WOLFBOOT_DTS_BOOT_ADDRESS=0xF0040000 +#WOLFBOOT_DTS_UPDATE_ADDRESS=0xF0050000 +#WOLFBOOT_LOAD_DTS_ADDRESS=0xF000000 + # Flash erase/write/read test at update partition address #TEST_FLASH?=1 diff --git a/docs/Targets.md b/docs/Targets.md index bbce15e5cb..54de779cbc 100644 --- a/docs/Targets.md +++ b/docs/Targets.md @@ -4659,16 +4659,28 @@ Flash factory_custom.bin to NOR base 0xE800_0000 ## NXP QorIQ T2080 PPC -The NXP QorIQ T2080 is a PPC e6500 based processor (four cores). Support has been tested with the NAII 68PPC2. +The NXP QorIQ T2080 is a PPC e6500 based processor (four cores). Three board +variants are supported: + +| Board | Config Define | Oscillator | DDR | NOR Flash | +|-------|---------------|-----------|-----|-----------| +| T2080 RDB (default) | _(none)_ | 66.66 MHz | DDR3L SODIMM | 128 MB @ `0xE8000000` | +| Curtiss-Wright VPX3-152 | `BOARD_CW_VPX3152` | 66.667 MHz | 4 GB DDR3L | 256 MB @ `0xF0000000` | +| NAII 68PPC2 | `BOARD_NAII_68PPC2` | 100 MHz | 8 GB DDR3 | 128 MB @ `0xE8000000` | + +> **Note:** The T2080 RDB DDR register values in `hal/nxp_t2080.h` are +> populated from a U-Boot register dump but have not been validated on +> hardware. The NAII 68PPC2 and CW VPX3-152 DDR configs are populated +> and tested. Example configuration: [/config/examples/nxp-t2080.config](/config/examples/nxp-t2080.config). 
-Stock layout is default; for NAII 68PPC2, uncomment the "# NAII 68PPC2:" lines and comment the stock lines. +See [Board Selection](#board-selection) below for per-board setup. ### Design NXP T2080 PPC -The QorIQ requires a Reset Configuration Word (RCW) to define the boot parameters, which resides at the start of the flash (0xE8000000). +The QorIQ requires a Reset Configuration Word (RCW) to define the boot parameters, which resides at the start of the flash (`0xE8000000` for 128 MB boards, `0xF0000000` for the 256 MB CW VPX3-152). -The flash boot entry point is `0xEFFFFFFC`, which is an offset jump to wolfBoot initialization boot code. Initially the PowerPC core enables only a 4KB region to execute from. The initialization code (`src/boot_ppc_start.S`) sets the required CCSR and TLB for memory addressing and jumps to wolfBoot `main()`. +The flash boot entry point is the last 4 bytes of the NOR flash region (`0xEFFFFFFC` for 128 MB flash, `0xFFFFFFFC` for 256 MB flash), which is an offset jump to wolfBoot initialization boot code. Initially the PowerPC core enables only a 4KB region to execute from. The initialization code (`src/boot_ppc_start.S`) sets the required CCSR and TLB for memory addressing and jumps to wolfBoot `main()`. #### Boot Sequence and Hardware Constraints @@ -4691,7 +4703,7 @@ CPC SRAM is unreliable for stores on cold power-on — L1 dirty-line evictions through CoreNet to CPC cause bus errors (silent CPU checkstop with `MSR[ME]=0`). The fix (matching U-Boot) uses L1 locked D-cache as the initial 16KB stack: `dcbz` allocates cache lines without bus reads, `dcbtls` locks them so they -are never evicted. The locked lines at `L1_CACHE_ADDR` (0xF8E00000) are +are never evicted. The locked lines at `L1_CACHE_ADDR` (`0xF8E00000`; `0xEE800000` on VPX3-152) are entirely core-local. After DDR init in `hal_init()`, the stack relocates to DDR and the CPC switches from SRAM to L3 cache mode. 
@@ -4702,9 +4714,27 @@ boot, allowing L1 I-cache to cache instruction fetches while preventing speculative prefetch to the IFC. C code switches to `MAS2_I | MAS2_G` during flash write/erase (command mode), then `MAS2_M` for full caching afterward. +**CCSRBAR Relocation (CW VPX3-152 only)** + +The default CCSRBAR at `0xFE000000` (16 MB) falls within the VPX3-152's 256 MB +flash VA range (`0xF0000000`–`0xFFFFFFFF`). The startup assembly relocates +CCSRBAR to `0xEF000000` (just below flash). The CPC SRAM and L1 cache addresses +are also relocated to `0xEE900000`/`0xEE800000` to avoid overlap. + +**Boot ROM TLB invalidation (CW VPX3-152 only)** + +For VPX3-152, TLB1 Entry 2 maps the full 256 MB flash at `0xF0000000-0xFFFFFFFF` +with IPROT. This range overlaps with the boot ROM TLB (default 4 KB at +`0xFFFFF000`, resized to 256 KB at `0xFFFC0000` by `shrink_default_tlb1`). +Overlapping TLB1 entries cause an e6500 multi-hit machine check. After Entry 2 +is created, the boot ROM TLB is cleared via `tlbwe` with `V=0` and `IPROT=0`; +Entry 2 then serves all instruction fetches for the flash region including the +boot ROM range. For NAII 68PPC2 and T2080 RDB (128 MB flash at `0xE8000000`), +there is no overlap and the boot ROM TLB remains valid alongside Entry 2. + **RAMFUNCTION Constraints** -The NAII 68PPC2 NOR flash (two S29GL01GS x8 in parallel, 16-bit bus) enters +The NOR flash (two S29GL01GS x8 in parallel, 16-bit bus) enters command mode bank-wide — instruction fetches during program/erase return status data instead of code. All flash write/erase functions are marked `RAMFUNCTION`, placed in `.ramcode`, copied to DDR, and remapped via TLB9. Key rules: @@ -4744,30 +4774,132 @@ machine check (exceptions instead of checkstop), debug, and recoverable interrupt enable. Branch prediction (BUCSR) is deferred to `hal_init()` after DDR stack relocation. 
-**UART Debug Checkpoints (`DEBUG_UART=1`)** +#### VxWorks 7 64-bit Boot Support (ENABLE_OS64BIT) + +When `ENABLE_OS64BIT` is set, `do_boot()` performs the additional handoff +work needed to launch a VxWorks 7 64-bit kernel (Curtiss-Wright `ossel=ostype2` +mode) or a 64-bit Linux kernel via the ePAPR convention. + +**ePAPR handoff:** wolfBoot passes the FDT pointer in `r3`, the IMA size in +`r7`, and `0x45504150` (`'EPAP'`) in `r6`. Other GPRs are zero. MSR is +`0x00002200` (`FP|DE`); the OS sets `MSR[CM]=1` itself within its first ~30 +instructions. + +**Final 64-bit memory map (CW VPX3-152 Firmware User's Manual, "FUM", Table 2.5).** `hal_os64bit_map_transition()` +in `src/boot_ppc.c` builds the 36-bit-aliased peripheral map VxWorks 7 +expects on CW VPX3-152: + +| Effective Address | Physical Address | Region | +|---|---|---| +| `0xF000_0000` | `0xF_F000_0000` | Flash (256 MB) | +| `0xEF00_0000` | `0xF_EF00_0000` | CCSR (16 MB) | +| `0xEE40_0000` | `0xF_EE40_0000` | FPGA / NVRAM (4 MB span) | +| `0xEE00_0000` | `0xF_EE00_0000` | DCSR (4 MB) | +| `0xEC00_0000` | `0xF_EC00_0000` | QMan portals (32 MB) | +| `0xEA00_0000` | `0xF_EA00_0000` | BMan portals (32 MB) | +| `0xE000_0000` | `0xF_E000_0000` | PCIe1 (XMC) memory (256 MB rounded) | +| `0xC000_0000` | `0xD_0000_0000` | PCIe4 (Switch) memory (2 GB) | +| `0x0000_0000` | `0x0_0000_0000` | DDR identity (2 GB, slot 0) | + +**DDR at TLB1 slot 0.** VxWorks 7's early entry stub iterates TLB1 from +slot 1 upward invalidating each entry, then reads slot 0 expecting it to +contain the DDR mapping. wolfBoot pins DDR at slot 12 by default; the +OS-handoff transition invalidates slot 12 and writes DDR identity (2 GB, +`MAS3_SX|SW|SR`, `MAS2_M`, IPROT) at slot 0. + +**Spin-table.** `hal_mp_init()` places the secondary-core spin-table at +`bootpg - BOOT_ROM_SIZE`. For VxWorks 7 the bootpg is anchored just below +the FUM `/memory` hole at `0x7E40_0000` so `cpu-release-addr` lands inside +declared memory. 
Production CW U-Boot does not release secondaries before +the OS jump for `ossel=ostype2`, so wolfBoot also gates the DCFG_BRR +release and `LCC_BSTRH/L/AR` setup behind `!ENABLE_OS64BIT`. The OS +releases its own secondaries via the standard ePAPR spin-table protocol +(`cpu-release-addr` in the FDT). + +**FDT fixups.** `hal_dts_fixup()` populates `cpus/cpu@N/cpu-release-addr` +and `enable-method = "spin-table"` for every core, and sets +`status = "okay"` on the boot CPU and `"disabled"` on the rest. The +DTB's existing `/memory.reg` is left untouched if already populated +(matching production U-Boot's `fdt_fixup_memory` which only writes when +the node is missing). The DTB's bootargs is replaced with the +`WOLFBOOT_BOOTARGS` value from `.config` if defined. + +**RAMFUNCTION OS-jump trampoline.** wolfBoot is XIP from flash by default; +`wolfBoot_os64bit_jump()` is a `RAMFUNCTION` (lives in `.ramcode` / +DDR). Steps it performs in order: + +1. Copy the exception handler (`isr_empty`, ~208 bytes) from flash + `0xFFFE_0000` to DDR at `0x0080_0000` (4 KB-aligned), then re-point + `IVPR` to the DDR copy. Without this, the next step (switching + flash to cache-inhibit + guarded) would break the e6500 fetcher's + ability to service handler instructions, causing any subsequent + exception to silent-hang. Production U-Boot's `IVPR` likewise + targets its DDR-relocated code, not flash. +2. Call `hal_flash_cache_disable_pre_os()` (also `RAMFUNCTION`) which + switches the flash TLB to `MAS2_I|MAS2_G`, asserts DUART1 MCR=3 + (DTR+RTS, matching production U-Boot's pre-bootm value), and zeros + `TCR` to disable any leftover watchdog reset arming. +3. `sync; isync` to drain the pipeline. +4. Indirect-jump to the OS entry through `bctrl`. The bctrl is fetched + from DDR (the trampoline itself), matching the production U-Boot + pattern of running its final pre-OS instructions out of DDR. + +**Other VxWorks-driven adjustments:** + +- `CORES_PER_CLUSTER = 2` for T2080 (was 4). 
The T2080 has 2 clusters + of 2 cores; an off-by-cluster linear-core-ID computation in the MP + secondary path could otherwise miss the 4-entry spin table. +- T2080 rev-1 e6500 errata block at primary core reset and the + secondary boot path. Erratum A003999 (HDBCR1 |= 0x0100_0000) is + intentionally NOT applied because production CW U-Boot does not + apply it to T2080. +- Per-cluster L2 cache init at secondary boot. +- IFC chip-selects on CW VPX3-152: AMASK + `MSEL=GPCM` aligned with CW + U-Boot's CSPR programming. CSOR is left alone while wolfBoot is still + XIP from flash (writing CSOR would alter the GPCM timing of the very + flash we are fetching from). + +**Pre-OS state dump (`WOLFBOOT_PPC_PRE_OS_DUMP`).** Defining this +preprocessor flag enables a comprehensive snapshot just before the OS +jump (SPRs, CCSR, LAW, TLB1, FDT, IFC, DDR, DUART, spin-table, ePAPR +args, kernel-entry bytes). The format mirrors a parallel debug dump in +CW U-Boot's `arch/powerpc/lib/bootm.c::boot_jump_vxworks` so the two +outputs can be diff'd line-by-line when investigating handoff state +divergences. Off by default — the DUART DLAB toggle inside the dump +can stall the active console. -Assembly startup emits characters to UART0 (0xFE11C500, 115200 baud): +### Building wolfBoot for NXP T2080 PPC +By default wolfBoot will use `powerpc-linux-gnu-` cross-compiler prefix. These tools can be installed with the Debian package `gcc-powerpc-linux-gnu` (`sudo apt install gcc-powerpc-linux-gnu`). 
+ +#### Board Selection + +Copy the example config and select your board: + +**T2080 RDB (default):** ``` -1 - CPC invalidate start A - L2 cluster enable start -2 - CPC invalidate done B - L2 cluster enabled -3 - CPC SRAM configured E - L1 cache setup -4 - SRAM LAW configured F - L1 I-cache enabled -5 - Flash TLB configured G - L1 D-cache enabled -6 - CCSRBAR TLB configured D - Stack ready (L1 locked cache) -7 - SRAM TLB configured Z - About to jump to C code -8 - CPC enabled +cp ./config/examples/nxp-t2080.config .config ``` -### Building wolfBoot for NXP T2080 PPC +**Curtiss-Wright VPX3-152:** +``` +cp ./config/examples/nxp-t2080.config .config +``` +Then in `.config`, uncomment `CFLAGS_EXTRA+=-DBOARD_CW_VPX3152` and every line +in the "CW VPX3-152 address overrides" block at the bottom of the file (flash +offset, SRAM address, origin, partition addresses, DTS addresses). -By default wolfBoot will use `powerpc-linux-gnu-` cross-compiler prefix. These tools can be installed with the Debian package `gcc-powerpc-linux-gnu` (`sudo apt install gcc-powerpc-linux-gnu`). +**NAII 68PPC2:** +``` +cp ./config/examples/nxp-t2080.config .config +``` +Then in `.config`, uncomment `CFLAGS_EXTRA+=-DBOARD_NAII_68PPC2`. + +#### Build -The `make` creates a `factory.bin` image that can be programmed at `0xE8080000` -(For NAII 68PPC2, first edit `nxp-t2080.config` to uncomment the NAII 68PPC2 lines.) +The `make` creates a `factory.bin` image that can be programmed to the application partition address. 
``` -cp ./config/examples/nxp-t2080.config .config make clean make keytools make @@ -4796,19 +4928,31 @@ CROSS_COMPILE_PATH=/opt/fsl-qoriq/2.0/sysroots/ppce6500-fsl-linux/usr ### Programming NXP T2080 PPC -NOR Flash Region: `0xE8000000 - 0xEFFFFFFF` (128 MB) +NOR Flash Regions: +- **T2080 RDB / NAII 68PPC2**: `0xE8000000 - 0xEFFFFFFF` (128 MB) +- **CW VPX3-152**: `0xF0000000 - 0xFFFFFFFF` (256 MB) -Flash Layout (with files): +Flash Layout (T2080 RDB / NAII 68PPC2, 128 MB flash): | Description | File | Address | | ----------- | ---- | ------- | -| Reset Configuration Word (RCW) | `68PPC2_RCW_v0p7.bin` | `0xE8000000` | +| Reset Configuration Word (RCW) | _(board-specific)_ | `0xE8000000` | | Frame Manager Microcode | `fsl_fman_ucode_t2080_r1.0.bin` | `0xE8020000` | | Signed Application | `test-app/image_v1_signed.bin` | `0xE8080000` | -| wolfBoot | `wolfboot.bin` | `0xEFF40000` | -| Boot Entry Point (with offset jump to init code) | | `0xEFFFFFFC` | +| wolfBoot | `wolfboot.bin` | `0xEFFE0000` | +| Boot Entry Point (offset jump to init code) | | `0xEFFFFFFC` | -Or program the `factory.bin` to `0xE8080000` +Flash Layout (CW VPX3-152, 256 MB flash): + +| Description | File | Address | +| ----------- | ---- | ------- | +| Reset Configuration Word (RCW) | _(board-specific)_ | `0xF0000000` | +| Frame Manager Microcode | `fsl_fman_ucode_t2080_r1.0.bin` | `0xF0020000` | +| Signed Application | `test-app/image_v1_signed.bin` | `0xF0080000` | +| wolfBoot | `wolfboot.bin` | `0xFFFE0000` | +| Boot Entry Point (offset jump to init code) | | `0xFFFFFFFC` | + +Or program the `factory.bin` to the application partition address. 
Example Boot Debug Output (with `DEBUG_UART=1`): @@ -4847,11 +4991,11 @@ See these TRACE32 demo script files: ``` DO flash_cfi.cmm -FLASH.ReProgram 0xEFF40000--0xEFFFFFFF /Erase -Data.LOAD.binary wolfboot.bin 0xEFF40000 +FLASH.ReProgram 0xEFFE0000--0xEFFFFFFF /Erase +Data.LOAD.binary wolfboot.bin 0xEFFE0000 FLASH.ReProgram.off -Data.LOAD.binary wolfboot.bin 0xEFF40000 /Verify +Data.LOAD.binary wolfboot.bin 0xEFFE0000 /Verify ``` Note: To disable the flash protection bits use: @@ -4869,7 +5013,11 @@ Data.Set 0xE8000000 %W 0x9090 Data.Set 0xE8000000 %W 0x0000 ``` -#### Flash Programming with CodeWarrior TAP +#### Flash Programming with CodeWarrior TAP (Experimental) + +> **Note:** CodeWarrior TAP debugging has not been validated for this target. +> Lauterbach TRACE32 is the recommended debug probe. The following steps are +> provided for reference only. In CodeWarrior use the `Flash Programmer` tool (see under Commander View -> Miscellaneous) * Connection: "CodeWarrior TAP Connection" @@ -4881,13 +5029,113 @@ In CodeWarrior use the `Flash Programmer` tool (see under Commander View -> Misc ``` tftp 1000000 wolfboot.bin -protect off eff40000 +C0000 -erase eff40000 +C0000 -cp.b 1000000 eff40000 C0000 -protect on eff40000 +C0000 -cmp.b 1000000 eff40000 C0000 +protect off effe0000 +20000 +erase effe0000 +20000 +cp.b 1000000 effe0000 20000 +protect on effe0000 +20000 +cmp.b 1000000 effe0000 20000 ``` +#### CW VPX3-152 PABS Recovery and Testing + +The CW VPX3-152 has a Permanent Alternate Boot Site (PABS) — a second U-Boot on a +separate flash device. When jumper JB1 (ALT-BOOT) is installed and the board is reset, +it boots from PABS U-Boot (prompt: `VPX3-152 PABS=>`), which can reprogram the main +NOR flash via TFTP. This is used for wolfBoot development and testing. + +Reference: CW VPX3-152 Firmware User's Manual (838400 rev 6), Section 6. 
+ +**Prerequisites:** +- JB1: Controlled by Pi4 GPIO 16 relay (or physical jumper) +- JB5: Must be removed (NOR write protect disabled) +- NVMRO: Must be grounded +- Serial: COM1 at 115200 N81 (P2 connector) +- Ethernet: GE02 (FM1@DTSEC1) on P1 connector + +**Entering PABS mode:** +1. Install JB1 jumper (or assert GPIO 16 high) +2. Reset the board +3. Board boots to `VPX3-152 PABS=>` prompt + +**Network setup in PABS U-Boot:** +``` +setenv serverip 10.0.4.24 +setenv ipaddr 10.0.4.152 +setenv gatewayip 10.0.4.1 +setenv netmask 255.255.255.0 +``` + +**Flash wolfBoot from PABS:** +``` +tftp 0x1000000 wolfboot.bin +protect off 0xFFFE0000 0xFFFFFFFF +erase 0xFFFE0000 0xFFFFFFFF +cp.b 0x1000000 0xFFFE0000 $filesize +cmp.b 0x1000000 0xFFFE0000 $filesize +``` + +**Flash signed application from PABS:** +``` +tftp 0x1000000 image_v1_signed.bin +protect off 0xFFEE0000 0xFFFDFFFF +erase 0xFFEE0000 0xFFFDFFFF +cp.b 0x1000000 0xFFEE0000 $filesize +cmp.b 0x1000000 0xFFEE0000 $filesize +``` + +**Boot wolfBoot:** Remove JB1 jumper (or deassert GPIO 16), reset the board. + +**Restore original CW U-Boot (from PABS):** +``` +fwupd 608603-100_rev- +``` + +**DDR Register Verification:** + +The CW VPX3-152 DDR register values in `hal/nxp_t2080.h` were obtained from a +U-Boot register dump. To verify or update these values, boot into PABS or main +U-Boot and run the following `md.l` commands. 
Use CCSRBAR `0xEF000000` (CW U-Boot +relocates CCSRBAR) or `0xFE000000` (default, check with `bdinfo`): + +``` +# CS Bounds and Config (DDR_BASE + 0x000, 0x080, 0x0C0) +md.l 0xef008000 4; md.l 0xef008080 4; md.l 0xef0080c0 4 +# Timing (DDR_BASE + 0x100, 0x160) +md.l 0xef008100 4; md.l 0xef008160 3 +# Config/Mode/Clock (DDR_BASE + 0x110, 0x130) +md.l 0xef008110 8; md.l 0xef008130 1 +# ZQ/Write Leveling (DDR_BASE + 0x170, 0x190) +md.l 0xef008170 3; md.l 0xef008190 2 +# RCW/Mode3-8 (DDR_BASE + 0x180, 0x200) +md.l 0xef008180 2; md.l 0xef008200 6 +# Control Driver (DDR_BASE + 0xB28) +md.l 0xef008b28 2 +# Error registers (DDR_BASE + 0xE40, 0xE58) +md.l 0xef008e40 3; md.l 0xef008e58 1 +``` + +**Flashing wolfBoot via PABS U-Boot:** + +The PABS U-Boot maps main NOR flash starting at `0x80000000`. To convert wolfBoot +flash addresses to PABS addresses, replace the `0xF` prefix with `0x8` (e.g. +`0xFFFE0000` becomes `0x8FFE0000`). After configuring the network, use: + +``` +# Flash wolfBoot (128 KB at top of flash) +tftp 0x1000000 wolfboot.bin +erase 0x8FFE0000 +0x20000 +cp.b 0x1000000 0x8FFE0000 $filesize +cmp.b 0x1000000 0x8FFE0000 $filesize + +# Flash signed application (1 MB boot partition) +tftp 0x1000000 image_v1_signed.bin +erase 0x8FEE0000 +0x100000 +cp.b 0x1000000 0x8FEE0000 $filesize +cmp.b 0x1000000 0x8FEE0000 $filesize +``` + +Remove the JB1 jumper and power cycle to boot from main flash with wolfBoot. + ### Debugging NXP T2080 PPC #### Lauterbach @@ -4915,9 +5163,11 @@ sYmbol.SourcePATH.SetBaseDir ~/wolfBoot Data.LOAD.Elf wolfboot.elf /NoCODE /StripPART "/home/username/wolfBoot/" ``` -#### CodeWarrior TAP +#### CodeWarrior TAP (Experimental) -This is an example for debugging the T2080 with CodeWarrior TAP, however we were not successful using it. The Lauterbach is what we ended up using to debug. +> **Note:** CodeWarrior TAP debugging has not been validated for this target. +> Lauterbach TRACE32 is the recommended debug probe. 
The following steps are +> provided for reference only. Start GDB Proxy: diff --git a/hal/nxp_ppc.h b/hal/nxp_ppc.h index 43b6c820a6..eb835088d5 100644 --- a/hal/nxp_ppc.h +++ b/hal/nxp_ppc.h @@ -84,7 +84,7 @@ #define ENABLE_L2_CACHE /* setup and enable L2 in first stage only */ #else /* relocate to 64-bit 0xF_ */ - #define CCSRBAR_PHYS_HIGH 0xFULL + #define CCSRBAR_PHYS_HIGH 0xF #define CCSRBAR_PHYS (CCSRBAR_PHYS_HIGH + CCSRBAR_DEF) #endif @@ -95,9 +95,9 @@ #define FLASH_BASE_ADDR 0xEC000000UL #ifndef BUILD_LOADER_STAGE1 - #define FLASH_BASE_PHYS_HIGH 0xFULL /* 36-bit: 0xF_EC000000 */ + #define FLASH_BASE_PHYS_HIGH 0xF /* 36-bit: 0xF_EC000000 */ #else - #define FLASH_BASE_PHYS_HIGH 0x0ULL /* 32-bit stage1 */ + #define FLASH_BASE_PHYS_HIGH 0x0 /* 32-bit stage1 */ #endif #define FLASH_LAW_SIZE LAW_SIZE_64MB #define FLASH_TLB_PAGESZ BOOKE_PAGESZ_64M @@ -136,7 +136,7 @@ #define ENABLE_L2_CACHE #else /* relocate to 64-bit 0xF_ */ - #define CCSRBAR_PHYS_HIGH 0xFULL + #define CCSRBAR_PHYS_HIGH 0xF #define CCSRBAR_PHYS (CCSRBAR_PHYS_HIGH + CCSRBAR_DEF) #endif @@ -148,9 +148,9 @@ /* 128MB NOR: 0xE8000000 - 0xEFFFFFFF */ #define FLASH_BASE_ADDR 0xE8000000UL #ifndef BUILD_LOADER_STAGE1 - #define FLASH_BASE_PHYS_HIGH 0xFULL /* 36-bit: 0xF_E8000000 */ + #define FLASH_BASE_PHYS_HIGH 0xF /* 36-bit: 0xF_E8000000 */ #else - #define FLASH_BASE_PHYS_HIGH 0x0ULL /* 32-bit stage1 */ + #define FLASH_BASE_PHYS_HIGH 0x0 /* 32-bit stage1 */ #endif #define FLASH_LAW_SIZE LAW_SIZE_128MB /* e5500 BookE has no 128M page size (64M->256M), use 256M TLB */ @@ -159,14 +159,14 @@ #define USE_LONG_JUMP #elif defined(TARGET_nxp_t2080) - /* NXP T2080 */ + /* NXP T2080: 4 cores in 2 clusters of 2 (each core has 2 threads) */ #define CORE_E6500 #define CPU_NUMCORES 4 - #define CORES_PER_CLUSTER 4 + #define CORES_PER_CLUSTER 2 #define LAW_MAX_ENTRIES 32 #define ENABLE_PPC64 - #define CCSRBAR_DEF (0xFE000000UL) /* T2080RM 4.3.1 default base */ + #define CCSRBAR_DEF 0xFE000000 /* T2080RM 4.3.1 default 
base */ #define CCSRBAR_SIZE BOOKE_PAGESZ_16M #define ENABLE_L1_CACHE @@ -199,13 +199,22 @@ #define INITIAL_SRAM_BOOKE_SZ BOOKE_PAGESZ_1M #define ENABLE_INTERRUPTS + #define ENABLE_FMAN #ifdef BOARD_CW_VPX3152 - /* Relocate CCSRBAR: default 0xFE000000 (16MB) falls within 256MB flash - * VA range 0xF0000000-0xFFFFFFFF. Move to 0xEF000000 (just below flash). - * The existing relocation code in boot_ppc_start.S handles the hardware - * CCSRBAR register write when CCSRBAR_DEF != CCSRBAR_PHYS. */ - #define CCSRBAR 0xEF000000UL + /* Relocate CCSRBAR: default 0xFE000000 (16MB) falls within the 256MB + * flash VA range 0xF0000000-0xFFFFFFFF. Move to 0xEF000000 (just below + * flash). The existing relocation code in boot_ppc_start.S handles the + * hardware CCSRBAR register write when CCSRBAR_DEF != CCSRBAR_PHYS. + * + * CCSR is mapped at PA=0xF_EF000000 (36-bit-aliased, PHYS_HIGH=0xF) to + * match the cw_152_64.dtb soc.ranges entry. A 32-bit alternative + * (PA=0x0_EF000000) was tested during bring-up but is not selectable + * in this header. */ + #define CCSRBAR 0xEF000000 + #define CCSRBAR_PHYS_HIGH 0xF + #define CCSRBAR_NEW_REG 0x00FEF000 /* (PHYS_HIGH << 20) | (CCSRBAR >> 12) */ + #define CCSRBAR_PHYS ((0xFULL << 32) | CCSRBAR_DEF) #endif #define ENABLE_DDR @@ -217,11 +226,13 @@ #endif #endif - /* DDR stack configuration - relocate from CPC SRAM after DDR init - * Stack is at top of first 32MB of DDR, with 64KB reserved for stack - * Stack grows downward from DDR_STACK_TOP */ + /* DDR stack configuration - relocate from CPC SRAM after DDR init. + * Stack must be ABOVE the image load area to avoid being overwritten + * when the OS image is copied to WOLFBOOT_LOAD_ADDRESS (0x100000). + * With WOLFBOOT_PARTITION_SIZE=0x800000 the image area ends at 0x900000. + * Place stack at 16MB to be safely above the image + DTS regions. 
*/ #define DDR_STACK_SIZE (64 * 1024) /* 64KB stack in DDR */ - #define DDR_STACK_TOP 0x02000000UL /* Top of first 32MB */ + #define DDR_STACK_TOP 0x01000000UL /* 16MB - above image area */ #define DDR_STACK_BASE (DDR_STACK_TOP - DDR_STACK_SIZE) /* DDR address where .ramcode is copied before CPC SRAM is released. @@ -235,12 +246,12 @@ * LAW0: addr=0xF_F000_0000, size=256MB, target=IFC. */ #ifdef BOARD_CW_VPX3152 #define FLASH_BASE_ADDR 0xF0000000UL /* 256MB NOR flash (0xF0000000-0xFFFFFFFF) */ - #define FLASH_BASE_PHYS_HIGH 0xFULL + #define FLASH_BASE_PHYS_HIGH 0xF #define FLASH_LAW_SIZE LAW_SIZE_256MB #define FLASH_TLB_PAGESZ BOOKE_PAGESZ_256M #else #define FLASH_BASE_ADDR 0xE8000000UL - #define FLASH_BASE_PHYS_HIGH 0x0ULL + #define FLASH_BASE_PHYS_HIGH 0x0 #define FLASH_LAW_SIZE LAW_SIZE_128MB #define FLASH_TLB_PAGESZ BOOKE_PAGESZ_128M #endif @@ -265,7 +276,7 @@ /* CCSRBAR */ #ifndef CCSRBAR_DEF -#define CCSRBAR_DEF 0xFE000000UL +#define CCSRBAR_DEF 0xFE000000 #endif #ifndef CCSRBAR #define CCSRBAR CCSRBAR_DEF @@ -276,6 +287,13 @@ #ifndef CCSRBAR_PHYS_HIGH #define CCSRBAR_PHYS_HIGH 0 #endif +/* Encoding written into CCSRBAR for the legacy (e500/e500mc) relocation + * path: PHYS_HIGH in [23:20], PA[35:12] in the low bits. Boards that + * relocate CCSRBAR but don't use USE_CORENET_INTERFACE need this + * pre-computed because GAS @h/@l can't evaluate the shift/OR. 
*/ +#ifndef CCSRBAR_NEW_REG +#define CCSRBAR_NEW_REG ((CCSRBAR_PHYS_HIGH << 20) | (CCSRBAR >> 12)) +#endif /* DDR */ @@ -435,6 +453,7 @@ #define LAW_TRGT_PCIE1 0x00 #define LAW_TRGT_PCIE2 0x01 #define LAW_TRGT_PCIE3 0x02 + #define LAW_TRGT_PCIE4 0x03 #define LAW_TRGT_DDR_1 0x10 /* Memory Complex 1 */ #define LAW_TRGT_DDR_2 0x11 #define LAW_TRGT_BMAN 0x18 /* Buffer Manager (control) */ @@ -612,6 +631,13 @@ #define SPRN_PVR 0x11F /* Processor Version */ #define SPRN_SVR 0x3FF /* System Version */ #define SPRN_HDBCR0 0x3D0 +#define SPRN_HDBCR1 0x3D1 +#define SPRN_HDBCR2 0x3D2 +#define SPRN_HDBCR7 0x277 +#define SPRN_DBCR0 0x134 +#define SPRN_DBCR1 0x135 +#define SPRN_EPCR 0x133 +#define SPRN_MMUCFG 0x3F7 /* Hardware Implementation-Dependent Registers */ #define SPRN_HID0 0x3F0 @@ -903,6 +929,17 @@ extern void dcache_disable(void); /* For older PPC compat use dcbf to flush spin table entry */ /* Note: spin-table must be cache-line aligned in memory */ #define EPAPR_MAGIC (0x45504150) /* Book III-E CPUs */ + +/* Initial Mapped Area size passed to OS in r7. U-Boot uses 64MB. + * Override via CFLAGS_EXTRA+=-DWOLFBOOT_BOOTMAPSZ=... 
*/ +#ifndef WOLFBOOT_BOOTMAPSZ +#define WOLFBOOT_BOOTMAPSZ (64 * 1024 * 1024) /* 64MB */ +#endif + +/* Maximum DTS size for flush_cache before OS jump */ +#ifndef WOLFBOOT_DTS_MAX_SIZE +#define WOLFBOOT_DTS_MAX_SIZE (64 * 1024) /* 64KB */ +#endif #define ENTRY_ADDR_UPPER 0 #define ENTRY_ADDR_LOWER 4 #define ENTRY_R3_UPPER 8 diff --git a/hal/nxp_t2080.c b/hal/nxp_t2080.c index 3512f7a050..b8df7ff3b2 100644 --- a/hal/nxp_t2080.c +++ b/hal/nxp_t2080.c @@ -39,6 +39,7 @@ /* Forward declarations */ static void RAMFUNCTION hal_flash_unlock_sector(uint32_t sector); +void RAMFUNCTION hal_ifc_cs0_init(void); #ifdef ENABLE_MP static void hal_mp_init(void); #endif @@ -72,6 +73,12 @@ static void hal_mp_init(void); #define FLASH_UNLOCK_ADDR2 0x555 #endif +/* FLASH_CMD_SECTOR: sector used for flash command sequences that don't target + * a specific sector (reset, unlock, PPB entry/exit). AMD flash command decode + * only looks at the low address bits, so sector 0 works for all boards with + * a properly mapped full-flash TLB entry. */ +#define FLASH_CMD_SECTOR 0 + /* Flash IO Helpers */ #if FLASH_CFI_WIDTH == 16 #define FLASH_IO8_WRITE(sec, n, val) *((volatile uint16_t*)(FLASH_BASE_ADDR + (FLASH_SECTOR_SIZE * (sec)) + ((n) * 2))) = (((val) << 8) | (val)) @@ -125,8 +132,14 @@ void hal_ddr_init(void) #ifdef ENABLE_DDR uint32_t reg; - /* Map LAW for DDR */ + /* Map LAW for DDR — use full DDR size. + * For 4GB boards, a single 4GB LAW at PA 0x0 covers all DDR. + * CW U-Boot ostype2 uses: set_ddr_laws(0, ddr_size, DDR_1). 
*/ +#if DDR_SIZE >= (4096ULL * 1024ULL * 1024ULL) + set_law(4, 0, DDR_ADDRESS, LAW_TRGT_DDR_1, LAW_SIZE_4GB, 0); +#else set_law(4, 0, DDR_ADDRESS, LAW_TRGT_DDR_1, LAW_SIZE_2GB, 0); +#endif /* If DDR is already enabled then just return */ reg = get32(DDR_SDRAM_CFG); @@ -164,12 +177,12 @@ void hal_ddr_init(void) /* DDR SDRAM mode configuration */ set32(DDR_SDRAM_MODE, DDR_SDRAM_MODE_VAL); set32(DDR_SDRAM_MODE_2, DDR_SDRAM_MODE_2_VAL); - set32(DDR_SDRAM_MODE_3, DDR_SDRAM_MODE_3_8_VAL); - set32(DDR_SDRAM_MODE_4, DDR_SDRAM_MODE_3_8_VAL); - set32(DDR_SDRAM_MODE_5, DDR_SDRAM_MODE_3_8_VAL); - set32(DDR_SDRAM_MODE_6, DDR_SDRAM_MODE_3_8_VAL); - set32(DDR_SDRAM_MODE_7, DDR_SDRAM_MODE_3_8_VAL); - set32(DDR_SDRAM_MODE_8, DDR_SDRAM_MODE_3_8_VAL); + set32(DDR_SDRAM_MODE_3, DDR_SDRAM_MODE_3_VAL); + set32(DDR_SDRAM_MODE_4, DDR_SDRAM_MODE_4_VAL); + set32(DDR_SDRAM_MODE_5, DDR_SDRAM_MODE_5_VAL); + set32(DDR_SDRAM_MODE_6, DDR_SDRAM_MODE_6_VAL); + set32(DDR_SDRAM_MODE_7, DDR_SDRAM_MODE_7_VAL); + set32(DDR_SDRAM_MODE_8, DDR_SDRAM_MODE_8_VAL); set32(DDR_SDRAM_MD_CNTL, DDR_SDRAM_MD_CNTL_VAL); /* DDR Configuration */ @@ -270,6 +283,110 @@ static void hal_cpld_init(void) #endif } +#ifdef ENABLE_FMAN +/* FMAN microcode upload for T2080. + * Firmware is in NOR flash at FMAN_FW_ADDR (typically 0xFFE60000). + * Uses same QE firmware format as T1040. 
*/ +#ifndef FMAN_FW_ADDR +#define FMAN_FW_ADDR 0xFFE60000UL +#endif +#define FMAN_BASE (CCSRBAR + 0x400000UL) +#define FMAN_IRAM (FMAN_BASE + 0xC4000UL) +#define FMAN_IRAM_IADD ((volatile uint32_t*)(FMAN_IRAM + 0x0)) +#define FMAN_IRAM_IDATA ((volatile uint32_t*)(FMAN_IRAM + 0x4)) +#define FMAN_IRAM_IREADY ((volatile uint32_t*)(FMAN_IRAM + 0xC)) +#define FMAN_IRAM_IADD_AIE 0x80000000 +#define FMAN_IRAM_READY 0x80000000 + +/* Reuse QE firmware structures (same format as T1040) */ +#if (defined(__IAR_SYSTEMS_ICC__) && (__IAR_SYSTEMS_ICC__ > 8)) || \ + defined(__GNUC__) + #define QE_PACKED __attribute__ ((packed)) +#else + #define QE_PACKED +#endif +#define QE_MAX_RISC 4 +struct qe_header { + uint32_t length; + uint8_t magic[3]; + uint8_t version; +} QE_PACKED; +struct qe_soc { + uint16_t model; + uint8_t major; + uint8_t minor; +} QE_PACKED; +struct qe_microcode { + uint8_t id[32]; + uint32_t traps[16]; + uint32_t eccr; + uint32_t iram_offset; + uint32_t count; + uint32_t code_offset; + uint8_t major; + uint8_t minor; + uint8_t revision; + uint8_t padding; + uint8_t reserved[4]; +} QE_PACKED; +struct qe_firmware { + struct qe_header header; + uint8_t id[62]; + uint8_t split; + uint8_t count; + struct qe_soc soc; + uint8_t padding[4]; + uint64_t extended_modes; + uint32_t vtraps[8]; + uint8_t reserved[4]; + struct qe_microcode microcode[1]; +} QE_PACKED; + +static int hal_fman_init(void) +{ + const struct qe_firmware *fw = (const struct qe_firmware *)FMAN_FW_ADDR; + const struct qe_header *hdr = &fw->header; + unsigned int i; + + /* Check firmware magic */ + if (hdr->magic[0] != 'Q' || hdr->magic[1] != 'E' || hdr->magic[2] != 'F') { + wolfBoot_printf("FMAN: no firmware at 0x%x\n", FMAN_FW_ADDR); + return -1; + } + + for (i = 0; i < fw->count; i++) { + const struct qe_microcode *ucode = &fw->microcode[i]; + const uint32_t *code; + unsigned int j; + + if (!ucode->code_offset) + continue; + + code = (const uint32_t *)((const uint8_t *)fw + ucode->code_offset); + 
wolfBoot_printf("FMAN: uploading '%s' v%u.%u.%u\n", + ucode->id, ucode->major, ucode->minor, ucode->revision); + + set32(FMAN_IRAM_IADD, FMAN_IRAM_IADD_AIE); + for (j = 0; j < ucode->count; j++) { + set32(FMAN_IRAM_IDATA, code[j]); + } + set32(FMAN_IRAM_IADD, 0); + { + int timeout = 1000000; + while ((get32(FMAN_IRAM_IDATA) != code[0]) && --timeout) + ; + if (!timeout) { + wolfBoot_printf("FMAN: upload timeout\n"); + return -1; + } + } + set32(FMAN_IRAM_IREADY, FMAN_IRAM_READY); + } + + return 0; +} +#endif /* ENABLE_FMAN */ + #ifdef ENABLE_DDR /* Release CPC SRAM back to L2 cache mode. * Call after stack is relocated to DDR (done in boot_entry_C). @@ -300,6 +417,9 @@ static void hal_reconfigure_cpc_as_cache(void) *dst++ = *src++; } + /* Ensure all stores have drained before flushing cache lines */ + __asm__ __volatile__("sync" ::: "memory"); + /* Flush D-cache and invalidate I-cache for the DDR copy */ flush_cache(DDR_RAMCODE_ADDR, ramcode_size); @@ -386,8 +506,12 @@ static void hal_flash_enable_caching(void) void hal_init(void) { -#if defined(DEBUG_UART) && defined(ENABLE_CPLD) + uint32_t bucsr; +#ifdef DEBUG_UART + uint32_t ddr_ratio; + #ifdef ENABLE_CPLD uint32_t fw; + #endif #endif /* Enable timebase on core 0 */ @@ -411,9 +535,29 @@ void hal_init(void) (unsigned long)(hal_get_bus_clk() / 1000000)); wolfBoot_printf("Timebase: %lu MHz\n", (unsigned long)(TIMEBASE_HZ / 1000000)); + ddr_ratio = get32(CLOCKING_PLLDGSR); + ddr_ratio = ((ddr_ratio >> 1) & 0x3F); + wolfBoot_printf("DDR Clock: %lu MHz (%lu MT/s, ratio %lu:1)\n", + (unsigned long)(SYS_CLK / 1000000 * ddr_ratio), + (unsigned long)(SYS_CLK / 1000000 * ddr_ratio * 2), + (unsigned long)ddr_ratio); #endif hal_flash_init(); +#ifdef ENABLE_IFC + hal_ifc_cs0_init(); /* Set IFC CS0 BA to match flash TLB (RAMFUNCTION) */ +#endif + +#ifdef DEBUG_UART + /* Dump LAW BARH values to verify 36-bit addressing */ + wolfBoot_printf("LAW0: BARH=0x%x BARL=0x%x LAWAR=0x%x\n", + get32(LAWBARH(0)), get32(LAWBARL(0)), 
get32(LAWAR(0))); + wolfBoot_printf("LAW1: BARH=0x%x BARL=0x%x LAWAR=0x%x\n", + get32(LAWBARH(1)), get32(LAWBARL(1)), get32(LAWAR(1))); + wolfBoot_printf("LAW4: BARH=0x%x BARL=0x%x LAWAR=0x%x\n", + get32(LAWBARH(4)), get32(LAWBARL(4)), get32(LAWAR(4))); +#endif + hal_cpld_init(); #ifdef ENABLE_CPLD @@ -426,6 +570,10 @@ void hal_init(void) #endif #endif /* ENABLE_CPLD */ +#ifdef ENABLE_FMAN + hal_fman_init(); +#endif + #ifdef ENABLE_DDR /* Stack is already in DDR (relocated in boot_entry_C via * ddr_call_with_stack trampoline before main() was called). @@ -441,12 +589,18 @@ void hal_init(void) /* Enable branch prediction now that DDR stack and cache hierarchy * are fully configured. Disabled during early ASM boot to avoid * speculative fetches during hardware init. */ - { - uint32_t bucsr = BUCSR_STAC_EN | BUCSR_LS_EN | BUCSR_BBFI | BUCSR_BPEN; - __asm__ __volatile__("mtspr %0, %1; isync" :: "i"(SPRN_BUCSR), "r"(bucsr)); - } + bucsr = BUCSR_STAC_EN | BUCSR_LS_EN | BUCSR_BBFI | BUCSR_BPEN; + __asm__ __volatile__("mtspr %0, %1; isync" :: "i"(SPRN_BUCSR), "r"(bucsr)); #endif + /* Note: previously had a duplicate `set32(DCFG_BRR, 0x0F)` here + * mislabelled as "enable hardware threading" -- DCFG+0xE4 is + * actually DCFG_BRR (Boot Release Register), not threading enable, + * and hal_mp_init() already writes the same value. Removed: it + * caused secondaries to be released pre-OS, which CW U-Boot does + * not do for VxWorks 7 64-bit (ossel=ostype2). VxWorks releases + * its own secondaries via the ePAPR spin-table protocol. */ + #ifdef ENABLE_MP /* Start secondary cores AFTER CPC release and flash caching. * Secondary cores' L2 flash-invalidate on the shared cluster L2 @@ -499,6 +653,71 @@ static void RAMFUNCTION hal_flash_clear_wp(void) } } +/* Initialize IFC CS0 with the correct base address for the NOR flash. + * The RCW default may have BA=0 (CSPR=0x141) which doesn't match the + * flash LAW/TLB at FLASH_BASE_ADDR. CW U-Boot sets CSPR=0xF0000105. 
+ * Must be called from RAMFUNCTION with flash TLB guarded. */ +void RAMFUNCTION hal_ifc_cs0_init(void) +{ + /* Match CW U-Boot IFC CS0 configuration exactly: + * CSPR_EXT=0x0F, CSPR=0xF0000105, AMASK=0xF0000000 + * BA=0xF000 (flash at 0xF0000000), PORT_SIZE=16-bit, GPCM, V=1 + * MSEL=GPCM (0x4) is required to match U-Boot -- previously this + * code set MSEL=NOR (0) which differed from CW U-Boot's CSPR. */ + uint32_t cspr = get32(IFC_CSPR(0)); + + /* Only update if BA doesn't match flash base */ + if ((cspr & 0xFFFF0000) != IFC_CSPR_PHYS_ADDR(FLASH_BASE_ADDR)) { + /* Clear V, update all IFC CS0 registers, re-enable V */ + set32(IFC_CSPR(0), cspr & ~IFC_CSPR_V); + __asm__ __volatile__("sync; isync"); + set32(IFC_CSPR_EXT(0), (uint32_t)FLASH_BASE_PHYS_HIGH); + set32(IFC_AMASK(0), IFC_AMASK_256MB); + set32(IFC_CSPR(0), IFC_CSPR_PHYS_ADDR(FLASH_BASE_ADDR) | + IFC_CSPR_PORT_SIZE_16 | + IFC_CSPR_MSEL_GPCM | IFC_CSPR_V); + __asm__ __volatile__("sync; isync"); + } + +#ifdef ENABLE_OS64BIT + /* IFC CS1/2/3 setup for VxWorks/Linux 64-bit (matches CW U-Boot + * post-init state). wolfBoot previously set CSPR but NOT AMASK, + * leaving AMASK = 0 -- chip-select region size is unbounded. 
+ * AMASK values from CW U-Boot dump: + * CS1 (FPGA 8-bit @ 0xEE600000): AMASK=0xFFF80000 (512 KB) + * CS2 (NVRAM @ 0xEE700000): AMASK=0xFFF80000 (512 KB) + * CS3 (FPGA 32-bit @ 0xEE400000): AMASK=0xFFE00000 (2 MB) */ + + /* IFC CS1: FPGA 8-bit at 0xEE600000 (GPCM, 8-bit port, 512 KB) */ + set32(IFC_CSPR_EXT(1), 0xF); + set32(IFC_AMASK(1), IFC_AMASK_512KB); + set32(IFC_CSPR(1), IFC_CSPR_PHYS_ADDR(0xEE600000) | + IFC_CSPR_PORT_SIZE_8 | IFC_CSPR_MSEL_GPCM | IFC_CSPR_V); + + /* IFC CS2: NVRAM at 0xEE700000 (GPCM, 8-bit port, 512 KB) */ + set32(IFC_CSPR_EXT(2), 0xF); + set32(IFC_AMASK(2), IFC_AMASK_512KB); + set32(IFC_CSPR(2), IFC_CSPR_PHYS_ADDR(0xEE700000) | + IFC_CSPR_PORT_SIZE_8 | IFC_CSPR_MSEL_GPCM | IFC_CSPR_V); + + /* IFC CS3: FPGA 32-bit at 0xEE400000 (GPCM, 16-bit port, 2 MB) */ + set32(IFC_CSPR_EXT(3), 0xF); + set32(IFC_AMASK(3), IFC_AMASK_2MB); + set32(IFC_CSPR(3), IFC_CSPR_PHYS_ADDR(0xEE400000) | + IFC_CSPR_PORT_SIZE_16 | IFC_CSPR_MSEL_GPCM | IFC_CSPR_V); + + /* CSOR mismatch (wolfBoot at reset default 0xC vs CW U-Boot CS0= + * 0xF000801, CS1/3=0x2F0C0000, CS2=0x0F000000) intentionally NOT + * touched here: changing CS0 CSOR while wolfBoot is still XIPing + * from flash hangs the next instruction fetch (timing mismatch + * mid-fetch). Would need to be done from a RAM-resident path with + * flash text already cached, or after switching to DDR-resident + * code. Not critical for OS-handoff matching since the OS + * immediately reprograms IFC for its own timings. 
*/ + __asm__ __volatile__("sync; isync"); +#endif +} + static void RAMFUNCTION hal_flash_unlock_sector(uint32_t sector) { /* AMD unlock sequence */ @@ -517,9 +736,9 @@ static int RAMFUNCTION hal_flash_ppb_unlock(uint32_t sector) uint32_t timeout; /* Enter PPB ASO (Address Space Overlay) */ - FLASH_IO8_WRITE(0, FLASH_UNLOCK_ADDR1, AMD_CMD_UNLOCK_START); - FLASH_IO8_WRITE(0, FLASH_UNLOCK_ADDR2, AMD_CMD_UNLOCK_ACK); - FLASH_IO8_WRITE(0, FLASH_UNLOCK_ADDR1, AMD_CMD_SET_PPB_ENTRY); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, FLASH_UNLOCK_ADDR1, AMD_CMD_UNLOCK_START); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, FLASH_UNLOCK_ADDR2, AMD_CMD_UNLOCK_ACK); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, FLASH_UNLOCK_ADDR1, AMD_CMD_SET_PPB_ENTRY); /* Read PPB status for target sector: DQ0=0 means protected. * On 16-bit bus, must read both chip lanes to check both devices. */ @@ -531,16 +750,16 @@ static int RAMFUNCTION hal_flash_ppb_unlock(uint32_t sector) if ((ppb_status & 0x01) == 0x01) { #endif /* Both chips report unprotected — exit PPB mode and return */ - FLASH_IO8_WRITE(0, 0, AMD_CMD_SET_PPB_EXIT_BC1); - FLASH_IO8_WRITE(0, 0, AMD_CMD_SET_PPB_EXIT_BC2); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, 0, AMD_CMD_SET_PPB_EXIT_BC1); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, 0, AMD_CMD_SET_PPB_EXIT_BC2); return 0; } /* Exit PPB ASO before calling printf (flash must be in read-array * mode for I-cache misses to fetch valid instructions) */ - FLASH_IO8_WRITE(0, 0, AMD_CMD_SET_PPB_EXIT_BC1); - FLASH_IO8_WRITE(0, 0, AMD_CMD_SET_PPB_EXIT_BC2); - FLASH_IO8_WRITE(0, 0, AMD_CMD_RESET); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, 0, AMD_CMD_SET_PPB_EXIT_BC1); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, 0, AMD_CMD_SET_PPB_EXIT_BC2); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, 0, AMD_CMD_RESET); udelay(50); #ifdef DEBUG_FLASH @@ -549,24 +768,24 @@ static int RAMFUNCTION hal_flash_ppb_unlock(uint32_t sector) #endif /* Re-enter PPB ASO for erase */ - FLASH_IO8_WRITE(0, FLASH_UNLOCK_ADDR1, AMD_CMD_UNLOCK_START); - FLASH_IO8_WRITE(0, FLASH_UNLOCK_ADDR2, 
AMD_CMD_UNLOCK_ACK); - FLASH_IO8_WRITE(0, FLASH_UNLOCK_ADDR1, AMD_CMD_SET_PPB_ENTRY); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, FLASH_UNLOCK_ADDR1, AMD_CMD_UNLOCK_START); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, FLASH_UNLOCK_ADDR2, AMD_CMD_UNLOCK_ACK); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, FLASH_UNLOCK_ADDR1, AMD_CMD_SET_PPB_ENTRY); /* PPB Erase All (clears all sectors' PPBs) */ - FLASH_IO8_WRITE(0, 0, AMD_CMD_PPB_UNLOCK_BC1); /* 0x80 */ - FLASH_IO8_WRITE(0, 0, AMD_CMD_PPB_UNLOCK_BC2); /* 0x30 */ + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, 0, AMD_CMD_PPB_UNLOCK_BC1); /* 0x80 */ + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, 0, AMD_CMD_PPB_UNLOCK_BC2); /* 0x30 */ /* Wait for PPB erase completion — poll for toggle stop. * On 16-bit bus, read both chip lanes to ensure both complete. */ timeout = 0; do { #if FLASH_CFI_WIDTH == 16 - read1 = FLASH_IO16_READ(0, 0); - read2 = FLASH_IO16_READ(0, 0); + read1 = FLASH_IO16_READ(FLASH_CMD_SECTOR, 0); + read2 = FLASH_IO16_READ(FLASH_CMD_SECTOR, 0); #else - read1 = FLASH_IO8_READ(0, 0); - read2 = FLASH_IO8_READ(0, 0); + read1 = FLASH_IO8_READ(FLASH_CMD_SECTOR, 0); + read2 = FLASH_IO8_READ(FLASH_CMD_SECTOR, 0); #endif if (read1 == read2) break; @@ -574,11 +793,11 @@ static int RAMFUNCTION hal_flash_ppb_unlock(uint32_t sector) } while (timeout++ < 100000); /* 1 second */ /* Exit PPB ASO */ - FLASH_IO8_WRITE(0, 0, AMD_CMD_SET_PPB_EXIT_BC1); - FLASH_IO8_WRITE(0, 0, AMD_CMD_SET_PPB_EXIT_BC2); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, 0, AMD_CMD_SET_PPB_EXIT_BC1); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, 0, AMD_CMD_SET_PPB_EXIT_BC2); /* Reset to read-array mode */ - FLASH_IO8_WRITE(0, 0, AMD_CMD_RESET); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, 0, AMD_CMD_RESET); udelay(50); if (timeout >= 100000) { @@ -663,6 +882,13 @@ int RAMFUNCTION hal_flash_write(uint32_t address, const uint8_t *data, int len) int ret = 0; uint32_t i, sector, offset, nwords; const uint32_t width_bytes = FLASH_CFI_WIDTH / 8; + uint32_t addr_off = address; + + /* Bounds check */ + if (addr_off >= FLASH_BASE_ADDR) 
+ addr_off -= FLASH_BASE_ADDR; + if (addr_off + (uint32_t)len > FLASH_BANK_SIZE) + return -1; /* Enforce alignment to flash bus width */ if ((address % width_bytes) != 0 || (len % width_bytes) != 0) { @@ -688,7 +914,7 @@ int RAMFUNCTION hal_flash_write(uint32_t address, const uint8_t *data, int len) /* Reset flash to read-array mode in case previous operation left it * in command mode (e.g. after a timeout or incomplete operation) */ - FLASH_IO8_WRITE(0, 0, AMD_CMD_RESET); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, 0, AMD_CMD_RESET); udelay(50); /* Program one word at a time using AMD single-word program (0xA0). @@ -741,6 +967,13 @@ int RAMFUNCTION hal_flash_erase(uint32_t address, int len) { int ret = 0; uint32_t sector; + uint32_t addr_off = address; + + /* Bounds check */ + if (addr_off >= FLASH_BASE_ADDR) + addr_off -= FLASH_BASE_ADDR; + if (addr_off + (uint32_t)len > FLASH_BANK_SIZE) + return -1; /* adjust for flash base */ if (address >= FLASH_BASE_ADDR) @@ -752,7 +985,7 @@ int RAMFUNCTION hal_flash_erase(uint32_t address, int len) /* Reset flash to read-array mode in case previous operation left it * in command mode (e.g. after a timeout or incomplete operation) */ - FLASH_IO8_WRITE(0, 0, AMD_CMD_RESET); + FLASH_IO8_WRITE(FLASH_CMD_SECTOR, 0, AMD_CMD_RESET); udelay(50); while (len > 0) { @@ -839,8 +1072,9 @@ extern uint32_t _spin_table[]; extern uint32_t _spin_table_addr; /* DDR address of the spin table, set during hal_mp_init() and reused in - * hal_dts_fixup() for cpu-release-addr fixups. */ -static uint32_t g_spin_table_ddr = 0; + * hal_dts_fixup() for cpu-release-addr fixups. Also read by boot_ppc.c + * pre-jump dump to capture spin-table contents at handoff. */ +uint32_t g_spin_table_ddr = 0; extern uint32_t _bootpg_addr; /* Startup additional cores with spin table and synchronize the timebase. 
@@ -848,7 +1082,7 @@ extern uint32_t _bootpg_addr; static void hal_mp_up(uint32_t bootpg, uint32_t spin_table_ddr) { uint32_t all_cores, active_cores, whoami; - int timeout = 50, i; + int timeout = 10000, i; /* 10000 * 100us = 1s, matches U-Boot convention */ whoami = get32(PIC_WHOAMI); /* Get current running core number */ all_cores = ((1 << CPU_NUMCORES) - 1); /* mask of all cores */ @@ -857,6 +1091,23 @@ static void hal_mp_up(uint32_t bootpg, uint32_t spin_table_ddr) wolfBoot_printf("MP: Starting cores (boot page %p, spin table %p)\n", bootpg, spin_table_ddr); + /* Enable time base on current core only */ + set32(RCPM_PCTBENR, (1 << whoami)); + +#ifdef ENABLE_OS64BIT + /* VxWorks 7 64-bit (ossel=ostype2) handoff matches CW U-Boot's + * profile: BSTRL/BSTAR stay disabled, DCFG_BRR stays 0, and + * secondary cores remain held in reset. The OS releases its own + * secondaries via the ePAPR spin-table protocol using + * cpu-release-addr from the FDT, which still points at the + * spin_table_ddr we set up below. Skip the active release here. 
*/ + (void)bootpg; + (void)all_cores; + (void)spin_table_ddr; + (void)active_cores; + (void)timeout; + (void)i; +#else /* Set the boot page translation register */ set32(LCC_BSTRH, 0); set32(LCC_BSTRL, bootpg); @@ -865,9 +1116,6 @@ static void hal_mp_up(uint32_t bootpg, uint32_t spin_table_ddr) LAW_SIZE_4KB)); (void)get32(LCC_BSTAR); /* read back to sync */ - /* Enable time base on current core only */ - set32(RCPM_PCTBENR, (1 << whoami)); - /* Release the CPU core(s) */ set32(DCFG_BRR, all_cores); __asm__ __volatile__("sync; isync; msync"); @@ -911,13 +1159,16 @@ static void hal_mp_up(uint32_t bootpg, uint32_t spin_table_ddr) /* Only re-enable timebase for boot core */ set32(RCPM_PCTBENR, (1 << whoami)); } +#endif /* !ENABLE_OS64BIT */ } static void hal_mp_init(void) { uint32_t *fixup = (uint32_t*)&_secondary_start_page; uint32_t bootpg, second_half_ddr, spin_table_ddr; - int i_tlb = 0; /* always 0 */ +#ifdef BOARD_CW_VPX3152 + volatile uint32_t *bp, *st; +#endif size_t i; const volatile uint32_t *s; volatile uint32_t *d; @@ -925,8 +1176,21 @@ static void hal_mp_init(void) /* Assign virtual boot page at end of LAW-mapped DDR region. * DDR LAW maps 2GB (LAW_SIZE_2GB) starting at DDR_ADDRESS. * DDR_SIZE may exceed 32-bit range (e.g. 8GB), so use the LAW-mapped - * size to ensure bootpg fits in 32 bits and is accessible. */ + * size to ensure bootpg fits in 32 bits and is accessible. + * + * VPX3-152 / VxWorks 7 ostype2: the cw_152_64.dtb has /memory.reg + * with a hole at 0x7E400000-0x7FFFFFFF (between region 2 ending at + * 0x7E3FFFFF and region 3 starting at 0x80000000). Putting bootpg / + * spin_table at 0x7FFFF000 / 0x7FFFE000 lands them inside that hole, + * which production CW U-Boot ostype2 also does -- but VxWorks may + * not install TLB mappings for hole addresses when walking /memory. + * For the silent-boot diagnosis, place bootpg JUST BELOW the hole + * (top of region 2) so spin_table is inside declared /memory. 
*/ +#if defined(ENABLE_OS64BIT) && defined(BOARD_CW_VPX3152) + bootpg = DDR_ADDRESS + 0x7E400000UL - BOOT_ROM_SIZE; +#else bootpg = DDR_ADDRESS + 0x80000000UL - BOOT_ROM_SIZE; +#endif /* Second half boot page (spin loop + spin table) goes just below. * For XIP flash builds, .bootmp is in flash — secondary cores can't @@ -941,26 +1205,46 @@ static void hal_mp_init(void) flush_cache(bootpg, BOOT_ROM_SIZE); flush_cache(second_half_ddr, BOOT_ROM_SIZE); - /* Map reset page to bootpg so we can copy code there. - * Boot page translation will redirect secondary core fetches from - * 0xFFFFF000 to bootpg in DDR. */ - disable_tlb1(i_tlb); - set_tlb(1, i_tlb, BOOT_ROM_ADDR, bootpg, 0, /* tlb, epn, rpn, urpn */ - (MAS3_SX | MAS3_SW | MAS3_SR), (MAS2_I | MAS2_G), /* perms, wimge */ - 0, BOOKE_PAGESZ_4K, 1); /* ts, esel, tsize, iprot */ +#ifdef BOARD_CW_VPX3152 + /* VPX3-152: TLB1 Entry 2 (256MB flash) covers 0xF0000000-0xFFFFFFFF + * which includes BOOT_ROM_ADDR (0xFFFFF000). Creating a TLB1 Entry 0 + * at that VA would cause a multi-hit machine check on e6500. + * Instead, copy boot page code directly to DDR via the DDR TLB and + * flush D-cache to ensure secondary cores see the data. */ + + /* Copy first half (startup code) directly to DDR at bootpg */ + s = (const uint32_t*)fixup; + d = (volatile uint32_t*)bootpg; + for (i = 0; i < BOOT_ROM_SIZE/4; i++) { + d[i] = s[i]; + } + + /* Write _bootpg_addr and _spin_table_addr into the DDR copy */ + bp = (volatile uint32_t*)(bootpg + + ((uint32_t)&_bootpg_addr - (uint32_t)&_secondary_start_page)); + st = (volatile uint32_t*)(bootpg + + ((uint32_t)&_spin_table_addr - (uint32_t)&_secondary_start_page)); + *bp = second_half_ddr; + *st = spin_table_ddr; + + /* Flush boot page from D-cache to DDR so secondary cores see it */ + flush_cache(bootpg, BOOT_ROM_SIZE); +#else + /* Non-VPX3: map BOOT_ROM_ADDR → DDR bootpg via TLB1 Entry 0 with + * cache-inhibited attributes so writes go directly to DDR. 
*/ + disable_tlb1(0); + set_tlb(1, 0, BOOT_ROM_ADDR, bootpg, 0, + (MAS3_SX | MAS3_SW | MAS3_SR), (MAS2_I | MAS2_G), + 0, BOOKE_PAGESZ_4K, 1); - /* Copy first half (startup code) to DDR via BOOT_ROM_ADDR mapping. - * Uses cache-inhibited TLB to ensure data reaches DDR immediately. */ + /* Copy first half (startup code) to DDR via BOOT_ROM_ADDR mapping */ s = (const uint32_t*)fixup; - d = (uint32_t*)BOOT_ROM_ADDR; + d = (volatile uint32_t*)BOOT_ROM_ADDR; for (i = 0; i < BOOT_ROM_SIZE/4; i++) { d[i] = s[i]; } - /* Write _bootpg_addr and _spin_table_addr into the DDR first-half copy. - * These variables are .long 0 in the linked .bootmp (flash), and direct - * stores to their flash addresses silently fail on XIP builds. - * Calculate offsets within the boot page and write via BOOT_ROM_ADDR. */ + /* Write _bootpg_addr and _spin_table_addr into the DDR copy */ { volatile uint32_t *bp = (volatile uint32_t*)(BOOT_ROM_ADDR + ((uint32_t)&_bootpg_addr - (uint32_t)&_secondary_start_page)); @@ -969,12 +1253,12 @@ static void hal_mp_init(void) *bp = second_half_ddr; *st = spin_table_ddr; } +#endif /* Copy second half (spin loop + spin table) directly to DDR. - * Master has DDR TLB (entry 12, MAS2_M). Flush cache after copy - * to ensure secondary cores see the data. */ + * Flush cache after copy to ensure secondary cores see the data. */ s = (const uint32_t*)&_second_half_boot_page; - d = (uint32_t*)second_half_ddr; + d = (volatile uint32_t*)second_half_ddr; for (i = 0; i < BOOT_ROM_SIZE/4; i++) { d[i] = s[i]; } @@ -990,7 +1274,37 @@ static void hal_mp_init(void) void hal_prepare_boot(void) { + /* Intentionally minimal. Flash TLB switch to cache-inhibit and any + * other pre-OS-jump state changes happen in boot_ppc.c::do_boot() + * AFTER the FDT fixups + debug prints, since those run from flash + * and would each take many ms each on uncached IFC reads. 
*/ +} +/* Public wrapper for boot_ppc.c::do_boot() - switch flash TLB to + * MAS2_I|MAS2_G to match CW U-Boot's pre-VxWorks state (TLB#2 WIMG=I|G, + * MAS2=0xF000000A). Mismatched flash cache attributes between + * bootloader and OS can cause stale instruction fetches if the OS reads + * from flash. Must be called AFTER all FDT walks / debug prints -- + * those run from flash and become very slow once cache is off. + * + * Also aligns small but observable pre-jump state items to CW U-Boot's + * profile when chasing VxWorks 7 64-bit silent boot: + * - DUART1 MCR = 3 (DTR+RTS asserted; U-Boot sets this, our driver + * leaves it at the post-reset 0) + * - TCR = 0 (cleared before the jump; a non-zero WRC would let the + * watchdog fire silently after VxWorks starts) */ +void RAMFUNCTION hal_flash_cache_disable_pre_os(void) +{ + hal_flash_cache_disable(); +#ifdef ENABLE_OS64BIT + /* DUART1 modem control: DTR+RTS asserted, matching CW U-Boot's + * pre-bootm value. */ + set8(UART_MCR(0), 0x03); + /* TCR=0 matches CW U-Boot's pre-bootm value. WRC != 0 would let + * the watchdog fire silently after VxWorks starts. */ + mtspr(SPRN_TCR, 0); + __asm__ __volatile__("isync" ::: "memory"); +#endif } #ifdef MMU @@ -1025,21 +1339,102 @@ int hal_dts_fixup(void* dts_addr) fdt_totalsize(fdt)); } - /* fixup the memory region - single bank */ +#ifdef ENABLE_OS64BIT + /* /memreserve/ entries: keep VxWorks/Linux away from the spin-table + * page (wolfBoot places it at g_spin_table_ddr, page-aligned down) + * and the top-of-DDR scratch pages. CW U-Boot's ostype2 reserves its + * own spin-table page (0x7fee4000); wolfBoot's spin table lives + * elsewhere, so we reserve based on the actual runtime address.
*/ + { + int rsv_ret; + uint64_t spin_pg = (uint64_t)(g_spin_table_ddr & ~0xFFFU); + + rsv_ret = fdt_add_mem_rsv(fdt, spin_pg, 0x1000ULL); + if (rsv_ret != 0) { + wolfBoot_printf("FDT: failed to reserve spin-table page " + "@ 0x%llx: %d\n", spin_pg, rsv_ret); + return rsv_ret; + } + rsv_ret = fdt_add_mem_rsv(fdt, 0x7ffff000ULL, 0x1000ULL); + if (rsv_ret != 0) { + wolfBoot_printf("FDT: failed to reserve boot page " + "@ 0x7ffff000: %d\n", rsv_ret); + return rsv_ret; + } + rsv_ret = fdt_add_mem_rsv(fdt, 0xfffff000ULL, 0x1000ULL); + if (rsv_ret != 0) { + wolfBoot_printf("FDT: failed to reserve top-of-4GB page " + "@ 0xfffff000: %d\n", rsv_ret); + return rsv_ret; + } + } +#endif + + /* fixup the memory region. + * + * IMPORTANT: production CW U-Boot's fdt_fixup_memory only writes + * /memory if the node is missing (`if (off < 0)`). The cw_152_64.dtb + * already has /memory.reg populated with the 3-region layout that + * VxWorks 7 64-bit expects. Overwriting it -- which wolfBoot was + * doing unconditionally -- causes VxWorks's libfdt-driven memory + * setup to disagree with what its assembly prologue installed and + * silent-fail before the UART driver comes up. Skip the write when + * the node already has a non-empty `reg` property. */ off = fdt_find_devtype(fdt, -1, "memory"); if (off >= 0) { - /* build addr/size as aligned 64-bit values */ - uint64_t ranges[2]; - ranges[0] = cpu_to_fdt64(DDR_ADDRESS); - ranges[1] = cpu_to_fdt64(DDR_SIZE); - wolfBoot_printf("FDT: Set memory, start=0x%x, size=0x%x\n", - DDR_ADDRESS, (uint32_t)DDR_SIZE); - fdt_setprop(fdt, off, "reg", ranges, sizeof(ranges)); + int reg_len = 0; + const void *existing = fdt_getprop(fdt, off, "reg", &reg_len); + if (existing != NULL && reg_len > 0) { + wolfBoot_printf("FDT: /memory already has reg (%d bytes), keep\n", + reg_len); + goto memory_fixup_done; + } + } + if (off >= 0) { +#ifdef ENABLE_OS64BIT + /* For 64-bit OS: set 3-region memory layout matching CW U-Boot's + * fdt_fixup_memory_vxworks().
Regions avoid the peripheral hole + * at 0x7E400000-0x7FFFFFFF (reserved for CPU release/spin table). + * Region 3 maps upper DDR via the 36-bit LAW at PA 0xC_00000000. */ + uint64_t start[3], size[3]; + int num_regions = 2; + start[0] = cpu_to_fdt64(0x000000000ULL); + size[0] = cpu_to_fdt64(0x040000000ULL); /* 1GB */ + start[1] = cpu_to_fdt64(0x040000000ULL); + size[1] = cpu_to_fdt64(0x03E400000ULL); /* 996MB (1GB minus the 28MB hole at 0x7E400000-0x7FFFFFFF) */ + #if DDR_SIZE >= (4096ULL * 1024ULL * 1024ULL) /* 4GB+ */ + num_regions = 3; + start[2] = cpu_to_fdt64(0x080000000ULL); + size[2] = cpu_to_fdt64(DDR_SIZE - 0x080000000ULL); /* upper DDR */ + #endif + wolfBoot_printf("FDT: Set memory (%d regions, OS 64-bit)\n", num_regions); + { + uint64_t reg[6]; /* max 3 start/size pairs */ + int i; + for (i = 0; i < num_regions; i++) { + reg[i*2] = start[i]; + reg[i*2+1] = size[i]; + } + fdt_setprop(fdt, off, "reg", reg, num_regions * 2 * sizeof(uint64_t)); + } +#else + /* 32-bit OS: single contiguous DDR region */ + { + uint64_t ranges[2]; + ranges[0] = cpu_to_fdt64(DDR_ADDRESS); + ranges[1] = cpu_to_fdt64(DDR_SIZE); + wolfBoot_printf("FDT: Set memory, start=0x%x, size=0x%x\n", + DDR_ADDRESS, (uint32_t)DDR_SIZE); + fdt_setprop(fdt, off, "reg", ranges, sizeof(ranges)); + } +#endif } +memory_fixup_done: /* fixup CPU status and release address and enable method */ off = fdt_find_devtype(fdt, -1, "cpu"); while (off >= 0) { + uint32_t thread_id; int core; #ifdef ENABLE_MP uint64_t core_spin_table; @@ -1048,41 +1443,89 @@ int hal_dts_fixup(void* dts_addr) reg = (uint32_t*)fdt_getprop(fdt, off, "reg", NULL); if (reg == NULL) break; - core = (int)fdt32_to_cpu(*reg); + thread_id = fdt32_to_cpu(*reg); + #ifdef CORE_E6500 + /* e6500 has 2 threads per core. DTB reg values are thread IDs + * (0,1 for core0; 2,3 for core1; 4,5 for core2; 6,7 for core3). + * Convert to physical core ID for spin table indexing.
*/ + core = (int)(thread_id >> 1); + #else + core = (int)thread_id; + #endif if (core >= CPU_NUMCORES) { - break; /* invalid core index */ + /* Skip invalid cores but continue scanning */ + off = fdt_find_devtype(fdt, off, "cpu"); + continue; } #ifdef ENABLE_MP - /* Calculate DDR address of this core's spin table entry. - * Must use g_spin_table_ddr (the DDR copy), NOT _spin_table which - * is the flash/VMA address — Linux writes the release word to this - * address, and XIP flash is read-only. */ - core_spin_table = (uint64_t)(g_spin_table_ddr + (core * ENTRY_SIZE)); - - fdt_fixup_str(fdt, off, "cpu", "status", (core == 0) ? "okay" : "disabled"); - fdt_fixup_val64(fdt, off, "cpu", "cpu-release-addr", core_spin_table); + /* All cores get cpu-release-addr and enable-method = "spin-table" + * (matches CW U-Boot ostype2 fixup). Boot CPU also gets a release + * addr — it isn't actually waiting there, but VxWorks/Linux still + * read the property and reject the node if absent or zero. */ + core_spin_table = (uint64_t)(g_spin_table_ddr + + (core * ENTRY_SIZE)); + fdt_fixup_val64(fdt, off, "cpu", "cpu-release-addr", + core_spin_table); fdt_fixup_str(fdt, off, "cpu", "enable-method", "spin-table"); + if (core == 0) { + fdt_fixup_str(fdt, off, "cpu", "status", "okay"); + } + else { + fdt_fixup_str(fdt, off, "cpu", "status", "disabled"); + } #endif + #ifndef BOARD_CW_VPX3152 + /* CW VPX3-152: skip cpu/soc/clockgen/serial frequency fixups — + * the CW base DTB (cw_152_64.dtb) already has the correct values, + * and CW U-Boot's ft_cpu_setup() does not touch them. Adding them + * here produces a divergent FDT vs the known-working U-Boot path + * (extra cpu freq properties + off-by-1/3 rounding on soc/clockgen). + * Other T2080 targets (RDB) still need these because their base + * DTBs lack the properties. 
*/ fdt_fixup_val(fdt, off, "cpu", "timebase-frequency", TIMEBASE_HZ); fdt_fixup_val(fdt, off, "cpu", "clock-frequency", hal_get_core_clk()); fdt_fixup_val(fdt, off, "cpu", "bus-frequency", hal_get_plat_clk()); + #endif off = fdt_find_devtype(fdt, off, "cpu"); } +#ifndef BOARD_CW_VPX3152 /* fixup the soc clock */ off = fdt_find_devtype(fdt, -1, "soc"); if (off >= 0) { fdt_fixup_val(fdt, off, "soc", "bus-frequency", hal_get_plat_clk()); } + /* fixup clockgen frequency — VxWorks/Linux use this to derive all + * clocks via PLL ratios. Match U-Boot's ft_cpu_setup behavior. */ + off = fdt_node_offset_by_compatible(fdt, -1, "fsl,qoriq-clockgen-2.0"); + if (off >= 0) { + fdt_fixup_val(fdt, off, "clockgen", "clock-frequency", SYS_CLK); + } + /* fixup the serial clocks */ off = fdt_find_devtype(fdt, -1, "serial"); while (off >= 0) { fdt_fixup_val(fdt, off, "serial", "clock-frequency", hal_get_bus_clk()); off = fdt_find_devtype(fdt, off, "serial"); } +#endif + + /* fixup /chosen bootargs -- override the DTB's baked-in bootargs + * with WOLFBOOT_BOOTARGS from .config. Production CW U-Boot leaves + * the DTB value untouched during bootm; we override here so users + * can change boot parameters without reflashing the DTB. 
*/ +#ifdef WOLFBOOT_BOOTARGS + off = fdt_find_node_offset(fdt, -1, "chosen"); + if (off < 0) { + off = fdt_add_subnode(fdt, 0, "chosen"); + } + if (off >= 0) { + fdt_fixup_str(fdt, off, "chosen", "bootargs", WOLFBOOT_BOOTARGS); + } +#endif #endif /* !BUILD_LOADER_STAGE1 */ (void)dts_addr; diff --git a/hal/nxp_t2080.h b/hal/nxp_t2080.h index 8d8f92d6d8..45b666c431 100644 --- a/hal/nxp_t2080.h +++ b/hal/nxp_t2080.h @@ -214,6 +214,7 @@ enum ifc_amask_sizes { #define CLOCKING_CLKCCSR(n) ((volatile uint32_t*)(CLOCKING_BASE + 0x000UL + ((n) * 0x20))) #define CLOCKING_PLLCNGSR(n) ((volatile uint32_t*)(CLOCKING_BASE + 0x800UL + ((n) * 0x20))) /* PLL cluster n general status */ #define CLOCKING_PLLPGSR ((volatile uint32_t*)(CLOCKING_BASE + 0xC00UL)) /* Platform PLL general status */ +#define CLOCKING_PLLDGSR ((volatile uint32_t*)(CLOCKING_BASE + 0xC20UL)) /* DDR PLL general status */ /* ---- MPIC - T2080RM 24.3 ---- */ #define PIC_BASE (CCSRBAR + 0x40000) @@ -277,31 +278,30 @@ enum ifc_amask_sizes { #define DDR_REF_RATE_PS 7800000 #else /* T2080 RDB: DDR3L SODIMM */ -/* TODO: Fill SPD parameters from DDR3L SODIMM datasheet */ -#define DDR_N_RANKS 2 /* TODO: confirm from CS_CONFIG dump */ -#define DDR_RANK_DENS 0x100000000 /* TODO: confirm */ +#define DDR_N_RANKS 2 +#define DDR_RANK_DENS 0x80000000 #define DDR_SDRAM_WIDTH 64 #define DDR_EC_SDRAM_W 8 -#define DDR_N_ROW_ADDR 16 /* TODO: confirm */ -#define DDR_N_COL_ADDR 10 /* TODO: confirm */ +#define DDR_N_ROW_ADDR 15 +#define DDR_N_COL_ADDR 10 #define DDR_N_BANKS 8 #define DDR_EDC_CONFIG 2 #define DDR_BURSTL_MASK 0x0c -#define DDR_TCKMIN_X_PS 1500 /* TODO: from DDR3L datasheet */ -#define DDR_TCMMAX_PS 3000 /* TODO: from DDR3L datasheet */ -#define DDR_CASLAT_X 0x000007E0 /* TODO */ -#define DDR_TAA_PS 13500 /* TODO */ -#define DDR_TRCD_PS 13500 /* TODO */ -#define DDR_TRP_PS 13500 /* TODO */ -#define DDR_TRAS_PS 36000 /* TODO */ -#define DDR_TRC_PS 49500 /* TODO */ -#define DDR_TFAW_PS 30000 /* TODO */ -#define DDR_TWR_PS 
15000 /* TODO */ -#define DDR_TRFC_PS 260000 /* TODO */ -#define DDR_TRRD_PS 6000 /* TODO */ -#define DDR_TWTR_PS 7500 /* TODO */ -#define DDR_TRTP_PS 7500 /* TODO */ -#define DDR_REF_RATE_PS 7800000 /* TODO */ +#define DDR_TCKMIN_X_PS 1125 +#define DDR_TCMMAX_PS 3000 +#define DDR_CASLAT_X 0x000002FC +#define DDR_TAA_PS 13125 +#define DDR_TRCD_PS 13125 +#define DDR_TRP_PS 13125 +#define DDR_TRAS_PS 34000 +#define DDR_TRC_PS 47125 +#define DDR_TFAW_PS 27000 +#define DDR_TWR_PS 15000 +#define DDR_TRFC_PS 160000 +#define DDR_TRRD_PS 5000 +#define DDR_TWTR_PS 7500 +#define DDR_TRTP_PS 7500 +#define DDR_REF_RATE_PS 7800000 #endif #ifdef BOARD_NAII_68PPC2 @@ -326,6 +326,12 @@ enum ifc_amask_sizes { #define DDR_SDRAM_MODE_VAL 0x00441C70 #define DDR_SDRAM_MODE_2_VAL 0x00980000 #define DDR_SDRAM_MODE_3_8_VAL 0x00000000 +#define DDR_SDRAM_MODE_3_VAL DDR_SDRAM_MODE_3_8_VAL +#define DDR_SDRAM_MODE_4_VAL DDR_SDRAM_MODE_3_8_VAL +#define DDR_SDRAM_MODE_5_VAL DDR_SDRAM_MODE_3_8_VAL +#define DDR_SDRAM_MODE_6_VAL DDR_SDRAM_MODE_3_8_VAL +#define DDR_SDRAM_MODE_7_VAL DDR_SDRAM_MODE_3_8_VAL +#define DDR_SDRAM_MODE_8_VAL DDR_SDRAM_MODE_3_8_VAL #define DDR_SDRAM_MD_CNTL_VAL 0x00000000 #define DDR_SDRAM_CFG_VAL 0xE7040000 @@ -353,10 +359,10 @@ enum ifc_amask_sizes { /* CW VPX3-152: DDR register values from U-Boot hardware dump */ #define DDR_CS0_BNDS_VAL 0x000000FF /* CS0: 0-4GB (4GB rank) */ #define DDR_CS1_BNDS_VAL 0x00000000 /* CS1: disabled */ -#define DDR_CS2_BNDS_VAL 0x00000000 -#define DDR_CS3_BNDS_VAL 0x00000000 #define DDR_CS0_CONFIG_VAL 0x80014402 /* CS0 enabled */ #define DDR_CS1_CONFIG_VAL 0x00014402 /* CS1 disabled (bit31=0) */ +#define DDR_CS2_BNDS_VAL 0x00000000 +#define DDR_CS3_BNDS_VAL 0x00000000 #define DDR_CS2_CONFIG_VAL 0x00000000 #define DDR_CS3_CONFIG_VAL 0x00000000 #define DDR_CS_CONFIG_2_VAL 0x00000000 @@ -371,6 +377,12 @@ enum ifc_amask_sizes { #define DDR_SDRAM_MODE_VAL 0x00461014 #define DDR_SDRAM_MODE_2_VAL 0x00A00000 #define DDR_SDRAM_MODE_3_8_VAL 0x00000000 
+#define DDR_SDRAM_MODE_3_VAL DDR_SDRAM_MODE_3_8_VAL +#define DDR_SDRAM_MODE_4_VAL DDR_SDRAM_MODE_3_8_VAL +#define DDR_SDRAM_MODE_5_VAL DDR_SDRAM_MODE_3_8_VAL +#define DDR_SDRAM_MODE_6_VAL DDR_SDRAM_MODE_3_8_VAL +#define DDR_SDRAM_MODE_7_VAL DDR_SDRAM_MODE_3_8_VAL +#define DDR_SDRAM_MODE_8_VAL DDR_SDRAM_MODE_3_8_VAL #define DDR_SDRAM_MD_CNTL_VAL 0x00000000 #define DDR_SDRAM_CFG_VAL 0xE7240000 /* MEM_EN|SREN|ECC_EN, DDR3 */ @@ -395,55 +407,59 @@ enum ifc_amask_sizes { #define DDR_ERR_INT_EN_VAL 0x0000001D #define DDR_ERR_SBE_VAL 0x00010000 #else -/* T2080 RDB: DDR register values */ -/* TODO: Fill ALL values from Phase 1 U-Boot register dump: +/* T2080 RDB DDR register values from U-Boot register dump. + * T2080 RDB (default CCSRBAR = 0xFE000000, DDR_BASE = 0xFE008000): * md.l 0xfe008000 4; md.l 0xfe008010 4 (CS BNDS) * md.l 0xfe008080 4; md.l 0xfe0080c0 4 (CS CONFIG) * md.l 0xfe008100 4; md.l 0xfe008160 3 (TIMING) * md.l 0xfe008110 8; md.l 0xfe008130 1 (CONFIG/MODE/CLK) * md.l 0xfe008170 3; md.l 0xfe008190 2 (WRLVL) * md.l 0xfe008200 6; md.l 0xfe008b28 2 (MODE3-8/CDR) */ -#define DDR_CS0_BNDS_VAL 0x00000000 /* TODO: from dump */ -#define DDR_CS1_BNDS_VAL 0x00000000 /* TODO: from dump */ -#define DDR_CS2_BNDS_VAL 0x00000000 /* TODO: from dump */ -#define DDR_CS3_BNDS_VAL 0x00000000 /* TODO: from dump */ -#define DDR_CS0_CONFIG_VAL 0x00000000 /* TODO: from dump */ -#define DDR_CS1_CONFIG_VAL 0x00000000 /* TODO: from dump */ -#define DDR_CS2_CONFIG_VAL 0x00000000 /* TODO: from dump */ -#define DDR_CS3_CONFIG_VAL 0x00000000 /* TODO: from dump */ -#define DDR_CS_CONFIG_2_VAL 0x00000000 /* TODO: from dump */ - -#define DDR_TIMING_CFG_3_VAL 0x00000000 /* TODO: from dump */ -#define DDR_TIMING_CFG_0_VAL 0x00000000 /* TODO: from dump */ -#define DDR_TIMING_CFG_1_VAL 0x00000000 /* TODO: from dump */ -#define DDR_TIMING_CFG_2_VAL 0x00000000 /* TODO: from dump */ -#define DDR_TIMING_CFG_4_VAL 0x00000000 /* TODO: from dump */ -#define DDR_TIMING_CFG_5_VAL 0x00000000 /* TODO: 
from dump */ - -#define DDR_SDRAM_MODE_VAL 0x00000000 /* TODO: from dump */ -#define DDR_SDRAM_MODE_2_VAL 0x00000000 /* TODO: from dump */ -#define DDR_SDRAM_MODE_3_8_VAL 0x00000000 /* TODO: from dump */ -#define DDR_SDRAM_MD_CNTL_VAL 0x00000000 /* TODO: from dump */ - -#define DDR_SDRAM_CFG_VAL 0x00000000 /* TODO: from dump */ -#define DDR_SDRAM_CFG_2_VAL 0x00000000 /* TODO: from dump */ - -#define DDR_SDRAM_INTERVAL_VAL 0x00000000 /* TODO: from dump */ +#define DDR_CS0_BNDS_VAL 0x000000FF +#define DDR_CS1_BNDS_VAL 0x000000FF +#define DDR_CS2_BNDS_VAL 0x00000000 +#define DDR_CS3_BNDS_VAL 0x00000000 +#define DDR_CS0_CONFIG_VAL 0x80044302 +#define DDR_CS1_CONFIG_VAL 0x80004302 +#define DDR_CS2_CONFIG_VAL 0x00000000 +#define DDR_CS3_CONFIG_VAL 0x00000000 +#define DDR_CS_CONFIG_2_VAL 0x00000000 + +#define DDR_TIMING_CFG_3_VAL 0x02081000 +#define DDR_TIMING_CFG_0_VAL 0x9011000E +#define DDR_TIMING_CFG_1_VAL 0xD0D8EE57 +#define DDR_TIMING_CFG_2_VAL 0x0048E15A +#define DDR_TIMING_CFG_4_VAL 0x00000001 +#define DDR_TIMING_CFG_5_VAL 0x05401400 + +#define DDR_SDRAM_MODE_VAL 0x00441E14 +#define DDR_SDRAM_MODE_2_VAL 0x00A00000 +#define DDR_SDRAM_MODE_3_VAL 0x00001E14 +#define DDR_SDRAM_MODE_4_VAL 0x00A00000 +#define DDR_SDRAM_MODE_5_VAL 0x00001E14 +#define DDR_SDRAM_MODE_6_VAL 0x00A00000 +#define DDR_SDRAM_MODE_7_VAL 0x00001E14 +#define DDR_SDRAM_MODE_8_VAL 0x00A00000 +#define DDR_SDRAM_MD_CNTL_VAL 0x00000000 + +#define DDR_SDRAM_CFG_VAL 0xE7044000 +#define DDR_SDRAM_CFG_2_VAL 0x00401100 + +#define DDR_SDRAM_INTERVAL_VAL 0x0E38038E #define DDR_DATA_INIT_VAL 0xDEADBEEF -#define DDR_SDRAM_CLK_CNTL_VAL 0x00000000 /* TODO: from dump */ -#define DDR_ZQ_CNTL_VAL 0x00000000 /* TODO: from dump */ +#define DDR_SDRAM_CLK_CNTL_VAL 0x02800000 +#define DDR_ZQ_CNTL_VAL 0x89080600 -/* Write leveling - CRITICAL: board-specific values from U-Boot. - * These depend on PCB trace lengths and MUST come from the register dump. 
*/ -#define DDR_WRLVL_CNTL_VAL 0x00000000 /* TODO: from dump */ -#define DDR_WRLVL_CNTL_2_VAL 0x00000000 /* TODO: from dump */ -#define DDR_WRLVL_CNTL_3_VAL 0x00000000 /* TODO: from dump */ +/* Write leveling - board-specific values from U-Boot register dump */ +#define DDR_WRLVL_CNTL_VAL 0x8675F607 +#define DDR_WRLVL_CNTL_2_VAL 0x0808080C +#define DDR_WRLVL_CNTL_3_VAL 0x0B0C0C09 -#define DDR_SDRAM_RCW_1_VAL 0x00000000 /* TODO: from dump */ -#define DDR_SDRAM_RCW_2_VAL 0x00000000 /* TODO: from dump */ +#define DDR_SDRAM_RCW_1_VAL 0x00000000 +#define DDR_SDRAM_RCW_2_VAL 0x00000000 -#define DDR_DDRCDR_1_VAL 0x00000000 /* TODO: from dump */ -#define DDR_DDRCDR_2_VAL 0x00000000 /* TODO: from dump */ +#define DDR_DDRCDR_1_VAL 0x80040000 +#define DDR_DDRCDR_2_VAL 0x00000001 #define DDR_ERR_INT_EN_VAL 0x0000001D #define DDR_ERR_SBE_VAL 0x00010000 diff --git a/include/fdt.h b/include/fdt.h index ff0fb44991..4962249b7f 100644 --- a/include/fdt.h +++ b/include/fdt.h @@ -156,6 +156,7 @@ int fdt_fixup_val(void* fdt, int off, const char* node, const char* name, uint32 int fdt_fixup_val64(void* fdt, int off, const char* node, const char* name, uint64_t val); int fdt_shrink(void* fdt); +int fdt_add_mem_rsv(void* fdt, uint64_t address, uint64_t size); /* FIT */ const char* fit_find_images(void* fdt, const char** pkernel, const char** pflat_dt); diff --git a/include/image.h b/include/image.h index 69b8dc83e1..ba3e4fb220 100644 --- a/include/image.h +++ b/include/image.h @@ -1474,7 +1474,11 @@ static inline int wb_flash_write_verify_word(struct wolfBoot_image *img, /* -- Image Formats -- */ /* Legacy U-Boot Image */ +#ifdef BIG_ENDIAN_ORDER +#define UBOOT_IMG_HDR_MAGIC 0x27051956UL +#else #define UBOOT_IMG_HDR_MAGIC 0x56190527UL +#endif #define UBOOT_IMG_HDR_SZ 64 /* --- Flattened Device Tree Blob */ diff --git a/src/boot_ppc.c b/src/boot_ppc.c index b1608aef99..5b5fbbeff9 100644 --- a/src/boot_ppc.c +++ b/src/boot_ppc.c @@ -25,6 +25,9 @@ #include "image.h" #include "loader.h" 
#include "wolfboot/wolfboot.h" +#ifdef MMU +#include "fdt.h" +#endif extern unsigned int __bss_start__; extern unsigned int __bss_end__; @@ -229,6 +232,219 @@ void flush_cache(uint32_t start_addr, uint32_t size) } #endif +#ifdef ENABLE_OS64BIT +/* Transition to 64-bit memory map for VxWorks 7 / Linux 64-bit. + * Equivalent to CW U-Boot's "ossel ostype2" bootm transition. + * Sets up 36-bit physical addressing per FUM Table 2.5 (64-bit Memory Map). + * Must run from RAMFUNCTION (DDR) since we modify flash/CCSR LAW and TLB. + * + * Key changes from 32-bit intermediary map (Table 2.6): + * - EA 0x80000000 → PA 0xC_00000000: DDR upper 2GB (was PCIe in intermediary) + * - All peripheral LAWs use BARH=0xF (36-bit physical addressing) + * - TLB MAS7 matches LAW BARH for flash and CCSR */ +static void RAMFUNCTION hal_os64bit_map_transition(void) +{ + /* Add peripheral LAWs needed by 64-bit OS (VxWorks 7, Linux 64-bit). + * CCSR, flash, and DDR lower LAWs are already set from boot assembly + * with CCSRBAR_PHYS_HIGH=0xF and FLASH_BASE_PHYS_HIGH=0xF. + * The CW U-Boot cw_late_mmap_adjust() only adjusts PCI LAWs/ATMUs; + * all other LAWs are set during U-Boot board init. wolfBoot adds the + * equivalent LAWs here. */ + + /* LAW table matching CW U-Boot law_table_lnx_vx7 (law.c). + * Flash (LAW0/1) and CCSR LAWs are set from boot assembly. + * CPLD LAW is set by hal_cpld_init. Add the remaining entries. 
*/ + + /* FPGA 32-bit registers 2MB at PA 0xF_EE400000 */ + set_law(6, 0xF, 0xEE400000, LAW_TRGT_IFC, LAW_SIZE_2MB, 1); + /* FPGA 8-bit registers 512KB at PA 0xF_EE600000 */ + set_law(7, 0xF, 0xEE600000, LAW_TRGT_IFC, LAW_SIZE_512KB, 1); + /* NVRAM 512KB at PA 0xF_EE700000 */ + set_law(8, 0xF, 0xEE700000, LAW_TRGT_IFC, LAW_SIZE_512KB, 1); + /* BMan portals 32MB at PA 0xF_EA000000 */ + set_law(9, 0xF, 0xEA000000, LAW_TRGT_BMAN, LAW_SIZE_32MB, 1); + /* QMan portals 32MB at PA 0xF_EC000000 */ + set_law(10, 0xF, 0xEC000000, LAW_TRGT_QMAN, LAW_SIZE_32MB, 1); + /* DCSR 4MB at PA 0xF_EE000000 */ + set_law(11, 0xF, 0xEE000000, LAW_TRGT_DCSR, LAW_SIZE_4MB, 1); + /* Note: DDR LAW is now 4GB at PA 0x0 (set in hal_ddr_init). + * No separate upper DDR LAW needed — the 4GB LAW covers all DDR. + * + * PCIe LAWs match CW VPX3-152 FUM Table 2.5 (ostype2 final 64-bit + * map, used by VxWorks 7 64-bit). Production CW U-Boot's + * cw_late_mmap_adjust() converts the intermediary 32-bit map + * (Table 2.6, 512MB/1GB PCIe windows at low PA upper) to this + * final layout when bootm runs. wolfBoot needs the same final + * layout at handoff. */ + /* PCIe4 (Switch) memory 2GB at PA 0xD_00000000, EA=0xC0000000 */ + set_law(13, 0xD, 0x00000000, LAW_TRGT_PCIE4, LAW_SIZE_2GB, 1); + /* PCIe4 I/O 256KB at PA 0xF_EE800000 */ + set_law(14, 0xF, 0xEE800000, LAW_TRGT_PCIE4, LAW_SIZE_256KB, 1); + /* PCIe1 (XMC) memory 256MB at PA 0xF_E0000000, EA=0xE0000000. + * (Table 2.5 lists 160MB but power-of-two LAW SIZE rounds to 256MB.) */ + set_law(15, 0xF, 0xE0000000, LAW_TRGT_PCIE1, LAW_SIZE_256MB, 1); + /* PCIe1 I/O 256KB at PA 0xF_EE840000 */ + set_law(16, 0xF, 0xEE840000, LAW_TRGT_PCIE1, LAW_SIZE_256KB, 1); + + /* TLB entries matching CW U-Boot tlb_table_lnx_vx7 (tlb.c). + * CCSR (Entry 1) and Flash (Entry 2) are already set from boot + * assembly with correct MAS7 = PHYS_HIGH. Add the remaining + * peripheral TLBs that VxWorks needs during early init. 
*/ + + /* TLB entries for peripheral access by the OS. + * IMPORTANT: Avoid ESELs used by wolfBoot: + * 0 = boot ROM (invalidated on VPX3), 1 = CCSR, 2 = flash, + * 9 = .ramcode (CPC SRAM / DDR), 12 = DDR lower 2GB. + * Use ESELs 3-8, 10-11, 13+ for new entries. */ + + /* BMan portals: EA 0xEA000000 → PA 0xF_EA000000, 32MB */ + set_tlb(1, 3, 0xEA000000, 0xEA000000, 0xF, + MAS3_SX | MAS3_SW | MAS3_SR, 0, 0, + BOOKE_PAGESZ_32M, 1); + /* QMan portals: EA 0xEC000000 → PA 0xF_EC000000, 32MB */ + set_tlb(1, 4, 0xEC000000, 0xEC000000, 0xF, + MAS3_SX | MAS3_SW | MAS3_SR, 0, 0, + BOOKE_PAGESZ_32M, 1); + /* DCSR: EA 0xEE000000 → PA 0xF_EE000000, 4MB */ + set_tlb(1, 5, 0xEE000000, 0xEE000000, 0xF, + MAS3_SX | MAS3_SW | MAS3_SR, MAS2_I | MAS2_G, 0, + BOOKE_PAGESZ_4M, 1); + /* FPGA/NVRAM: EA 0xEE400000 → PA 0xF_EE400000, 4MB */ + set_tlb(1, 6, 0xEE400000, 0xEE400000, 0xF, + MAS3_SX | MAS3_SW | MAS3_SR, MAS2_I | MAS2_G, 0, + BOOKE_PAGESZ_4M, 1); + /* PCIe4 (Switch) memory: EA 0xC0000000 → PA 0xD_00000000, 2GB + * (Table 2.5 final ostype2 64-bit map, post-cw_late_mmap_adjust). + * Was previously labelled "PCIe1 EA=0xC0000000" -- but per the + * CW FUM, EA=0xC0000000 is PCIe4 (Switch), not PCIe1. NOTE(review): a 2GB window starting at EA 0xC0000000 extends past 0xFFFFFFFF and overlaps the EA ranges mapped by the other entries here (0xE0000000, 0xEA000000, ...); confirm the intended page size and alignment. */ + set_tlb(1, 7, 0xC0000000, 0x00000000, 0xD, + MAS3_SX | MAS3_SW | MAS3_SR, MAS2_I | MAS2_G, 0, + BOOKE_PAGESZ_2G, 1); + /* PCIe1 (XMC) memory: EA 0xE0000000 → PA 0xF_E0000000, 256MB. + * Table 2.5 lists 160MB; round up to 256MB for the power-of-two + * page-size encoding. */ + set_tlb(1, 8, 0xE0000000, 0xE0000000, 0xF, + MAS3_SX | MAS3_SW | MAS3_SR, MAS2_I | MAS2_G, 0, + BOOKE_PAGESZ_256M, 1); + /* Note: EA 0x80000000 -> PA 0xC_00000000 is RAM space (DDR upper 2GB) + * in the FUM Table 2.5 final 64-bit map; VxWorks creates that mapping + * itself, so we don't add it here.
*/ + /* PCIe1 I/O: EA 0xEE840000 → PA 0xF_EE840000, 256KB */ + set_tlb(1, 10, 0xEE840000, 0xEE840000, 0xF, + MAS3_SX | MAS3_SW | MAS3_SR, MAS2_I | MAS2_G, 0, + BOOKE_PAGESZ_256K, 1); + /* PCIe4 I/O: EA 0xEE800000 → PA 0xF_EE800000, 256KB */ + set_tlb(1, 11, 0xEE800000, 0xEE800000, 0xF, + MAS3_SX | MAS3_SW | MAS3_SR, MAS2_I | MAS2_G, 0, + BOOKE_PAGESZ_256K, 1); + + /* Move DDR mapping from TLB1 slot 12 to slot 0 for VxWorks compat. + * VxWorks 7's early entry stub iterates TLB1 from slot 1 up to NTLB + * invalidating each entry, then reads slot 0 expecting it to contain + * the DDR mapping (uses MAS2/MAS3 from slot 0 to compute PA-from-VA). + * U-Boot ends up with DDR at slot 0 by accident of find_free_tlbcam(); + * wolfBoot pins DDR at slot 12 (boot_ppc_start.S), which gets wiped + * by VxWorks's loop, leaving slot 0 invalid. Result: VxWorks reads + * garbage MAS regs and silently fails after the OS jump. + * + * Two TLB entries cannot map the same EA range (multi-hit -> machine + * check), so invalidate slot 12 BEFORE writing slot 0. + * + * Safe to do here because hal_os64bit_map_transition is RAMFUNCTION + * (executes from .ramcode at TLB1 slot 9, EA 0xEE900000) — DDR access + * is not required during these few instructions, only on return. */ + { + /* Invalidate slot 12 (DDR). MAS1 V=0, IPROT=0. */ + mtspr(MAS0, MAS0_TLBSEL(1) | MAS0_ESEL(12)); + mtspr(MAS1, 0); + mtspr(MAS2, 0); + mtspr(MAS3, 0); + mtspr(MAS7, 0); + __asm__ __volatile__("isync; tlbwe; isync; msync" ::: "memory"); + } + /* DDR at slot 0: EA 0 -> PA 0, 2GB, cached (MAS2_M). */ + set_tlb(1, 0, 0x00000000, 0x00000000, 0, + MAS3_SX | MAS3_SW | MAS3_SR, MAS2_M, 0, + BOOKE_PAGESZ_2G, 1); + + __asm__ __volatile__("sync; isync"); +} + +/* Final OS-handoff trampoline. Lives in .ramcode (DDR) so the bctr to + * the OS entry executes from DDR -- matching production U-Boot's + * pattern (U-Boot relocates itself to DDR before bootm). + * + * Steps: + * 1. 
Relocate the exception handler (isr_empty) from flash to DDR + * and re-point IVPR. wolfBoot is XIP from flash; the handler + * lives at flash 0xFFFE0000. Once the next step puts flash in + * cache-inhibit + guarded, the e6500 fetcher cannot service the + * handler instruction stream and any exception silent-hangs. + * Copying the handler to DDR and re-pointing IVPR there fixes + * this and is what U-Boot effectively does (its IVPR points to + * its DDR-relocated code, not flash). + * 2. Call hal_flash_cache_disable_pre_os to switch the flash TLB to + * MAS2_I|MAS2_G (matches the OS's pre-init state). + * 3. Drain the UART, sync the pipeline, indirect-jump to entry. */ +typedef void (*os64bit_entry_t)(uintptr_t r3, uintptr_t r4, uintptr_t r5, + uintptr_t r6, uintptr_t r7, uintptr_t r8, uintptr_t r9); + +/* Address of the DDR-relocated exception handler. Must be 4KB-aligned + * for IVPR and well above the kernel-image load region (typical OS + * load is 0x100000-0x700000). */ +#ifndef WOLFBOOT_OS64BIT_IVPR_DDR +#define WOLFBOOT_OS64BIT_IVPR_DDR 0x00800000UL +#endif + +void RAMFUNCTION wolfBoot_os64bit_jump(os64bit_entry_t entry, + uintptr_t r3, uintptr_t r6, uintptr_t r7) +{ + extern void hal_flash_cache_disable_pre_os(void); + extern unsigned int isr_empty; + extern unsigned int isr_empty_end; + { + volatile uint32_t *src = (volatile uint32_t *)&isr_empty; + volatile uint32_t *dst = (volatile uint32_t *) + WOLFBOOT_OS64BIT_IVPR_DDR; + uintptr_t bytes = (uintptr_t)&isr_empty_end - + (uintptr_t)&isr_empty; + /* Word count, rounded up. Cache-line range, rounded up to + * 64-byte lines (one e6500 dcbz/dcbst granule). */ + uintptr_t copy_words = (bytes + sizeof(uint32_t) - 1U) / + sizeof(uint32_t); + uintptr_t cache_words = ((copy_words + 15U) / 16U) * 16U; + uintptr_t i; + + /* Reserve enough headroom in the IVPR landing zone for the + * handler to grow without silently truncating the copy. 
*/ + #define WOLFBOOT_OS64BIT_IVPR_MAX 0x400U /* 1 KB */ + if (bytes > WOLFBOOT_OS64BIT_IVPR_MAX) { + return; /* refuse to jump with a truncated handler */ + } + + for (i = 0; i < copy_words; i++) { + dst[i] = src[i]; + } + /* Make the DDR copy visible to the I-cache fetcher. */ + for (i = 0; i < cache_words; i += 16U) { /* 64 bytes per line */ + __asm__ __volatile__("dcbst 0,%0" :: "r"(&dst[i]) : "memory"); + } + __asm__ __volatile__("sync" ::: "memory"); + for (i = 0; i < cache_words; i += 16U) { + __asm__ __volatile__("icbi 0,%0" :: "r"(&dst[i]) : "memory"); + } + __asm__ __volatile__("sync; isync" ::: "memory"); + /* IVOR offsets stay at 0 (set during early reset) so every + * interrupt type lands at this single handler. */ + mtspr(IVPR, (uint32_t)dst); + __asm__ __volatile__("isync" ::: "memory"); + } + hal_flash_cache_disable_pre_os(); + __asm__ __volatile__("sync; isync" ::: "memory"); + entry(r3, 0, 0, r6, r7, 0, 0); +} +#endif /* ENABLE_OS64BIT */ + #ifdef MMU void do_boot(const uint32_t *app_offset, const uint32_t* dts_offset) #else @@ -246,19 +462,355 @@ void do_boot(const uint32_t *app_offset) hal_dts_fixup((uint32_t*)dts_offset); #endif -#ifndef BUILD_LOADER_STAGE1 - /* invalidate cache */ - flush_cache((uint32_t)app_offset, L1_CACHE_SZ); +#if defined(DEBUG_UART) && defined(WOLFBOOT_ARCH_PPC) + wolfBoot_printf("do_boot: entry=%p, dts=%p\n", + app_offset, + #ifdef MMU + dts_offset + #else + (void*)0 + #endif + ); +#endif - /* Disable all async interrupts */ - msr = mfmsr(); - msr &= ~(MSR_CE | MSR_ME | MSR_DE); +#ifndef BUILD_LOADER_STAGE1 + /* Flush the entire OS image region from D-cache before jump. PowerPC + * I/D caches are not coherent — without dcbst over the whole image, + * the I-cache may fetch stale instructions for any portion that was + * touched by the loader/signature verification but not yet written + * back. 
Previous code only flushed L1_CACHE_SZ (32 KB), far too small + * for a 6+ MB VxWorks kernel — likely cause of silent jump failure. */ + flush_cache((uint32_t)app_offset, WOLFBOOT_PARTITION_SIZE); + + /* Set MSR to match U-Boot's pre-VxWorks state: 0x2200 + * FP(bit13) + DE(bit9) enabled. All others cleared. + * U-Boot passes MSR=0x2200 when jumping to VxWorks. */ + msr = 0x00002200; mtmsr(msr); #endif +#if defined(DEBUG_UART) && defined(WOLFBOOT_ARCH_PPC) + wolfBoot_printf("do_boot: pre-transition\n"); +#endif + +#if defined(MMU) && !defined(BUILD_LOADER_STAGE1) + /* Flush FDT from D-cache to DDR so the OS sees all fixup changes. + * Must be done after hal_dts_fixup and before entry(). Stage1 + * doesn't run hal_dts_fixup, so the flush isn't needed (and + * flush_cache itself is excluded from stage1). */ + flush_cache((uint32_t)dts_offset, WOLFBOOT_DTS_MAX_SIZE); +#endif + +#ifdef ENABLE_OS64BIT + /* Transition LAWs and TLBs to 64-bit physical addressing. */ + hal_os64bit_map_transition(); +#endif + +#if defined(DEBUG_UART) && defined(WOLFBOOT_ARCH_PPC) && \ + defined(WOLFBOOT_PPC_PRE_OS_DUMP) + /* Comprehensive pre-OS state dump (SPRs, CCSR, LAWs, TLB1, FDT, IFC, + * DDR, DUART, spin-table, ePAPR args, kernel-entry bytes). Useful + * for diff'ing wolfBoot's pre-jump state against a parallel dump + * from a known-working U-Boot bootm path when porting a new OS. + * Opt-in via -DWOLFBOOT_PPC_PRE_OS_DUMP -- the DUART DLAB toggle + * inside the dump can interfere with the active console and was + * observed to keep VxWorks 7 silent on CW VPX3-152 when enabled. 
*/ + { + uint32_t _spr_pir = mfspr(SPRN_PIR); + uint32_t _spr_pvr = mfspr(SPRN_PVR); + uint32_t _spr_svr = mfspr(SPRN_SVR); + uint32_t _spr_hid0 = mfspr(SPRN_HID0); + uint32_t _spr_bucsr = mfspr(SPRN_BUCSR); + uint32_t _spr_hdbcr0 = mfspr(SPRN_HDBCR0); + uint32_t _spr_hdbcr1 = mfspr(SPRN_HDBCR1); + uint32_t _spr_hdbcr2 = mfspr(SPRN_HDBCR2); + uint32_t _spr_hdbcr7 = mfspr(SPRN_HDBCR7); + uint32_t _spr_l1csr0 = mfspr(L1CSR0); + uint32_t _spr_l1csr1 = mfspr(L1CSR1); + /* L2CSR0/1 (SPR 1017/1018) faults in U-Boot's MSR context with + * a hang/checkstop -- skip in both bootloaders for parity. */ + uint32_t _spr_tcr = mfspr(SPRN_TCR); + uint32_t _spr_tsr = mfspr(SPRN_TSR); + uint32_t _spr_esr = mfspr(SPRN_ESR); + uint32_t _spr_mcsr = mfspr(SPRN_MCSR); + uint32_t _spr_dbcr0 = mfspr(SPRN_DBCR0); + uint32_t _spr_dbsr = mfspr(SPRN_DBSR); + uint32_t _spr_tlb1cfg = mfspr(SPRN_TLB1CFG); + uint32_t _spr_mmucfg = mfspr(SPRN_MMUCFG); + /* Extended SPR set: MAS4, debug compares, decrementer, TB, PIDs, + * SPRGs, LR/CTR/XER/CR. NOTE: HID1 (1009), EPCR (307), MMUCSR0 + * (1012), PID1 (633), PID2 (634) and DECAR (54) all fault here + * even after do_boot's MSR write to 0x2200 above (likely + * MSR[GS]/[CM] residuals or hypervisor-only access on e6500), + * so they're omitted to keep the dump path alive.
*/ + uint32_t _spr_mas4 = mfspr(0x274); + uint32_t _spr_dbcr1 = mfspr(SPRN_DBCR1); + uint32_t _spr_dbcr2 = mfspr(0x136); + uint32_t _spr_iac1 = mfspr(0x138); + uint32_t _spr_iac2 = mfspr(0x139); + uint32_t _spr_dac1 = mfspr(0x13C); + uint32_t _spr_dac2 = mfspr(0x13D); + uint32_t _spr_dec = mfspr(SPRN_DEC); + uint32_t _spr_tbl = mfspr(0x10C); + uint32_t _spr_tbu = mfspr(0x10D); + uint32_t _spr_pid0 = mfspr(SPRN_PID); + uint32_t _spr_sprg0 = mfspr(0x110); + uint32_t _spr_sprg1 = mfspr(0x111); + uint32_t _spr_sprg2 = mfspr(0x112); + uint32_t _spr_sprg3 = mfspr(0x113); + uint32_t _spr_sprg4 = mfspr(0x114); + uint32_t _spr_sprg5 = mfspr(0x115); + uint32_t _spr_sprg6 = mfspr(0x116); + uint32_t _spr_sprg7 = mfspr(0x117); + uint32_t _spr_lr = mfspr(0x008); + uint32_t _spr_ctr = mfspr(0x009); + uint32_t _spr_xer = mfspr(0x001); + uint32_t _spr_cr; + uint32_t _spr_r1; + __asm__ __volatile__("mfcr %0" : "=r"(_spr_cr)); + __asm__ __volatile__("mr %0, 1" : "=r"(_spr_r1)); + wolfBoot_printf("SPR PIR=%x PVR=%x SVR=%x HID0=%x BUCSR=%x\n", + _spr_pir, _spr_pvr, _spr_svr, _spr_hid0, _spr_bucsr); + wolfBoot_printf("SPR HDBCR0=%x HDBCR1=%x HDBCR2=%x HDBCR7=%x\n", + _spr_hdbcr0, _spr_hdbcr1, _spr_hdbcr2, _spr_hdbcr7); + wolfBoot_printf("SPR L1CSR0=%x L1CSR1=%x\n", + _spr_l1csr0, _spr_l1csr1); + wolfBoot_printf("SPR TCR=%x TSR=%x DEC=%x TBL=%x TBU=%x\n", + _spr_tcr, _spr_tsr, _spr_dec, _spr_tbl, _spr_tbu); + wolfBoot_printf("SPR ESR=%x MCSR=%x DBCR0=%x DBCR1=%x DBCR2=%x DBSR=%x\n", + _spr_esr, _spr_mcsr, _spr_dbcr0, _spr_dbcr1, _spr_dbcr2, _spr_dbsr); + wolfBoot_printf("SPR IAC1=%x IAC2=%x DAC1=%x DAC2=%x\n", + _spr_iac1, _spr_iac2, _spr_dac1, _spr_dac2); + wolfBoot_printf("SPR TLB1CFG=%x MMUCFG=%x MAS4=%x PID0=%x\n", + _spr_tlb1cfg, _spr_mmucfg, _spr_mas4, _spr_pid0); + wolfBoot_printf("SPR SPRG0=%x SPRG1=%x SPRG2=%x SPRG3=%x\n", + _spr_sprg0, _spr_sprg1, _spr_sprg2, _spr_sprg3); + wolfBoot_printf("SPR SPRG4=%x SPRG5=%x SPRG6=%x SPRG7=%x\n", + _spr_sprg4, _spr_sprg5, _spr_sprg6, 
_spr_sprg7); + wolfBoot_printf("SPR LR=%x CTR=%x XER=%x CR=%x R1=%x\n", + _spr_lr, _spr_ctr, _spr_xer, _spr_cr, _spr_r1); + /* CCSR snapshot — use raw offsets so we don't depend on + * nxp_t2080.h here (boot_ppc.c is generic across PPC targets). */ + wolfBoot_printf("CCSR LCC_BSTRH=%x BSTRL=%x BSTAR=%x\n", + get32((volatile uint32_t*)(CCSRBAR + 0x20)), + get32((volatile uint32_t*)(CCSRBAR + 0x24)), + get32((volatile uint32_t*)(CCSRBAR + 0x28))); + wolfBoot_printf("CCSR DCFG_BRR=%x RCPM_PCTBENR=%x PIC_WHOAMI=%x\n", + get32((volatile uint32_t*)(CCSRBAR + 0xE00E4)), /* DCFG_BRR */ + get32((volatile uint32_t*)(CCSRBAR + 0xE21A0)), /* RCPM_PCTBENR */ + get32((volatile uint32_t*)(CCSRBAR + 0x40090))); /* PIC_WHOAMI */ + wolfBoot_printf("CCSR DCFG_DEVDISR=%x DDR_CFG=%x DDR_CS0=%x DDR_CS1=%x\n", + get32((volatile uint32_t*)(CCSRBAR + 0xE0070)), /* DCFG_DEVDISR1 */ + get32((volatile uint32_t*)(CCSRBAR + 0x8110)), + get32((volatile uint32_t*)(CCSRBAR + 0x8000)), + get32((volatile uint32_t*)(CCSRBAR + 0x8008))); + } + + /* Dump final state AFTER transition + MSR clear */ + { + int _j; + uint32_t _msr2; + uint32_t lawar; + uint32_t mas1v, mas2v, mas3v, mas7v; + #ifdef MMU + const uint8_t *_fdt_p; + uint32_t _fdt_sz; + uint32_t _k; + #endif + __asm__ __volatile__("mfmsr %0" : "=r" (_msr2)); + wolfBoot_printf("FINAL MSR=0x%x\n", _msr2); + wolfBoot_printf("FINAL LAW:\n"); + for (_j = 0; _j < 20; _j++) { + lawar = get32(LAWAR(_j)); + if (lawar & LAWAR_ENABLE) { + wolfBoot_printf(" %d: H=%x L=%x W=%x\n", _j, + get32(LAWBARH(_j)), get32(LAWBARL(_j)), lawar); + } + } + /* Dump TLB1 (compare against U-Boot's pre-VxWorks TLB state) */ + wolfBoot_printf("FINAL TLB1:\n"); + for (_j = 0; _j < 64; _j++) { + mtspr(MAS0, MAS0_TLBSEL(1) | MAS0_ESEL(_j)); + __asm__ __volatile__("tlbre; isync" ::: "memory"); + mas1v = mfspr(MAS1); + if ((mas1v & MAS1_VALID) == 0) + continue; + mas2v = mfspr(MAS2); + mas3v = mfspr(MAS3); + mas7v = mfspr(MAS7); + wolfBoot_printf(" %d: M1=%x M2=%x M3=%x M7=%x\n", 
+ _j, mas1v, mas2v, mas3v, mas7v); + } + #ifdef MMU + /* FDT body dump removed -- byte-level diff vs U-Boot already + * verified the patched FDT matches U-Boot's structurally. Emit + * just a header line + size + END marker so existing test scripts + * can still grep for "FDT END" as their progress checkpoint. */ + _fdt_p = (const uint8_t *)dts_offset; + _fdt_sz = (uint32_t)fdt_totalsize(dts_offset); + wolfBoot_printf("FDT magic=%02x%02x%02x%02x size=%u\n", + _fdt_p[0], _fdt_p[1], _fdt_p[2], _fdt_p[3], _fdt_sz); + (void)_k; + wolfBoot_printf("=== FDT END ===\n"); + #endif + + /* Comprehensive IFC chip-select dump. CCSRBAR + 0x124000. + * Per T2080RM and U-Boot's struct fsl_ifc, all per-CS register + * blocks (CSPR/AMASK/CSOR) use stride 0xC, NOT 0x20: + * CSPR_EXT(n) @ 0xC + n*0xC, CSPR(n) @ 0x10 + n*0xC + * AMASK(n) @ 0xA0 + n*0xC + * CSOR(n) @ 0x130 + n*0xC, CSOR_EXT(n) @ 0x134 + n*0xC */ + wolfBoot_printf("IFC:\n"); + for (_j = 0; _j < 4; _j++) { + uint32_t cspr = get32((volatile uint32_t*) + (CCSRBAR + 0x124010 + _j*0xC)); + if (cspr & 1) { + wolfBoot_printf(" CS%d: E=%x C=%x A=%x SOR=%x SX=%x\n", _j, + get32((volatile uint32_t*)(CCSRBAR + 0x12400C + _j*0xC)), + cspr, + get32((volatile uint32_t*)(CCSRBAR + 0x1240A0 + _j*0xC)), + get32((volatile uint32_t*)(CCSRBAR + 0x124130 + _j*0xC)), + get32((volatile uint32_t*)(CCSRBAR + 0x124134 + _j*0xC))); + } + } + + /* DDR controller (CCSRBAR + 0x8000). T2080RM lists in offsets + * relative to DDR base. Dump the registers most likely to differ + * between U-Boot and wolfBoot for a 4GB ostype2 build. 
*/ + wolfBoot_printf("DDR: CS0B=%x CS0C=%x CS1B=%x CS1C=%x\n", + get32((volatile uint32_t*)(CCSRBAR + 0x8000)), /* CS0_BNDS */ + get32((volatile uint32_t*)(CCSRBAR + 0x8080)), /* CS0_CONFIG */ + get32((volatile uint32_t*)(CCSRBAR + 0x8008)), /* CS1_BNDS */ + get32((volatile uint32_t*)(CCSRBAR + 0x8084)));/* CS1_CONFIG */ + /* T2080 DDR timing register offsets: + * TIMING_CFG_3 @ 0x100, _0 @ 0x104, _1 @ 0x108, _2 @ 0x10C */ + wolfBoot_printf("DDR: TC3=%x TC0=%x TC1=%x TC2=%x\n", + get32((volatile uint32_t*)(CCSRBAR + 0x8100)), + get32((volatile uint32_t*)(CCSRBAR + 0x8104)), + get32((volatile uint32_t*)(CCSRBAR + 0x8108)), + get32((volatile uint32_t*)(CCSRBAR + 0x810C))); + wolfBoot_printf("DDR: CFG=%x CFG2=%x MODE=%x INT=%x\n", + get32((volatile uint32_t*)(CCSRBAR + 0x8110)), /* SDRAM_CFG */ + get32((volatile uint32_t*)(CCSRBAR + 0x8114)), /* SDRAM_CFG_2 */ + get32((volatile uint32_t*)(CCSRBAR + 0x8118)), /* SDRAM_MODE */ + get32((volatile uint32_t*)(CCSRBAR + 0x8124)));/* SDRAM_INTERVAL */ + wolfBoot_printf("DDR: ERR_DET=%x ERR_DIS=%x DATA_INIT=%x\n", + get32((volatile uint32_t*)(CCSRBAR + 0x8E40)), /* ERR_DETECT */ + get32((volatile uint32_t*)(CCSRBAR + 0x8E44)), /* ERR_DISABLE */ + get32((volatile uint32_t*)(CCSRBAR + 0x8128)));/* SDRAM_DATA_INIT */ + + /* DUART block (NS16550-style). T2080: DUART1 @ +0x11C500, + * DUART2 @ +0x11D500 (per nxp_t2080.h UART_BASE stride). + * CRITICAL: don't toggle DLAB while printf is mid-flight on the + * SAME UART -- doing so makes printf write to DLL instead of THR + * and corrupts the baud rate / output. So: + * 1. Read non-destructive regs into locals + * 2. Print them (DLAB=0 so printf works normally) + * 3. Toggle DLAB, snapshot DLL/DLM into locals + * 4. Restore LCR (printf works again) + * 5. 
Print DLL/DLM from the saved values */ + { + int _u; + volatile uint8_t *base; + uint8_t lcr, mcr, lsr, msr_r, scr, dll, dlm; + for (_u = 0; _u < 2; _u++) { + base = (volatile uint8_t *)(CCSRBAR + 0x11C500 + _u * 0x1000); + lcr = base[3]; + mcr = base[4]; + lsr = base[5]; + msr_r = base[6]; + scr = base[7]; + wolfBoot_printf("DUART%d: LCR=%x MCR=%x LSR=%x MSR=%x SCR=%x", + _u + 1, lcr, mcr, lsr, msr_r, scr); + base[3] = lcr | 0x80; /* DLAB=1, capture divisor */ + dll = base[0]; + dlm = base[1]; + base[3] = lcr; /* restore -- printf safe now */ + wolfBoot_printf(" DLL=%x DLM=%x\n", dll, dlm); + } + } + + /* Spin-table dump. g_spin_table_ddr is set by hal_mp_init() to the + * DDR-resident copy used for cpu-release-addr in the FDT. Each entry + * is ENTRY_SIZE (64) bytes; we dump the first 32 bytes per core + * (covers ADDR_UPPER/LOWER, R3, R6/RESV, PIR). */ + { + extern uint32_t g_spin_table_ddr; + volatile uint32_t *_st; + int _c, _w; + wolfBoot_printf("SPIN_TABLE @ %x:\n", g_spin_table_ddr); + if (g_spin_table_ddr != 0) { + for (_c = 0; _c < 4; _c++) { + _st = (volatile uint32_t *) + (g_spin_table_ddr + _c * 64); + wolfBoot_printf(" C%d:", _c); + for (_w = 0; _w < 8; _w++) { + wolfBoot_printf(" %x", _st[_w]); + } + wolfBoot_printf("\n"); + } + } + } + } + + /* Args/jump-target summary line — what VxWorks will see at entry. + * U-Boot prints "## Starting vxWorks at 0x%lx, device tree at 0x%lx" + * — print the equivalent here for direct comparison. */ + wolfBoot_printf("JUMP: entry=%p dts=%p r6=%x r7=%x\n", + app_offset, + #ifdef MMU + dts_offset, + #else + (void*)0, + #endif + EPAPR_MAGIC, WOLFBOOT_BOOTMAPSZ); + /* Sanity dump of jump target -- first 256 bytes (8 lines x 32 bytes). + * Tag with K (kernel) prefix so byte-level diff against U-Boot capture + * is mechanical via the FDT-style parser.
*/ + { + const uint8_t *_pc = (const uint8_t *)app_offset; + uint32_t _kk; + wolfBoot_printf("=== PC BYTES (256 bytes from entry) ===\n"); + for (_kk = 0; _kk < 256; _kk++) { + if ((_kk & 0x1F) == 0) + wolfBoot_printf("\nK%05x:", _kk); + wolfBoot_printf(" %02x", _pc[_kk]); + } + wolfBoot_printf("\n=== PC END ===\n"); + } +#endif + +#if defined(DEBUG_UART) && defined(WOLFBOOT_ARCH_PPC) + wolfBoot_printf("do_boot: jumping\n"); +#endif + /* ePAPR (Embedded Power Architecture Platform Requirements) * https://elinux.org/images/c/cf/Power_ePAPR_APPROVED_v1.1.pdf - */ + * + * For ENABLE_OS64BIT (CW VPX3-152 VxWorks 7), use the RAMFUNCTION + * trampoline `wolfBoot_os64bit_jump` so the final cache-disable + + * indirect bctr execute from DDR rather than flash. wolfBoot is XIP + * from flash by default; production CW U-Boot relocates itself to + * DDR before bootm. Mirroring that placement removes one more + * difference at the OS-jump moment. */ +#if defined(ENABLE_OS64BIT) && defined(WOLFBOOT_ARCH_PPC) + wolfBoot_os64bit_jump((os64bit_entry_t)entry, + #ifdef MMU + (uintptr_t)dts_offset, + #else + 0, + #endif + EPAPR_MAGIC, + WOLFBOOT_BOOTMAPSZ); +#else +#if defined(WOLFBOOT_ARCH_PPC) && defined(BOARD_CW_VPX3152) + /* Non-OS64BIT path: flash-cache-disable + entry from flash. */ + { + extern void hal_flash_cache_disable_pre_os(void); + hal_flash_cache_disable_pre_os(); + } +#endif + entry( #ifdef MMU (uintptr_t)dts_offset, /* r3 = dts address */ @@ -267,9 +819,10 @@ void do_boot(const uint32_t *app_offset) #endif 0, 0, EPAPR_MAGIC, /* r6 = ePAPR magic */ - WOLFBOOT_PARTITION_SIZE, /* r7 = Size of Initial Mapped Area (IMA) */ + WOLFBOOT_BOOTMAPSZ, /* r7 = Size of Initial Mapped Area (IMA) */ 0, 0 ); +#endif } void arch_reboot(void) diff --git a/src/boot_ppc_mp.S b/src/boot_ppc_mp.S index 1473127b09..bd9fe1094e 100644 --- a/src/boot_ppc_mp.S +++ b/src/boot_ppc_mp.S @@ -27,6 +27,21 @@ #define TORESET(x) (x - _secondary_start_page + BOOT_ROM_ADDR) +/* e6500 has 64-bit GPRs. 
When loading 32-bit addresses with bit 31 set + * (addresses >= 0x80000000), the lis instruction sign-extends, putting + * 0xFFFFFFFF in the upper 32 bits. This causes memory access failures. + * Use LOAD_ADDR32 macro to properly load 32-bit addresses on e6500. */ +#ifdef CORE_E6500 +#define LOAD_ADDR32(reg, addr) \ + li reg, 0; \ + oris reg, reg, (addr)@h; \ + ori reg, reg, (addr)@l +#else +#define LOAD_ADDR32(reg, addr) \ + lis reg, (addr)@h; \ + ori reg, reg, (addr)@l +#endif + /* Additional cores (mp) assembly code for core minimum startup and spin table. * All code must fit in 4KB, which gets virtually mapped via the TLB1 (MMU) and * loaded by core 0. Spin table entry TLB1(0) mapped for work is 64MB. @@ -34,11 +49,69 @@ .section .bootmp, "ax" .globl _secondary_start_page _secondary_start_page: - /* Time base, MAS7 and machine check pin enable */ - lis r0, (HID0_EMCP | HID0_TBEN | HID0_ENMAS7)@h - ori r0, r0, (HID0_EMCP | HID0_TBEN | HID0_ENMAS7)@l + /* Time base, MAS7 and machine check pin enable. + * HID0_EMCP=0x80000000 has bit 31 set; use LOAD_ADDR32 for e6500. */ + LOAD_ADDR32(r0, (HID0_EMCP | HID0_TBEN | HID0_ENMAS7)) mtspr SPRN_HID0, r0 +#ifdef TARGET_nxp_t2080 + /* T2080 rev-1 e6500 erratum workarounds — must match primary + * (boot_ppc_start.S _reset). Mirrors CW U-Boot release.S + * CONFIG_SYS_FSL_ERRATUM_* set. Without these, secondary cores + * may exhibit cache speculation / DVA-stale / coherency bugs. + * VPX3-152 ships rev 1.1 (PVR 0x85380011). 
*/ + mfspr r3, SPRN_PVR + rlwinm r3, r3, 28, 0xf /* major_rev */ + cmpwi r3, 0x1 + bne mp_t2080_errata_done + + /* A004779 — HDBCR2[32]=1, [33:39]=3, [41:47]=3 */ + msync + isync + mfspr r3, SPRN_HDBCR2 + lis r4, 0xff7f + andc r3, r3, r4 + oris r3, r3, 0x8303 + mtspr SPRN_HDBCR2, r3 + isync + + /* A004786 — HDBCR7 |= 0x80000000 */ + msync + isync + mfspr r3, SPRN_HDBCR7 + oris r3, r3, 0x8000 + mtspr SPRN_HDBCR7, r3 + isync + + /* A004792 — HDBCR0 |= 0x4000 */ + msync + isync + mfspr r3, SPRN_HDBCR0 + ori r3, r3, 0x4000 + mtspr SPRN_HDBCR0, r3 + isync + + /* A004806 — HDBCR7 |= 0x20000000 (DVA stale-data fix) */ + msync + isync + mfspr r3, SPRN_HDBCR7 + oris r3, r3, 0x2000 + mtspr SPRN_HDBCR7, r3 + isync + + /* A004809 — HDBCR0 |= 0x01008000 */ + msync + isync + mfspr r3, SPRN_HDBCR0 + oris r3, r3, 0x0100 + ori r3, r3, 0x8000 + mtspr SPRN_HDBCR0, r3 + isync + +mp_t2080_errata_done: + /* A003999 NOT applied -- see boot_ppc_start.S for rationale. */ +#endif /* TARGET_nxp_t2080 */ + #ifdef CORE_E500 /* Set addr streaming & broadcast * and optimized sync instruction (if rev 5.0 or greater) */ @@ -98,9 +171,10 @@ branch_prediction: andi. r1, r3, L1CSR_CE@l beq 2b - /* Get our PIR to figure out our table entry */ - lis r3, TORESET(_spin_table_addr)@h - ori r3, r3, TORESET(_spin_table_addr)@l + /* Get our PIR to figure out our table entry. + * TORESET(...) resolves to address near BOOT_ROM_ADDR (0xFFFFF000), + * bit 31 set; use LOAD_ADDR32 for e6500. 
*/ + LOAD_ADDR32(r3, TORESET(_spin_table_addr)) lwz r3, 0(r3) /* Use PIR to determine cluster/core for spin table base at r10 */ @@ -108,6 +182,8 @@ branch_prediction: #if defined(CORE_E5500) || defined(CORE_E6500) rlwinm r8, r0, 29, 0x03 /* r8 = core within cluster */ srwi r10, r0, 5 /* r10 = cluster */ + mr r9, r10 /* r9 = cluster (preserved across r10 reuse + * below as spin-table base) */ mulli r5, r10, CORES_PER_CLUSTER add r5, r5, r8 /* r5 = linear core ID */ @@ -119,7 +195,7 @@ branch_prediction: mr r4, r0 mr r5, r4 #endif - slwi r8, r5, 6 /* spin table is padded to 64 bytes */ + slwi r8, r5, 6 /* multiply by ENTRY_SIZE (64 bytes) */ /* use r10 for the spin table base address */ add r10, r3, r8 @@ -132,15 +208,51 @@ branch_prediction: mtspr L1CSR2, r8 #if defined(CORE_E6500) /* --- L2 E6500 --- */ - /* e6500 L2 is per-cluster (shared by all cores in the cluster). - * The primary core already invalidated and enabled L2 during boot. - * Secondary cores must NOT do L2FI (flash invalidate) — it discards - * ALL dirty L2 lines including the primary core's stack, return - * addresses, and cached code, causing the primary core to crash - * (typically SRR0=0 from corrupted return address). - * L1 stash ID (set above via L1CSR2 SPR) is per-core and sufficient. - * L2CSR1 (stash ID) is per-cluster and already set by core 0. - * No CCSR TLB mapping needed since we skip L2 register access. */ + /* e6500 L2 is per-cluster. Primary core already enabled cluster 0's + * L2. Secondary cores in cluster 0 (siblings to primary) MUST NOT + * L2FI -- it discards primary's dirty L2 lines (stack, return + * addresses, cached code) and crashes the primary (typically + * SRR0=0 from corrupted return address). + * + * Secondaries in clusters > 0 (no primary present) MUST do L2FI + * + enable, otherwise their cluster's L2 is left in unknown reset + * state and any cached access from those cores hits cold. 
U-Boot's + * release.S unconditionally L2FIs, but it's safe there because + * U-Boot has already jumped to the OS by the time secondaries + * release -- primary's wolfBoot stack/code is not in use anymore. + * wolfBoot releases secondaries during hal_mp_init before the OS + * jump, so we MUST gate L2FI on (cluster != primary's cluster). + * + * r9 holds the cluster ID (preserved before r10 was repurposed as + * the spin-table base). Primary boots on cluster 0, so skip L2 + * init when r9 == 0. Note: gating on r5 (linear core ID) would be + * wrong -- secondaries in cluster 0 have cluster=0 but + * linear core ID != 0 and would still hit L2FI. */ + cmpwi r9, 0 + beq l2_init_skip_e6500 + + msync + lis r2, (L2CSR0_L2FI | L2CSR0_L2LFC)@h + ori r2, r2, (L2CSR0_L2FI | L2CSR0_L2LFC)@l + mtspr L2CSR0, r2 +l2_poll_invclear_e6500: + mfspr r3, L2CSR0 + and. r1, r3, r2 + bne l2_poll_invclear_e6500 + + /* Set L2 stash ID = (coreID * 2) + 32 + L2 (1) on this cluster. */ + addi r3, r8, 1 + mtspr L2CSR1, r3 + + /* Enable L2, no parity. L2CSR0_L2E = 0x80000000. */ + LOAD_ADDR32(r3, L2CSR0_L2E) + mtspr L2CSR0, r3 + isync +l2_poll_enable_e6500: + mfspr r3, L2CSR0 + andis. r1, r3, (L2CSR0_L2E)@h + beq l2_poll_enable_e6500 +l2_init_skip_e6500: #elif defined(CORE_E5500) /* --- L2 E5500 --- */ l2_setup_cache: @@ -160,8 +272,9 @@ l2_poll_invclear: addi r3, r8, 1 mtspr L2CSR1, r3 - /* enable L2 with no parity */ - lis r3, (L2CSR0_L2E)@h + /* enable L2 with no parity. + * L2CSR0_L2E=0x80000000 has bit 31 set; use LOAD_ADDR32 for e6500. */ + LOAD_ADDR32(r3, L2CSR0_L2E) mtspr L2CSR0, r3 isync 2: @@ -171,33 +284,37 @@ l2_poll_invclear: #endif #endif /* CORE_E5500 || CORE_E6500 */ 3: - /* setup mapping for the spin table, WIMGE=0b00100 */ - lis r13, TORESET(_spin_table_addr)@h - ori r13, r13, TORESET(_spin_table_addr)@l + /* setup mapping for the spin table, WIMGE=0b00100. + * TORESET(...) has bit 31 set; use LOAD_ADDR32 for e6500. 
*/ + LOAD_ADDR32(r13, TORESET(_spin_table_addr)) lwz r13, 0(r13) /* mask by 4K */ rlwinm r13, r13, 0, 0, 19 lis r11, (MAS0_TLBSEL(1) | MAS0_ESEL(1))@h mtspr MAS0, r11 - lis r11, (MAS1_VALID | MAS1_IPROT)@h - ori r11, r11, (MAS1_TS | MAS1_TSIZE(BOOKE_PAGESZ_4K))@l + /* MAS1_VALID=0x80000000 has bit 31 set; use LOAD_ADDR32 for e6500. */ + LOAD_ADDR32(r11, (MAS1_VALID | MAS1_IPROT | MAS1_TS | + MAS1_TSIZE(BOOKE_PAGESZ_4K))) mtspr MAS1, r11 + /* Build MAS2 = r13 (spin table base, 4K aligned) | MAS2_M | MAS2_G. + * Note: both oris and ori must use r11 as source for the second op + * (the original code erroneously used r13 on the second op, which + * overwrote the upper 16 bits from the first oris). */ oris r11, r13, (MAS2_M | MAS2_G)@h - ori r11, r13, (MAS2_M | MAS2_G)@l + ori r11, r11, (MAS2_M | MAS2_G)@l mtspr MAS2, r11 oris r11, r13, (MAS3_SX | MAS3_SW | MAS3_SR)@h - ori r11, r13, (MAS3_SX | MAS3_SW | MAS3_SR)@l + ori r11, r11, (MAS3_SX | MAS3_SW | MAS3_SR)@l mtspr MAS3, r11 li r11, 0 mtspr MAS7, r11 tlbwe /* _bootpg_addr has the address of _second_half_boot_page - * jump there in AS=1 space with cache enabled - */ - lis r13, TORESET(_bootpg_addr)@h - ori r13, r13, TORESET(_bootpg_addr)@l + * jump there in AS=1 space with cache enabled. + * TORESET(...) has bit 31 set; use LOAD_ADDR32 for e6500. 
*/ + LOAD_ADDR32(r13, TORESET(_bootpg_addr)) lwz r11, 0(r13) mtspr SRR0, r11 mfmsr r13 @@ -268,7 +385,7 @@ _second_half_boot_page: li r8, 3 stw r8, ENTRY_ADDR_LOWER(r10) - /* mask branch address (64MB) to setup tlb */ + /* Align branch address to 64MB boundary for TLB mapping below */ rlwinm r12, r4, 0, 0, 5 /* setup registers before jump */ @@ -280,7 +397,7 @@ _second_half_boot_page: li r4, 0 li r5, 0 li r6, 0 - lis r7, (64 * 1024 * 1024)@h + lis r7, (64 * 1024 * 1024)@h /* r7 = IMA size (64MB per ePAPR) */ li r8, 0 li r9, 0 @@ -295,8 +412,8 @@ _second_half_boot_page: /* Add tlb 1 entry 0 64MB for new entry */ lis r10, (MAS0_TLBSEL(1) | MAS0_ESEL(0))@h mtspr MAS0, r10 - lis r10, (MAS1_VALID | MAS1_IPROT)@h - ori r10, r10, (MAS1_TSIZE(BOOKE_PAGESZ_64M))@l + /* MAS1_VALID=0x80000000 has bit 31 set; use LOAD_ADDR32 for e6500. */ + LOAD_ADDR32(r10, (MAS1_VALID | MAS1_IPROT | MAS1_TSIZE(BOOKE_PAGESZ_64M))) mtspr MAS1, r10 mtspr MAS2, r12 /* WIMGE = 0 */ ori r12, r12, (MAS3_SX | MAS3_SW | MAS3_SR) @@ -309,7 +426,7 @@ _second_half_boot_page: rfi /* Reserve space for spin table entries */ - .align 6 /* 64-bytes */ + .align 6 /* 64-byte alignment for spin table entries (ENTRY_SIZE) */ .globl _spin_table _spin_table: .space CPU_NUMCORES * ENTRY_SIZE diff --git a/src/boot_ppc_start.S b/src/boot_ppc_start.S index 725639a0d3..352b3cd7c8 100644 --- a/src/boot_ppc_start.S +++ b/src/boot_ppc_start.S @@ -87,6 +87,87 @@ All TLBs for boot will be in TLB1 and supervisor mode (not user) ori reg, reg, (addr)@l #endif +/* Early UART debug output for boot diagnostics. + * Uses DUART0 at CCSRBAR + 0x11C500 (THR=+0, LSR=+5, LCR=+3, DLB=+0, DMB=+1). + * Before CCSRBAR relocation: 0xFE11C500 (default). + * After relocation on VPX3-152: 0xEF11C500. 
+ * Baud divisor = uart_clock / (16 * baud). Using the full 533.333 MHz
+ * platform clock would give 533333333 / (16 * 115200) = 289 = 0x0121.
+ * However platform clock varies; a safe default divisor is taken from the
+ * CW PABS U-Boot, which sets 115200 baud. We re-init to be safe.
+ * Clobbers r10, r11. */
+#if defined(DEBUG_UART) && defined(CORE_E6500)
+#define EARLY_UART_BASE (CCSRBAR_DEF + 0x11C500)
+/* UART baud divisor: uart_clock / (16 * 115200)
+ * The DUART is clocked at half the platform (CCB) clock, not the full CCB.
+ * CW VPX3-152: CCB=533.333 MHz, UART clock = CCB/2 = 266.667 MHz
+ * Divisor = 266666667 / (16 * 115200) = 144.68 ~ 145 = 0x0091
+ * CW U-Boot programs its own divisor for 115200; 0x0091 is derived here. */
+/* T2080 DUART baud divisor - computed from platform clock at runtime
+ * in C code (hal_init). Early assembly debug output sent without baud
+ * init may appear garbled, but the raw characters still serve as
+ * progress indicators (each unique byte = different boot stage).
*/ +/* CW VPX3-152 (608605-100 RCW) - 1.8 GHz / 600 MHz platform clock: + * bus_clk = SYS_CLK * plat_ratio / 2 = 66.667 MHz * 9 / 2 = 300 MHz + * divisor = bus_clk / (16 * 115200) = 300000000 / 1843200 = 163 = 0xA3 + * For CW PABS (608609-100 RCW) - 1.2 GHz / 533 MHz platform clock: + * bus_clk = 66.667 * 8 / 2 = 266.667 MHz, divisor = 145 = 0x91 */ +#ifdef BOARD_CW_VPX3152 +#define UART_DIVISOR_HI 0x00 +#define UART_DIVISOR_LO 0xA3 +#else +#define UART_DIVISOR_HI 0x00 +#define UART_DIVISOR_LO 0x91 +#endif +.macro uart_init_at base + li r10, 0 + oris r10, r10, (\base)@h + ori r10, r10, (\base)@l + /* Set DLAB to access divisor registers */ + li r11, 0x83 /* LCR: DLAB=1, 8N1 */ + stb r11, 3(r10) /* LCR */ + li r11, UART_DIVISOR_LO + stb r11, 0(r10) /* DLB (divisor low) */ + li r11, UART_DIVISOR_HI + stb r11, 1(r10) /* DMB (divisor high) */ + /* Clear DLAB, set 8N1 */ + li r11, 0x03 /* LCR: DLAB=0, 8N1 */ + stb r11, 3(r10) /* LCR */ + /* Enable and reset FIFOs */ + li r11, 0x07 /* FCR: FIFO enable, reset TX+RX */ + stb r11, 2(r10) /* FCR */ + /* MCR: DTR + RTS */ + li r11, 0x03 + stb r11, 4(r10) /* MCR */ +.endm +.macro debug_char_imm char + li r10, 0 + oris r10, r10, (EARLY_UART_BASE)@h + ori r10, r10, (EARLY_UART_BASE)@l +1: lbz r11, 5(r10) /* LSR */ + andi. r11, r11, 0x20 /* THRE */ + beq 1b + li r11, \char + stb r11, 0(r10) /* THR */ +.endm +/* After CCSRBAR relocation, use new base address */ +.macro debug_char_new char + li r10, 0 + oris r10, r10, (CCSRBAR + 0x11C500)@h + ori r10, r10, (CCSRBAR + 0x11C500)@l +1: lbz r11, 5(r10) /* LSR */ + andi. r11, r11, 0x20 /* THRE */ + beq 1b + li r11, \char + stb r11, 0(r10) /* THR */ +.endm +#else +.macro debug_char_imm char +.endm +.macro debug_char_new char +.endm +#endif + /* variables from linker script */ .global _start_vector .global isr_empty @@ -151,6 +232,73 @@ _reset: mtspr SPRN_HDBCR0, r3 #endif +#ifdef TARGET_nxp_t2080 + /* T2080 rev-1 errata workarounds (PVR major_rev == 0x1). 
+ * Mirrors CW U-Boot's mpc85xx/start.S CONFIG_SYS_FSL_ERRATUM_* set + * for the T-series e6500. Without these the core may exhibit + * subtle cache speculation / coherency / DVA-stale bugs that can + * silently corrupt OS handoff state. + * + * VPX3-152 ships rev 1.1 (PVR 0x85380011) per CW U-Boot banner. + * Read PVR major_rev and gate the rev-1-only fixes accordingly. */ + mfspr r3, SPRN_PVR + rlwinm r3, r3, 28, 0xf /* r3 = major_rev */ + cmpwi r3, 0x1 /* is rev 1? */ + bne t2080_errata_done + + /* A004779 — HDBCR2[32]=1, [33:39]=3, [41:47]=3 + * Clear mask 0xff7f0000, then set 0x83030000. */ + msync + isync + mfspr r3, SPRN_HDBCR2 + lis r4, 0xff7f + andc r3, r3, r4 + oris r3, r3, 0x8303 + mtspr SPRN_HDBCR2, r3 + isync + + /* A004786 — HDBCR7 |= 0x80000000 */ + msync + isync + mfspr r3, SPRN_HDBCR7 + oris r3, r3, 0x8000 + mtspr SPRN_HDBCR7, r3 + isync + + /* A004792 — HDBCR0 |= 0x4000 */ + msync + isync + mfspr r3, SPRN_HDBCR0 + ori r3, r3, 0x4000 + mtspr SPRN_HDBCR0, r3 + isync + + /* A004806 — HDBCR7 |= 0x20000000 (DVA stale-data fix) */ + msync + isync + mfspr r3, SPRN_HDBCR7 + oris r3, r3, 0x2000 + mtspr SPRN_HDBCR7, r3 + isync + + /* A004809 — HDBCR0 |= 0x01008000 */ + msync + isync + mfspr r3, SPRN_HDBCR0 + oris r3, r3, 0x0100 + ori r3, r3, 0x8000 + mtspr SPRN_HDBCR0, r3 + isync + +t2080_errata_done: + + /* A003999 (HDBCR1 |= 0x01000000) is NOT enabled by CW U-Boot's + * config_mpc85xx.h for the T2080 SoC. SPR dump comparison + * confirmed wolfBoot was setting HDBCR1=0x01000000 while U-Boot + * left it at 0 -- a real divergence in pre-VxWorks state. Match + * U-Boot exactly: skip the A003999 workaround. 
*/ +#endif /* TARGET_nxp_t2080 */ + reset_exceptions: /* Reset exception registers */ li r0, 0x0000 @@ -335,28 +483,37 @@ setup_interrupts: #if CCSRBAR_DEF != CCSRBAR_PHYS /* Use R8 = new, R9 = old virtual */ - lis r8, CCSRBAR@h - ori r8, r8, CCSRBAR@l - lis r9, (CCSRBAR + 0x1000)@h - ori r9, r9, (CCSRBAR + 0x1000)@l + LOAD_ADDR32(r8, CCSRBAR) + LOAD_ADDR32(r9, (CCSRBAR + 0x1000)) create_temp_ccsr: - /* Create a temporary TLB entry for new and old location */ - /* CCSRBAR: TLB 0, Entry 0, Supervisor R/W, IG, TS=0, 4KB */ + /* Create temporary TLB0 entries for CCSRBAR relocation. + * + * TLB0 on e6500 is 4-way set-associative (2048 entries, 512 sets). + * The "esel" parameter selects the WAY within a set; the SET is + * determined by the virtual address (EPN). These two entries map + * different EPNs (CCSRBAR vs CCSRBAR+0x1000), so they fall in + * different TLB0 sets and do not overwrite each other. + * + * We use different ways (0 and 1) for visual clarity. Both entries + * are cleaned up by the TLB0 flash-invalidate (MMUCSR0) after + * relocation completes. 
*/ + + /* CCSRBAR new location: TLB0 Way 0, Supervisor R/W, IG, TS=0, 4KB */ set_tlb(0, 0, CCSRBAR, CCSRBAR, CCSRBAR_PHYS_HIGH, MAS3_SR | MAS3_SW, MAS2_I | MAS2_G, 0, BOOKE_PAGESZ_4K, 0, r3); - set_tlb(0, 0, + /* CCSRBAR old location: TLB0 Way 1, Supervisor R/W, IG, TS=0, 4KB */ + set_tlb(0, 1, CCSRBAR + 0x1000, CCSRBAR_DEF, 0, MAS3_SR | MAS3_SW, MAS2_I | MAS2_G, 0, BOOKE_PAGESZ_4K, 0, r3); verify_old_ccsr: /* verify the TLB is for old one */ - lis r0, CCSRBAR_DEF@h - ori r0, r0, CCSRBAR_DEF@l + LOAD_ADDR32(r0, CCSRBAR_DEF) #ifdef USE_CORENET_INTERFACE lwz r1, 4(r9) /* CCSRBARL */ #else @@ -373,12 +530,9 @@ ccsr_temp_law: #define CCSR_TEMP_LAW (LAWAR_ENABLE | \ LAWAR_TRGT_ID(LAW_TRGT_CORENET) | \ LAW_SIZE_4KB) - lis r0, CCSRBAR_PHYS_HIGH@h - ori r0, r0, CCSRBAR_PHYS_HIGH@l - lis r1, CCSRBAR_DEF@h - ori r1, r1, CCSRBAR_DEF@l - lis r2, CCSR_TEMP_LAW@h - ori r2, r2, CCSR_TEMP_LAW@l + LOAD_ADDR32(r0, CCSRBAR_PHYS_HIGH) + LOAD_ADDR32(r1, CCSRBAR_DEF) + LOAD_ADDR32(r2, CCSR_TEMP_LAW) stw r0, LAWBAR_BASE(0)(r9) /* LAWBARH */ stw r1, LAWBAR_BASE(0)+4(r9) /* LAWBARL */ sync @@ -393,13 +547,10 @@ read_old_ccsr: isync write_new_ccsrbar: - lis r0, CCSRBAR_PHYS_HIGH@h - ori r0, r0, CCSRBAR_PHYS_HIGH@l - lis r1, CCSRBAR@h - ori r1, r1, CCSRBAR@l + LOAD_ADDR32(r0, CCSRBAR_PHYS_HIGH) + LOAD_ADDR32(r1, CCSRBAR) #define CCSRAR_C 0x80000000 /* Commit */ - lis r2, CCSRAR_C@h - ori r2, r2, CCSRAR_C@l + LOAD_ADDR32(r2, CCSRAR_C) stw r0, 0(r9) /* CCSRBARH */ sync stw r1, 4(r9) /* CCSRBARL */ @@ -414,9 +565,10 @@ write_new_ccsrbar: sync lwz r0, 0(r9) isync - /* write new CCSBAR */ - lis r0, (CCSRBAR_PHYS_HIGH << 20) | (CCSRBAR >> 12)@h - ori r0, r0, (CCSRBAR_PHYS_HIGH << 20) | (CCSRBAR >> 12)@l + /* write new CCSBAR: encode 36-bit PA as [23:0] = PA[35:12] + * CCSRBAR_NEW_REG is precomputed in nxp_ppc.h since GAS @h/@l + * can't evaluate complex shift/OR expressions. 
*/ + LOAD_ADDR32(r0, CCSRBAR_NEW_REG) stw r0, 0(r9) sync isync @@ -431,6 +583,12 @@ invalidate_temp_tlb: /* L2TLB0_FI: TLB0 flash invalidate (write 1 to invalidate) */ li r3, 0x04 mtspr MMUCSR0, r3 + + /* Re-create TLB1 Entry 1 for the new (relocated) CCSRBAR address */ + set_tlb(1, 1, + CCSRBAR, CCSRBAR, CCSRBAR_PHYS_HIGH, + MAS3_SX | MAS3_SR | MAS3_SW, MAS2_I | MAS2_G, 0, + CCSRBAR_SIZE, 1, r3); #endif /* CCSRBAR_DEF != CCSRBAR_PHYS */ @@ -452,26 +610,30 @@ boot_page: 1: #endif +#if CCSRBAR_DEF == CCSRBAR ccsr_tlb: - /* CCSRBAR: TLB 1, Entry 1, Supervisor R/W, IG, TS=0, 1M/16M, IPROT */ + /* No relocation -- map CCSRBAR directly in TLB1 Entry 1 */ set_tlb(1, 1, CCSRBAR, CCSRBAR, CCSRBAR_PHYS_HIGH, MAS3_SX | MAS3_SR | MAS3_SW, MAS2_I | MAS2_G, 0, CCSRBAR_SIZE, 1, r3); +#endif + /* When CCSRBAR was relocated, TLB1 Entry 1 was already re-created + * for the new address after relocation (above). Do NOT overwrite it. */ #if defined(CORE_E5500) || defined(CORE_E6500) ccsr_law: - /* CCSR - LAW0 (CoreNet 16MB) */ + /* CCSR - LAW0 (CoreNet 16MB) + * CoreNet T2080 requires a LAW to route CCSR transactions. + * CW U-Boot also has this LAW but it was not visible in our register + * dump because it's at a different index. */ #define CCSR_LAW (LAWAR_ENABLE | \ LAWAR_TRGT_ID(LAW_TRGT_CORENET) | \ LAW_SIZE_16MB) LOAD_ADDR32(r9, CCSRBAR + LAWBAR_BASE(0)) - lis r0, CCSRBAR_PHYS_HIGH@h - ori r0, r0, CCSRBAR_PHYS_HIGH@l - lis r1, CCSRBAR@h - ori r1, r1, CCSRBAR@l - lis r2, CCSR_LAW@h - ori r2, r2, CCSR_LAW@l + LOAD_ADDR32(r0, CCSRBAR_PHYS_HIGH) + LOAD_ADDR32(r1, CCSRBAR) + LOAD_ADDR32(r2, CCSR_LAW) stw r0, 0(r9) /* LAWBARH */ stw r1, 4(r9) /* LAWBARL */ sync @@ -483,19 +645,20 @@ ccsr_law: #ifdef FLASH_BASE_ADDR #if defined(CORE_E5500) || defined(CORE_E6500) - /* Memory Mapped NOR Flash (64/128MB) at 0xEC000000/0xE8000000 */ + /* Memory Mapped NOR Flash. For CW VPX3-152 (256 MB at 0xF0000000), + * this entry overlaps with the boot ROM TLB at top of flash. 
We + * handle that below by invalidating the boot ROM TLB after this + * entry is created (the IPROT-protected 256 MB Entry 2 will serve + * instruction fetches for the entire flash region, including boot ROM). */ flash_law: /* FLASH - LAW1 (IFC 64/128MB) */ #define FLASH_LAW (LAWAR_ENABLE | \ LAWAR_TRGT_ID(LAW_TRGT_IFC) | \ FLASH_LAW_SIZE) LOAD_ADDR32(r9, CCSRBAR + LAWBAR_BASE(1)) - lis r0, FLASH_BASE_PHYS_HIGH@h - ori r0, r0, FLASH_BASE_PHYS_HIGH@l - lis r1, FLASH_BASE_ADDR@h - ori r1, r1, FLASH_BASE_ADDR@l - lis r2, FLASH_LAW@h - ori r2, r2, FLASH_LAW@l + LOAD_ADDR32(r0, FLASH_BASE_PHYS_HIGH) + LOAD_ADDR32(r1, FLASH_BASE_ADDR) + LOAD_ADDR32(r2, FLASH_LAW) stw r0, 0(r9) /* LAWBARH */ stw r1, 4(r9) /* LAWBARL */ sync @@ -516,6 +679,33 @@ flash_tlb: FLASH_BASE_ADDR, FLASH_BASE_ADDR, FLASH_BASE_PHYS_HIGH, MAS3_SX | MAS3_SW | MAS3_SR, FLASH_TLB_WING, 0, FLASH_TLB_PAGESZ, 1, r3); + +#ifdef BOARD_CW_VPX3152 + /* CW VPX3-152: the 256 MB flash TLB Entry 2 created above overlaps + * with the boot ROM TLB (Entry 0, at 0xFFFC0000-0xFFFFFFFF after + * shrink_default_tlb1) because flash covers 0xF0000000-0xFFFFFFFF. + * e6500 TLB1 multi-hit is a machine check. Invalidate the boot ROM + * TLB now -- Entry 2 (IPROT) will serve instruction fetches for the + * boot ROM region. Must clear IPROT via tlbwe before tlbivax, since + * IPROT entries are protected from invalidation. + * + * Uses r14 (saved ESEL from shrink_default_tlb1) to target the + * correct entry. 
*/ +vpx3_invalidate_boot_tlb: + /* Build MAS0 = TLBSEL=1, ESEL=r14 */ + rlwinm r3, r14, 16, MAS0_ESEL_MSK + oris r3, r3, MAS0_TLBSEL(1)@h + mtspr MAS0, r3 + tlbre /* read Entry r14 into MAS1-3/7 */ + mfspr r3, MAS1 + LOAD_ADDR32(r4, MAS1_IPROT) + andc r3, r3, r4 /* clear IPROT */ + rlwinm r3, r3, 0, 1, 31 /* clear V (bit 0) */ + mtspr MAS1, r3 + tlbwe /* write back with V=0, IPROT=0 */ + isync + msync +#endif #else flash_tlb: /* For TS/AS=1 map boot ROM */ @@ -1192,8 +1382,12 @@ isr_empty: * initialized on cold boot) -> nested machine check -> checkstop. * Use r3 as base, r4 as scratch. */ #if defined(DEBUG_UART) && defined(TARGET_nxp_t2080) - /* Print '!' to UART to signal exception occurred */ - LOAD_ADDR32(r3, 0xFE11C500) + /* Print '!' + SRR0 hex to UART to show exception address. + * Use CCSRBAR (which is the relocated address on VPX3-152). */ + LOAD_ADDR32(r3, CCSRBAR + 0x11C500) + mfspr r5, SRR0 /* save fault address before we clobber regs */ + + /* Print '!' */ .L_isr_wait: lbz r4, 5(r3) andi. r4, r4, 0x20 @@ -1201,6 +1395,39 @@ isr_empty: li r4, '!' stb r4, 0(r3) eieio + + /* Print SRR0 as 8 hex chars */ + li r6, 8 /* 8 nibbles */ +.L_isr_hex: + /* Wait for UART ready */ +.L_isr_hw: + lbz r4, 5(r3) + andi. r4, r4, 0x20 + beq .L_isr_hw + /* Extract top nibble from r5 */ + rlwinm r4, r5, 4, 28, 31 /* r4 = (r5 >> 28) & 0xF */ + rlwinm r5, r5, 4, 0, 31 /* r5 <<= 4 */ + cmplwi r4, 10 + blt .L_isr_digit + addi r4, r4, ('A' - 10) + b .L_isr_put +.L_isr_digit: + addi r4, r4, '0' +.L_isr_put: + stb r4, 0(r3) + eieio + addi r6, r6, -1 + cmplwi r6, 0 + bne .L_isr_hex + + /* Print newline */ +.L_isr_nlw: + lbz r4, 5(r3) + andi. 
r4, r4, 0x20
+        beq .L_isr_nlw
+        li r4, '\n'
+        stb r4, 0(r3)
+        eieio
 #endif
 #ifdef L2SRAM_ADDR
         LOAD_ADDR32(r3, L2SRAM_ADDR + 0x200)
@@ -1223,6 +1450,8 @@ isr_empty:
         stw r4, 0x1C(r3)
 #endif
 1: b 1b
+        .global isr_empty_end
+isr_empty_end:
 #endif
 
 /* reset entry point - must be at end of .S */
diff --git a/src/fdt.c b/src/fdt.c
index 86b6afadff..ec1f28e487 100644
--- a/src/fdt.c
+++ b/src/fdt.c
@@ -760,6 +760,91 @@ int fdt_shrink(void* fdt)
     return fdt_set_totalsize(fdt, total_size);
 }
 
+/* Append a /memreserve/ entry to the FDT memory reservation map.
+ *
+ * The new (address, size) pair replaces the (0,0) terminator and a fresh
+ * terminator is written after it. To make room, the structure block and
+ * strings block are shifted down by 16 bytes (one reserve entry) and the
+ * header's off_dt_struct / off_dt_strings are advanced to match.
+ * totalsize is not changed here: the caller must already have grown it
+ * via fdt_set_totalsize() to leave at least 16 bytes of headroom.
+ *
+ * Returns 0 on success, -FDT_ERR_BADSTATE if fdt is NULL,
+ * -FDT_ERR_NOSPACE if the 16 bytes of headroom are missing, and
+ * -FDT_ERR_BADSTRUCTURE if the blocks are not ordered reserve map,
+ * structure, strings, or no terminator is found before the structure
+ * block. */
+int fdt_add_mem_rsv(void* fdt, uint64_t address, uint64_t size)
+{
+    struct fdt_reserve_entry* rsv;
+    uint8_t* base = (uint8_t*)fdt;
+    uint32_t off_rsv;
+    uint32_t off_dt;
+    uint32_t off_str;
+    uint32_t size_str;
+    uint32_t total;
+    uint32_t data_end;
+    uint32_t shift;
+    uint32_t i;
+
+    if (fdt == NULL) {
+        return -FDT_ERR_BADSTATE;
+    }
+
+    off_rsv = fdt_off_mem_rsvmap(fdt);
+    off_dt = fdt_off_dt_struct(fdt);
+    off_str = fdt_off_dt_strings(fdt);
+    size_str = fdt_size_dt_strings(fdt);
+    total = fdt_totalsize(fdt);
+    data_end = off_str + size_str;
+    shift = (uint32_t)sizeof(struct fdt_reserve_entry); /* 16 */
+
+    /* The in-place insert only works for the usual block order (reserve
+     * map, structure, strings) with room for at least the terminator
+     * entry before the structure block. Reject anything else, including
+     * a wrapped off_str + size_str, so the unsigned length passed to
+     * memmove() below cannot underflow. */
+    if ((off_rsv > off_dt) || ((off_dt - off_rsv) < shift) ||
+        (off_dt > off_str) || (data_end < off_str)) {
+        return -FDT_ERR_BADSTRUCTURE;
+    }
+
+    /* Headroom check, written so data_end + shift is never computed and
+     * therefore cannot wrap. */
+    if ((data_end > total) || ((total - data_end) < shift)) {
+        return -FDT_ERR_NOSPACE;
+    }
+
+    /* Find the (0,0) terminator in the reserve map. Zero is zero in any
+     * byte order, so no fdt64 conversion is needed for the test. */
+    rsv = (struct fdt_reserve_entry*)(base + off_rsv);
+    i = 0;
+    while ((rsv[i].address != 0ULL) || (rsv[i].size != 0ULL)) {
+        i++;
+        if ((off_rsv + (i + 1U) * shift) > off_dt) {
+            return -FDT_ERR_BADSTRUCTURE;
+        }
+    }
+
+    /* Shift structure + strings down by 16 bytes. memmove handles overlap. */
+    memmove(base + off_dt + shift, base + off_dt,
+        (size_t)(data_end - off_dt));
+
+    /* Insert new entry where the old terminator was, write new terminator. */
+    rsv[i].address = cpu_to_fdt64(address);
+    rsv[i].size = cpu_to_fdt64(size);
+    rsv[i + 1].address = 0;
+    rsv[i + 1].size = 0;
+
+    /* Update header offsets. */
+    fdt_set_off_dt_struct(fdt, off_dt + shift);
+    fdt_set_off_dt_strings(fdt, off_str + shift);
+
+    wolfBoot_printf("FDT: /memreserve/ +0x%llx +0x%llx\n",
+        (unsigned long long)address, (unsigned long long)size);
+    return 0;
+}
+
 /* FTD Fixup API's */
 int fdt_fixup_str(void* fdt, int off, const char* node,
     const char* name, const char* str)
diff --git a/src/string.c b/src/string.c
index 176836b4a2..c0fe01ee88 100644
--- a/src/string.c
+++ b/src/string.c
@@ -26,6 +26,22 @@
 #define _FORTIFY_SOURCE 0
 #endif
 
+/* Enable %llu/%llx support only on platforms where the 64-bit divide
+ * is either native (64-bit CPUs) or backed by linked libgcc helpers
+ * (PPC32 toolchain). The auto-enable deliberately excludes 32-bit
+ * bare-metal targets (Cortex-M, x86 stage1) -- those would link-fail
+ * on __aeabi_uldivmod / __udivmoddi4 because they don't pull in libgcc.
+ * Such targets can still opt-in by defining PRINTF_LONG_LONG manually. */
+#if !defined(PRINTF_LONG_LONG) && ( \
+    defined(__alpha__) || defined(__ia64__) || defined(_ARCH_PPC64) || \
+    defined(__x86_64__) || defined(_M_X64) || \
+    defined(__aarch64__) || defined(_M_ARM64) || \
+    defined(__sparc64__) || defined(__s390x__) || \
+    defined(__ppc64__) || defined(__powerpc64__) || defined(__PPC__) || \
+    (defined(__riscv_xlen) && (__riscv_xlen == 64)))
+    #define PRINTF_LONG_LONG
+#endif
+
 #include
 #if !defined(TARGET_library) && defined(__STDC_HOSTED__) && __STDC_HOSTED__ \
     && !defined(__CCRX__)
@@ -318,42 +334,87 @@ void *memmove(void *dst, const void *src, size_t n)
 #endif /* WOLFBOOT_USE_STDLIBC */
 
 #if defined(PRINTF_ENABLED) && defined(DEBUG_UART)
+/* Shared digit table -- avoids duplicating the literal in each width's
+ * formatting loop.
*/
+static const char uart_writenum_digits[] = "0123456789ABCDEF";
+
+/* Shared tail for both widths: when zero-padding, skips `i` past the pad
+ * zeros, slides the `sz` digits stored at the tail of `buf` down to
+ * buf[i], and writes the i + sz bytes to the UART. Requires i + sz (and
+ * i + maxdigits when zero-padding) <= bufsize. */
+static void uart_writenum_emit(char *buf, int bufsize, int i, int sz,
+                               int zeropad, int maxdigits)
+{
+    if (zeropad && sz < maxdigits) {
+        i += maxdigits - sz;
+    }
+    memmove(&buf[i], &buf[bufsize - sz], sz);
+    uart_write(buf, i + sz);
+}
+
 void uart_writenum(int num, int base, int zeropad, int maxdigits)
 {
-    int i = 0;
-    char buf[sizeof(unsigned long)*2+1];
-    const char* kDigitLut = "0123456789ABCDEF";
+    int i = 0, sz = 0;
+    char buf[sizeof(unsigned int) * 3 + 1]; /* '-' + 10 decimal digits fit */
     unsigned int val = (unsigned int)num;
-    int sz = 0;
     if (maxdigits == 0)
         maxdigits = 8;
-    if (maxdigits > (int)sizeof(buf))
-        maxdigits = (int)sizeof(buf);
+    if (maxdigits > (int)sizeof(buf) - 1) /* keep one byte for the sign */
+        maxdigits = (int)sizeof(buf) - 1;
     memset(buf, 0, sizeof(buf));
-    if (base == 10 && num < 0) { /* handle negative */
+    if (base == 10 && num < 0) {
         buf[i++] = '-';
-        val = -num;
+        val = 0U - (unsigned int)num; /* magnitude; defined for INT_MIN */
     }
     if (zeropad) {
         memset(&buf[i], '0', maxdigits);
     }
+    /* 32-bit divide: stays out of libgcc 64-bit helpers, which aren't
+     * linked into freestanding stage1 / Cortex-M builds. */
     do {
-        buf[sizeof(buf)-sz-1] = kDigitLut[(val % base)];
+        buf[sizeof(buf) - sz - 1] =
+            uart_writenum_digits[(val % (unsigned)base)];
         sz++;
-        val /= base;
+        val /= (unsigned)base;
     } while (val > 0U);
-    if (zeropad && sz < maxdigits) {
-        i += maxdigits-sz;
+    uart_writenum_emit(buf, sizeof(buf), i, sz, zeropad, maxdigits);
+}
+
+#ifdef PRINTF_LONG_LONG
+/* 64-bit core for %llu/%lld/%llx. Pulls in libgcc 64-bit divide
+ * (__udivmoddi4 / __aeabi_uldivmod) so it's only compiled when needed
+ * and only called from the long-long printf paths -- never from the
+ * 32-bit uart_writenum() fast path above.
*/
+static void uart_writenum_ll(unsigned long long val, int is_negative,
+                             int base, int zeropad, int maxdigits)
+{
+    int i = 0, sz = 0;
+    char buf[sizeof(unsigned long long) * 3 + 1]; /* '-' + 20 digits fit */
+    if (maxdigits == 0)
+        maxdigits = 8;
+    if (maxdigits > (int)sizeof(buf) - 1) /* keep one byte for the sign */
+        maxdigits = (int)sizeof(buf) - 1;
+    memset(buf, 0, sizeof(buf));
+    if (is_negative) {
+        buf[i++] = '-';
     }
-    memmove(&buf[i], &buf[sizeof(buf)-sz], sz);
-    i+=sz;
-    uart_write(buf, i);
+    if (zeropad) {
+        memset(&buf[i], '0', maxdigits);
+    }
+    do {
+        buf[sizeof(buf) - sz - 1] =
+            uart_writenum_digits[(val % (unsigned)base)];
+        sz++;
+        val /= (unsigned)base;
+    } while (val > 0ULL);
+    uart_writenum_emit(buf, sizeof(buf), i, sz, zeropad, maxdigits);
 }
+#endif /* PRINTF_LONG_LONG */
 
 void uart_vprintf(const char* fmt, va_list argp)
 {
     char* fmtp = (char*)fmt;
-    int zeropad, maxdigits, precision, leftjust;
+    int zeropad, maxdigits, precision, leftjust, islong;
     while (fmtp != NULL && *fmtp != '\0') {
         /* print non formatting characters */
         if (*fmtp != '%') {
@@ -363,7 +424,7 @@ void uart_vprintf(const char* fmt, va_list argp)
         fmtp++; /* skip % */
 
         /* find formatters */
-        zeropad = maxdigits = leftjust = 0;
+        zeropad = maxdigits = leftjust = islong = 0;
         precision = -1; /* -1 = not specified */
         /* check for left-justify flag */
         if (*fmtp == '-') {
@@ -400,7 +461,7 @@ void uart_vprintf(const char* fmt, va_list argp)
                 }
             }
             else if (*fmtp == 'l') {
-                /* long - skip */
+                islong++;
                 fmtp++;
             }
             else if (*fmtp == 'z') {
@@ -420,8 +481,32 @@ void uart_vprintf(const char* fmt, va_list argp)
             case 'i':
             case 'd':
             {
-                int n = (int)va_arg(argp, int);
-                uart_writenum(n, 10, zeropad, maxdigits);
+            #ifdef PRINTF_LONG_LONG
+                if (islong >= 2) {
+                    /* %llu / %lld: full 64-bit value */
+                    int is_neg = 0;
+                    unsigned long long val;
+                    if (*fmtp != 'u') {
+                        long long sll = va_arg(argp, long long);
+                        if (sll < 0) {
+                            is_neg = 1;
+                            val = 0ULL - (unsigned long long)sll;
+                        }
+                        else {
+                            val = (unsigned long long)sll;
+                        }
+                    }
+                    else {
+                        val = va_arg(argp, unsigned
long long); + } + uart_writenum_ll(val, is_neg, 10, zeropad, maxdigits); + } + else + #endif + { + int n = (int)va_arg(argp, int); + uart_writenum(n, 10, zeropad, maxdigits); + } break; } case 'p': @@ -430,8 +515,19 @@ void uart_vprintf(const char* fmt, va_list argp) case 'x': case 'X': { - int n = (int)va_arg(argp, int); - uart_writenum(n, 16, zeropad, maxdigits); + #ifdef PRINTF_LONG_LONG + if (islong >= 2) { + /* %llx: full 64-bit value */ + unsigned long long val = + va_arg(argp, unsigned long long); + uart_writenum_ll(val, 0, 16, zeropad, maxdigits); + } + else + #endif + { + int n = (int)va_arg(argp, int); + uart_writenum(n, 16, zeropad, maxdigits); + } break; } case 's': diff --git a/src/update_ram.c b/src/update_ram.c index f8e3d0f9e4..41cfc0f350 100644 --- a/src/update_ram.c +++ b/src/update_ram.c @@ -320,10 +320,10 @@ void RAMFUNCTION wolfBoot_start(void) image_ptr = wolfBoot_peek_image(&os_image, 0, NULL); if (image_ptr) { if (*((uint32_t*)image_ptr) == UBOOT_IMG_HDR_MAGIC) { - /* Note: Could parse header and get load address at 0x10 */ - - /* Skip 64 bytes (size of Legacy format image header) */ - load_address += UBOOT_IMG_HDR_SZ; + /* Skip 64-byte legacy header in source; load address unchanged + * so kernel is placed at WOLFBOOT_LOAD_ADDRESS */ + wolfBoot_printf("U-Boot Legacy header detected, skipping %d bytes\n", + UBOOT_IMG_HDR_SZ); os_image.fw_base += UBOOT_IMG_HDR_SZ; os_image.fw_size -= UBOOT_IMG_HDR_SZ; }