From 92134950892f22b9c7779c64de0bf719daaef7c3 Mon Sep 17 00:00:00 2001
From: Henry Gebhardt <hsggebhardt@fastmail.com>
Date: Sun, 1 Jun 2025 00:01:12 -0700
Subject: [PATCH 1/6] Add support for FFTs with dimension > 2

---
 src/fft.jl        | 24 ++++++++++++------------
 test/fft_tests.jl | 10 ++++++++++
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/src/fft.jl b/src/fft.jl
index 13fd457..a5fca5c 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -28,29 +28,29 @@ function generic_fft!(x::AbstractVector{T}, region) where {T<:AbstractFloats}
     copyto!(x, generic_fft(x))
 end
 
-function generic_fft!(x::AbstractMatrix{T}, region::Integer) where T<:AbstractFloats
-    if region == 1
-        for j in 1:size(x, 2)
-            x[:, j] .= generic_fft(@view x[:, j])
-        end
-    else
-        for k in 1:size(x, 1)
-            x[k, :] .= generic_fft(@view x[k, :])
-        end
+function _generic_fft!(x, Rpre, Rpost)
+    for Ipost in Rpost, Ipre in Rpre
+        generic_fft!(@view x[Ipre, :, Ipost])
     end
     x
 end
 
-function generic_fft!(x::AbstractMatrix{T}, region) where T<:AbstractFloats
+function generic_fft!(x, region::Integer)
+    Rpre = CartesianIndices(size(x)[1:region-1])
+    Rpost = CartesianIndices(size(x)[region+1:end])
+    _generic_fft!(x, Rpre, Rpost)
+end
+
+function generic_fft!(x, region)
     for r in region
         generic_fft!(x, r)
     end
     x
 end
 
-generic_fft(x::AbstractMatrix{T}, region::Integer) where T<:AbstractFloats = generic_fft!(copy(x), region)
+generic_fft(x, region::Integer) = generic_fft!(copy(x), region)
 
-generic_fft(x::AbstractMatrix{T}, region=ntuple(identity, ndims(x))) where T<:AbstractFloats = generic_fft!(copy(x), region)
+generic_fft(x, region=ntuple(identity, ndims(x))) = generic_fft!(copy(x), region)
 
 function generic_fft(x::AbstractVector{T}) where T<:AbstractFloats
     n = length(x)
diff --git a/test/fft_tests.jl b/test/fft_tests.jl
index f48613b..69d9d1d 100644
--- a/test/fft_tests.jl
+++ b/test/fft_tests.jl
@@ -176,4 +176,14 @@ end
     X̃ = copy(X)
     @test generic_fft(X) ≈ generic_fft(X, 1:2) ≈ generic_fft!(X̃,1:2) ≈ fft(X)
     @test X̃ ≈ fft(X)
+
+    X = randn(ComplexF64, 5, 6, 7)
+    for d in 1:3
+        X̃ = copy(X)
+        @test generic_fft(X,d) ≈ generic_fft(X, d:d) ≈ generic_fft!(X̃, (d,)) ≈ fft(X,d)
+        @test X̃ ≈ fft(X,d)
+    end
+    X̃ = copy(X)
+    @test generic_fft(X) ≈ generic_fft(X, 1:ndims(X)) ≈ generic_fft!(X̃, 1:ndims(X̃)) ≈ fft(X)
+    @test X̃ ≈ fft(X)
 end

From 034e8740aa6b16812a62b42fcb73ea13cd3aadd9 Mon Sep 17 00:00:00 2001
From: Henry Gebhardt <hsggebhardt@fastmail.com>
Date: Sun, 1 Jun 2025 12:01:47 -0700
Subject: [PATCH 2/6] generic_fft!(X): set default for region parameter

---
 src/fft.jl        |  2 +-
 test/fft_tests.jl | 14 ++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/fft.jl b/src/fft.jl
index a5fca5c..de93e98 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -41,7 +41,7 @@ function generic_fft!(x, region::Integer)
     _generic_fft!(x, Rpre, Rpost)
 end
 
-function generic_fft!(x, region)
+function generic_fft!(x, region=ntuple(identity, ndims(x)))
     for r in region
         generic_fft!(x, r)
     end
diff --git a/test/fft_tests.jl b/test/fft_tests.jl
index 69d9d1d..c27d227 100644
--- a/test/fft_tests.jl
+++ b/test/fft_tests.jl
@@ -174,7 +174,7 @@ end
     @test generic_fft(X,2) ≈ generic_fft(X, 2:2) ≈ generic_fft!(X̃,2) ≈ fft(X,2)
     @test X̃ ≈ fft(X,2)
     X̃ = copy(X)
-    @test generic_fft(X) ≈ generic_fft(X, 1:2) ≈ generic_fft!(X̃,1:2) ≈ fft(X)
+    @test generic_fft(X) ≈ generic_fft(X, 1:2) ≈ generic_fft!(X̃) ≈ fft(X)
     @test X̃ ≈ fft(X)
 
     X = randn(ComplexF64, 5, 6, 7)
@@ -183,7 +183,13 @@ end
         @test generic_fft(X,d) ≈ generic_fft(X, d:d) ≈ generic_fft!(X̃, (d,)) ≈ fft(X,d)
         @test X̃ ≈ fft(X,d)
     end
-    X̃ = copy(X)
-    @test generic_fft(X) ≈ generic_fft(X, 1:ndims(X)) ≈ generic_fft!(X̃, 1:ndims(X̃)) ≈ fft(X)
-    @test X̃ ≈ fft(X)
+    X1 = copy(X)
+    X2 = copy(X)
+    @test generic_fft(X) ≈ generic_fft(X, 1:ndims(X)) ≈ generic_fft!(X1, 1:ndims(X1)) ≈ generic_fft!(X2) ≈ fft(X)
+    @test generic_fft(X, (1,3)) ≈ fft(X, (1,3))
+    @test generic_fft(X, (2,3)) ≈ fft(X, (2,3))
+    @test generic_fft(X, (1,2)) ≈ fft(X, (1,2))
+    @test generic_fft(X, (2,1)) ≈ fft(X, (2,1))
+    @test X1 ≈ fft(X)
+    @test X2 ≈ fft(X)
 end

From 344fae1858237ac2814ceb9c9ad767be8b264986 Mon Sep 17 00:00:00 2001
From: Henry Gebhardt <gebhardt@caltech.edu>
Date: Mon, 2 Jun 2025 14:46:04 -0700
Subject: [PATCH 3/6] fft: remove need for interlacing and deinterlacing

---
 src/fft.jl | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/fft.jl b/src/fft.jl
index de93e98..5f53ded 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -97,15 +97,14 @@ end
 # c_radix2.c in the GNU Scientific Library and four1 in the Numerical Recipes in C.
 # However, the trigonometric recurrence is improved for greater efficiency.
 # The algorithm starts with bit-reversal, then divides and conquers in-place.
-function generic_fft_pow2!(x::AbstractVector{T}) where T<:AbstractFloat
-    n,big2=length(x),2one(T)
+function generic_fft_pow2!(x::AbstractVector{Complex{T}}) where T<:AbstractFloat
+    n,big2=2length(x),2one(T)
     nn,j=n÷2,1
-    for i=1:2:n-1
+    for i=1:nn
         if j>i
             x[j], x[i] = x[i], x[j]
-            x[j+1], x[i+1] = x[i+1], x[j+1]
         end
-        m = nn
+        m = nn÷2
         while m ≥ 2 && j > m
             j -= m
             m = m÷2
@@ -121,9 +120,11 @@ function generic_fft_pow2!(x::AbstractVector{T}) where T<:AbstractFloat
         for m=1:2:logn-1
             for i=m:2logn:n
                 j=i+logn
-                mixr, mixi = wr*x[j]-wi*x[j+1], wr*x[j+1]+wi*x[j]
-                x[j], x[j+1] = x[i]-mixr, x[i+1]-mixi
-                x[i], x[i+1] = x[i]+mixr, x[i+1]+mixi
+                i = i ÷ 2 + 1
+                j = j ÷ 2 + 1
+                mixr, mixi = wr*real(x[j])-wi*imag(x[j]), wr*imag(x[j])+wi*real(x[j])
+                x[j] = real(x[i])-mixr + im * (imag(x[i])-mixi)
+                x[i] = real(x[i])+mixr + im * (imag(x[i])+mixi)
             end
             wr = (wtemp=wr)*wpr-wi*wpi+wr
             wi = wi*wpr+wtemp*wpi+wi
@@ -134,16 +135,16 @@ function generic_fft_pow2!(x::AbstractVector{T}) where T<:AbstractFloat
 end
 
 function generic_fft_pow2(x::AbstractVector{Complex{T}}) where T<:AbstractFloat
-    y = interlace_complex(x)
-    generic_fft_pow2!(y)
-    return deinterlace_complex(y)
+    return generic_fft_pow2!(copy(x))
 end
 generic_fft_pow2(x::AbstractVector{T}) where T<:AbstractFloat = generic_fft_pow2(complex(x))
 
 function generic_ifft_pow2(x::AbstractVector{Complex{T}}) where T<:AbstractFloat
-    y = interlace_complex(x, -)
+    y = conj.(x)  # always create copy (conj(x) doesn't copy when eltype(x) is real)
     generic_fft_pow2!(y)
-    return ldiv!(T(length(x)), deinterlace_complex(y, -))
+    N = T(length(x))
+    @. y = conj(y) / N
+    return y
 end
 
 function generic_dct(x::StridedVector{T}, region::Integer) where T<:AbstractFloats

From d84deff440d66170724b3183a390aa100e72c01b Mon Sep 17 00:00:00 2001
From: Henry Gebhardt <gebhardt@caltech.edu>
Date: Mon, 2 Jun 2025 15:13:17 -0700
Subject: [PATCH 4/6] fft: use complex numbers available in Julia

---
 src/fft.jl | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/src/fft.jl b/src/fft.jl
index 5f53ded..b602991 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -114,22 +114,19 @@ function generic_fft_pow2!(x::AbstractVector{Complex{T}}) where T<:AbstractFloat
     logn = 2
     while logn < n
         θ=-big2/logn
-        wtemp = sinpi(θ/2)
-        wpr, wpi = -2wtemp^2, sinpi(θ)
-        wr, wi = one(T), zero(T)
-        for m=1:2:logn-1
-            for i=m:2logn:n
-                j=i+logn
-                i = i ÷ 2 + 1
-                j = j ÷ 2 + 1
-                mixr, mixi = wr*real(x[j])-wi*imag(x[j]), wr*imag(x[j])+wi*real(x[j])
-                x[j] = real(x[i])-mixr + im * (imag(x[i])-mixi)
-                x[i] = real(x[i])+mixr + im * (imag(x[i])+mixi)
+        wp = complex(-2sinpi(θ/2)^2, sinpi(θ))
+        w = complex(one(T))
+        lognn = logn ÷ 2
+        for m=1:lognn
+            for i=m:logn:nn
+                j=i+lognn
+                mix = w * x[j]
+                x[j] = x[i] - mix
+                x[i] = x[i] + mix
             end
-            wr = (wtemp=wr)*wpr-wi*wpi+wr
-            wi = wi*wpr+wtemp*wpi+wi
+            w = w * (1 + wp)
         end
-        logn = logn << 1
+        logn = 2logn
     end
     return x
 end

From 07f962795e4473bc2c76686dddd40111d2f21e5b Mon Sep 17 00:00:00 2001
From: Henry Gebhardt <gebhardt@caltech.edu>
Date: Mon, 2 Jun 2025 16:04:32 -0700
Subject: [PATCH 5/6] test/fft_tests.jl: reduce allocations in generic_fft!()

---
 src/fft.jl        | 28 ++++++++--------------------
 test/fft_tests.jl |  9 +++++++++
 2 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/src/fft.jl b/src/fft.jl
index b602991..e7fe7c5 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -8,26 +8,6 @@ const ComplexFloats = Complex{T} where T<:AbstractFloat
 # The following implements Bluestein's algorithm, following http://www.dsprelated.com/dspbooks/mdft/Bluestein_s_FFT_Algorithm.html
 # To add more types, add them in the union of the function's signature.
 
-function generic_fft(x::AbstractVector{T}, region::Integer) where T<:AbstractFloats
-    @assert region == 1
-    generic_fft(x)
-end
-
-function generic_fft!(x::AbstractVector{T}, region::Integer=1) where T<:AbstractFloats
-    @assert region == 1
-    copyto!(x, generic_fft(x))
-end
-
-function generic_fft(x::AbstractVector{T}, region) where {T<:AbstractFloats}
-    @assert all(==(1), region)
-    generic_fft(x)
-end
-
-function generic_fft!(x::AbstractVector{T}, region) where {T<:AbstractFloats}
-    @assert all(==(1), region)
-    copyto!(x, generic_fft(x))
-end
-
 function _generic_fft!(x, Rpre, Rpost)
     for Ipost in Rpost, Ipre in Rpre
         generic_fft!(@view x[Ipre, :, Ipost])
@@ -36,6 +16,7 @@ function _generic_fft!(x, Rpre, Rpost)
 end
 
 function generic_fft!(x, region::Integer)
+    @assert 1 <= region <= ndims(x)
     Rpre = CartesianIndices(size(x)[1:region-1])
     Rpost = CartesianIndices(size(x)[region+1:end])
     _generic_fft!(x, Rpre, Rpost)
@@ -52,6 +33,13 @@ generic_fft(x, region::Integer) = generic_fft!(copy(x), region)
 
 generic_fft(x, region=ntuple(identity, ndims(x))) = generic_fft!(copy(x), region)
 
+function generic_fft!(x::AbstractVector{Complex{T}}) where {T<:AbstractFloat}
+    if ispow2(length(x))
+        return generic_fft_pow2!(x)
+    end
+    return copyto!(x, generic_fft(x))
+end
+
 function generic_fft(x::AbstractVector{T}) where T<:AbstractFloats
     n = length(x)
     ispow2(n) && return generic_fft_pow2(x)
diff --git a/test/fft_tests.jl b/test/fft_tests.jl
index c27d227..c34d30b 100644
--- a/test/fft_tests.jl
+++ b/test/fft_tests.jl
@@ -192,4 +192,13 @@ end
     @test generic_fft(X, (2,1)) ≈ fft(X, (2,1))
     @test X1 ≈ fft(X)
     @test X2 ≈ fft(X)
+
+    N = 32
+    A1 = randn(ComplexF64, N)
+    @allocations generic_fft!(A1)  # compile
+    @test 0 == @allocations generic_fft!(A1)
+
+    A2 = randn(ComplexF64, N, N, N)
+    @allocations generic_fft!(A2)  # compile
+    @test N > @allocations generic_fft!(A2)  # a few allocations is OK
 end

From cb908d0f420645af87d42c60ddcbe196110879b8 Mon Sep 17 00:00:00 2001
From: Henry Gebhardt <gebhardt@caltech.edu>
Date: Mon, 2 Jun 2025 23:19:08 -0700
Subject: [PATCH 6/6] generic_fft!: permute arrays for faster memory access

---
 src/fft.jl        | 71 ++++++++++++++++++++++++++++++++++++++---------
 test/fft_tests.jl |  2 +-
 2 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/src/fft.jl b/src/fft.jl
index e7fe7c5..26910bc 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -8,38 +8,83 @@ const ComplexFloats = Complex{T} where T<:AbstractFloat
 # The following implements Bluestein's algorithm, following http://www.dsprelated.com/dspbooks/mdft/Bluestein_s_FFT_Algorithm.html
 # To add more types, add them in the union of the function's signature.
 
-function _generic_fft!(x, Rpre, Rpost)
-    for Ipost in Rpost, Ipre in Rpre
-        generic_fft!(@view x[Ipre, :, Ipost])
+function generic_fft!(x::AbstractVector{Complex{T}}) where {T<:AbstractFloat}
+    if ispow2(length(x))
+        return generic_fft_pow2!(x)
+    end
+    return copyto!(x, generic_fft(x))
+end
+
+generic_fft!(x::AbstractVector) = copyto!(x, generic_fft(x))
+
+function generic_fft!(x::AbstractVector{Complex{T}}, region::Integer) where {T<:AbstractFloat}
+    @assert region == 1
+    generic_fft!(x)
+end
+
+function _generic_fft_first_dim!(x, Ipost)
+    Threads.@threads for I in Ipost
+        generic_fft!(@view x[:, I])
     end
     x
 end
 
 function generic_fft!(x, region::Integer)
     @assert 1 <= region <= ndims(x)
-    Rpre = CartesianIndices(size(x)[1:region-1])
-    Rpost = CartesianIndices(size(x)[region+1:end])
-    _generic_fft!(x, Rpre, Rpost)
+
+    perm = ntuple(ndims(x)) do i
+        if i == 1
+            region
+        elseif i == region
+            1
+        else
+            i
+        end
+    end
+
+    y = permutedims(x, perm)
+
+    Rright = CartesianIndices(size(y)[2:end])
+    y = _generic_fft_first_dim!(y, Rright)
+
+    permutedims!(x, y, perm)
+    x
 end
 
-function generic_fft!(x, region=ntuple(identity, ndims(x)))
+function generic_fft!(x, region)
     for r in region
         generic_fft!(x, r)
     end
     x
 end
 
-generic_fft(x, region::Integer) = generic_fft!(copy(x), region)
+function generic_fft!(x)
+    y = similar(x)
+    z = x
+    sz = size(x)
+    perm = ((2:ndims(x))..., 1)
 
-generic_fft(x, region=ntuple(identity, ndims(x))) = generic_fft!(copy(x), region)
+    for r in 1:ndims(x)
+        Rright = CartesianIndices(size(z)[2:end])
+        _generic_fft_first_dim!(z, Rright)
 
-function generic_fft!(x::AbstractVector{Complex{T}}) where {T<:AbstractFloat}
-    if ispow2(length(x))
-        return generic_fft_pow2!(x)
+        sz = (sz[2:end]..., sz[1])
+        y = reshape(y, sz)
+        permutedims!(y, z, perm)
+        z, y = y, z
     end
-    return copyto!(x, generic_fft(x))
+
+    if isodd(ndims(x))
+        x .= z
+    end
+    x
 end
 
+
+generic_fft(x, region) = generic_fft!(copy(x), region)
+
+generic_fft(x) = generic_fft!(copy(x))
+
 function generic_fft(x::AbstractVector{T}) where T<:AbstractFloats
     n = length(x)
     ispow2(n) && return generic_fft_pow2(x)
diff --git a/test/fft_tests.jl b/test/fft_tests.jl
index c34d30b..266451e 100644
--- a/test/fft_tests.jl
+++ b/test/fft_tests.jl
@@ -200,5 +200,5 @@ end
 
     A2 = randn(ComplexF64, N, N, N)
     @allocations generic_fft!(A2)  # compile
-    @test N > @allocations generic_fft!(A2)  # a few allocations is OK
+    @test N+150 > @allocations generic_fft!(A2)  # a few allocations is OK
 end