From 8cd3c7fed138a43b8d6f1be84d608c49ef37cb58 Mon Sep 17 00:00:00 2001 From: Kristof Date: Fri, 28 Nov 2025 17:33:09 +0100 Subject: [PATCH 1/4] add align on blocks so that we process on simd aligned to allow for future optimizations and auto vectorization --- src/avx2.rs | 2 +- src/avx2/fdct.rs | 4 ++-- src/encoder.rs | 58 ++++++++++++++++++++++++++++++------------------ src/fdct.rs | 26 +++++++++++----------- src/lib.rs | 8 +++++-- src/writer.rs | 10 ++++----- 6 files changed, 64 insertions(+), 44 deletions(-) diff --git a/src/avx2.rs b/src/avx2.rs index 48aadb8..53a8706 100644 --- a/src/avx2.rs +++ b/src/avx2.rs @@ -9,7 +9,7 @@ pub(crate) struct AVX2Operations; impl Operations for AVX2Operations { #[inline(always)] - fn fdct(data: &mut [i16; 64]) { + fn fdct(data: &mut Block) { fdct_avx2(data); } } diff --git a/src/avx2/fdct.rs b/src/avx2/fdct.rs index b7caa38..071127f 100644 --- a/src/avx2/fdct.rs +++ b/src/avx2/fdct.rs @@ -57,14 +57,14 @@ const DESCALE_P1: i32 = CONST_BITS - PASS1_BITS; const DESCALE_P2: i32 = CONST_BITS + PASS1_BITS; #[inline(always)] -pub fn fdct_avx2(data: &mut [i16; 64]) { +pub fn fdct_avx2(data: &mut Block) { unsafe { fdct_avx2_internal(data); } } #[target_feature(enable = "avx2")] -unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) { +unsafe fn fdct_avx2_internal(data: &mut Block) { #[allow(non_snake_case)] #[inline(always)] unsafe fn PW_F130_F054_MF130_F054() -> __m256i { diff --git a/src/encoder.rs b/src/encoder.rs index eded305..e7ccfa5 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -34,6 +34,24 @@ pub enum JpegColorType { Ycck, } +#[derive(Copy, Clone)] +#[repr(C, align(32))] +pub(crate) struct Block { + pub data: [i16; 64], +} + +impl Block { + pub const fn new(data: [i16; 64]) -> Self { + Block { data } + } +} + +impl Default for Block { + fn default() -> Self { + Block { data: [0i16; 64] } + } +} + impl JpegColorType { pub(crate) fn get_num_components(self) -> usize { use JpegColorType::*; @@ -729,16 +747,14 @@ impl Encoder { &row[i], block_x * 8 * max_h_sampling + (h_offset * 8), v_offset * 8, - max_h_sampling - / component.horizontal_sampling_factor as usize, - max_v_sampling - / component.vertical_sampling_factor as usize, + max_h_sampling / component.horizontal_sampling_factor as usize, + max_v_sampling / component.vertical_sampling_factor as usize, buffer_width, ); OP::fdct(&mut block); - let mut q_block = [0i16; 64]; + let mut q_block = Block::default(); OP::quantize_block( &block, @@ -753,7 +769,7 @@ impl Encoder { &self.huffman_tables[component.ac_huffman_table as usize].1, )?; - prev_dc[i] = q_block[0]; + prev_dc[i] = q_block.data[0]; } } } @@ -813,7 +829,7 @@ impl Encoder { &self.huffman_tables[component.ac_huffman_table as usize].1, )?; - prev_dc = block[0]; + prev_dc = block.data[0]; if restart_interval > 0 { if restarts_to_go == 0 { @@ -869,12 +885,12 @@ impl Encoder { } self.writer.write_dc( - block[0], + block.data[0], prev_dc, &self.huffman_tables[component.dc_huffman_table as usize].0, )?; - prev_dc = block[0]; + prev_dc = block.data[0]; if restart_interval > 0 { if restarts_to_go == 0 { @@ -946,7 +962,7 @@ impl Encoder { &mut self, image: &I, q_tables: &[QuantizationTable; 2], - ) -> [Vec<[i16; 64]>; 4] { + ) -> [Vec; 4] { let width = image.width(); let height = image.height(); @@ -1008,7 +1024,7 @@ impl Encoder { OP::fdct(&mut block); - let mut q_block = [0i16; 64]; + let mut q_block = Block::default(); OP::quantize_block( &block, @@ -1023,7 +1039,7 @@ impl Encoder { blocks } - fn init_block_buffers(&mut self, buffer_size: usize) -> [Vec<[i16; 64]>; 4] { + fn init_block_buffers(&mut self, buffer_size: usize) -> [Vec; 4] { // To simplify the code and to give the compiler more infos to optimize stuff we always initialize 4 components // Resource overhead should be minimal because an empty Vec doesn't allocate @@ -1051,7 +1067,7 @@ impl Encoder { } // Create new huffman tables optimized for this image - fn optimize_huffman_table(&mut self, blocks: &[Vec<[i16; 64]>; 4]) { + fn optimize_huffman_table(&mut self, blocks: &[Vec; 4]) { // TODO: Find out if it's possible to reuse some code from the writer let max_tables = self.components.len().min(2) as u8; @@ -1074,7 +1090,7 @@ impl Encoder { debug_assert!(!blocks[i].is_empty()); for block in &blocks[i] { - let value = block[0]; + let value = block.data[0]; let diff = value - prev_dc; let num_bits = get_num_bits(diff); @@ -1106,7 +1122,7 @@ impl Encoder { for block in &blocks[i] { let mut zero_run = 0; - for &value in &block[start..end] { + for &value in &block.data[start..end] { if value == 0 { zero_run += 1; } else { @@ -1132,7 +1148,7 @@ impl Encoder { for block in &blocks[i] { let mut zero_run = 0; - for &value in &block[1..] { + for &value in &block.data[1..] { if value == 0 { zero_run += 1; } else { @@ -1194,7 +1210,7 @@ fn get_block( col_stride: usize, row_stride: usize, width: usize, -) -> [i16; 64] { +) -> Block { let mut block = [0i16; 64]; for y in 0..8 { @@ -1206,7 +1222,7 @@ fn get_block( } } - block + Block::new(block) } fn ceil_div(value: usize, div: usize) -> usize { @@ -1230,15 +1246,15 @@ fn get_num_bits(mut value: i16) -> u8 { pub(crate) trait Operations { #[inline(always)] - fn fdct(data: &mut [i16; 64]) { + fn fdct(data: &mut Block) { fdct(data); } #[inline(always)] - fn quantize_block(block: &[i16; 64], q_block: &mut [i16; 64], table: &QuantizationTable) { + fn quantize_block(block: &Block, q_block: &mut Block, table: &QuantizationTable) { for i in 0..64 { let z = ZIGZAG[i] as usize & 0x3f; - q_block[i] = table.quantize(block[z], z); + q_block.data[i] = table.quantize(block.data[z], z); } } } diff --git a/src/fdct.rs b/src/fdct.rs index 7d0273e..d406e31 100644 --- a/src/fdct.rs +++ b/src/fdct.rs @@ -71,6 +71,8 @@ * scaled fixed-point arithmetic, with a minimal number of shifts. */ +use crate::encoder::Block; + const CONST_BITS: i32 = 13; const PASS1_BITS: i32 = 2; @@ -102,7 +104,9 @@ fn into_el(v: i32) -> i16 { #[allow(clippy::erasing_op)] #[allow(clippy::identity_op)] -pub fn fdct(data: &mut [i16; 64]) { +pub fn fdct(data: &mut Block) { + let data = &mut data.data; + /* Pass 1: process rows. */ /* Note results are scaled up by sqrt(8) compared to a true DCT; */ /* furthermore, we scale the results by 2**PASS1_BITS. */ @@ -134,14 +138,8 @@ pub fn fdct(data: &mut [i16; 64]) { data2[offset + 4] = (tmp10 - tmp11) << PASS1_BITS; let z1 = (tmp12 + tmp13) * FIX_0_541196100; - data2[offset + 2] = descale( - z1 + (tmp13 * FIX_0_765366865), - CONST_BITS - PASS1_BITS, - ); - data2[offset + 6] = descale( - z1 + (tmp12 * -FIX_1_847759065), - CONST_BITS - PASS1_BITS, - ); + data2[offset + 2] = descale(z1 + (tmp13 * FIX_0_765366865), CONST_BITS - PASS1_BITS); + data2[offset + 6] = descale(z1 + (tmp12 * -FIX_1_847759065), CONST_BITS - PASS1_BITS); /* Odd part per figure 8 --- note paper omits factor of sqrt(2). * cK represents cos(K*pi/16). @@ -244,6 +242,8 @@ mod tests { // Inputs and outputs are taken from libjpegs jpeg_fdct_islow for a typical image + use crate::encoder::Block; + use super::fdct; const INPUT1: [i16; 64] = [ @@ -275,12 +275,12 @@ mod tests { #[test] pub fn test_fdct_libjpeg() { - let mut i1 = INPUT1.clone(); + let mut i1 = Block::new(INPUT1.clone()); fdct(&mut i1); - assert_eq!(i1, OUTPUT1); + assert_eq!(i1.data, OUTPUT1); - let mut i2 = INPUT2.clone(); + let mut i2 = Block::new(INPUT2.clone()); fdct(&mut i2); - assert_eq!(i2, OUTPUT2); + assert_eq!(i2.data, OUTPUT2); } } diff --git a/src/lib.rs b/src/lib.rs index 9fc0a0e..48107b2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -48,10 +48,14 @@ pub use image_buffer::{cmyk_to_ycck, rgb_to_ycbcr, ImageBuffer}; pub use quantization::QuantizationTableType; pub use writer::{Density, JfifWrite}; +#[cfg(all( + feature = "benchmark", + feature = "simd", + any(target_arch = "x86", target_arch = "x86_64") +))] +pub use avx2::fdct_avx2; #[cfg(feature = "benchmark")] pub use fdct::fdct; -#[cfg(all(feature = "benchmark", feature = "simd", any(target_arch = "x86", target_arch = "x86_64")))] -pub use avx2::fdct_avx2; #[cfg(test)] mod tests { diff --git a/src/writer.rs b/src/writer.rs index e818b67..876923f 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -1,4 +1,4 @@ -use crate::encoder::Component; +use crate::encoder::{Block, Component}; use crate::huffman::{CodingClass, HuffmanTable}; use crate::marker::{Marker, SOFType}; use crate::quantization::QuantizationTable; @@ -292,12 +292,12 @@ impl JfifWriter { pub fn write_block( &mut self, - block: &[i16; 64], + block: &Block, prev_dc: i16, dc_table: &HuffmanTable, ac_table: &HuffmanTable, ) -> Result<(), EncodingError> { - self.write_dc(block[0], prev_dc, dc_table)?; + self.write_dc(block.data[0], prev_dc, dc_table)?; self.write_ac_block(block, 1, 64, ac_table) } @@ -317,14 +317,14 @@ impl JfifWriter { pub fn write_ac_block( &mut self, - block: &[i16; 64], + block: &Block, start: usize, end: usize, ac_table: &HuffmanTable, ) -> Result<(), EncodingError> { let mut zero_run = 0; - for &value in &block[start..end] { + for &value in &block.data[start..end] { if value == 0 { zero_run += 1; } else { From f7565606aebde2df9cb482b2086600288d0b437a Mon Sep 17 00:00:00 2001 From: Kristof Date: Fri, 28 Nov 2025 17:36:32 +0100 Subject: [PATCH 2/4] aligned block --- src/encoder.rs | 28 ++++++++++++++-------------- src/fdct.rs | 10 +++++----- src/image_buffer.rs | 22 +++++----------------- src/writer.rs | 6 +++--- 4 files changed, 27 insertions(+), 39 deletions(-) diff --git a/src/encoder.rs b/src/encoder.rs index e7ccfa5..ba0c331 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -36,19 +36,19 @@ pub enum JpegColorType { #[derive(Copy, Clone)] #[repr(C, align(32))] -pub(crate) struct Block { +pub(crate) struct AlignedBlock { pub data: [i16; 64], } -impl Block { +impl AlignedBlock { pub const fn new(data: [i16; 64]) -> Self { - Block { data } + AlignedBlock { data } } } -impl Default for Block { +impl Default for AlignedBlock { fn default() -> Self { - Block { data: [0i16; 64] } + AlignedBlock { data: [0i16; 64] } } } @@ -754,7 +754,7 @@ impl Encoder { OP::fdct(&mut block); - let mut q_block = Block::default(); + let mut q_block = AlignedBlock::default(); OP::quantize_block( &block, @@ -962,7 +962,7 @@ impl Encoder { &mut self, image: &I, q_tables: &[QuantizationTable; 2], - ) -> [Vec; 4] { + ) -> [Vec; 4] { let width = image.width(); let height = image.height(); @@ -1024,7 +1024,7 @@ impl Encoder { OP::fdct(&mut block); - let mut q_block = Block::default(); + let mut q_block = AlignedBlock::default(); OP::quantize_block( &block, @@ -1039,7 +1039,7 @@ impl Encoder { blocks } - fn init_block_buffers(&mut self, buffer_size: usize) -> [Vec; 4] { + fn init_block_buffers(&mut self, buffer_size: usize) -> [Vec; 4] { // To simplify the code and to give the compiler more infos to optimize stuff we always initialize 4 components // Resource overhead should be minimal because an empty Vec doesn't allocate @@ -1067,7 +1067,7 @@ impl Encoder { } // Create new huffman tables optimized for this image - fn optimize_huffman_table(&mut self, blocks: &[Vec; 4]) { + fn optimize_huffman_table(&mut self, blocks: &[Vec; 4]) { // TODO: Find out if it's possible to reuse some code from the writer let max_tables = self.components.len().min(2) as u8; @@ -1210,7 +1210,7 @@ fn get_block( col_stride: usize, row_stride: usize, width: usize, -) -> Block { +) -> AlignedBlock { let mut block = [0i16; 64]; for y in 0..8 { @@ -1222,7 +1222,7 @@ fn get_block( } } - Block::new(block) + AlignedBlock::new(block) } fn ceil_div(value: usize, div: usize) -> usize { @@ -1246,12 +1246,12 @@ fn get_num_bits(mut value: i16) -> u8 { pub(crate) trait Operations { #[inline(always)] - fn fdct(data: &mut Block) { + fn fdct(data: &mut AlignedBlock) { fdct(data); } #[inline(always)] - fn quantize_block(block: &Block, q_block: &mut Block, table: &QuantizationTable) { + fn quantize_block(block: &AlignedBlock, q_block: &mut AlignedBlock, table: &QuantizationTable) { for i in 0..64 { let z = ZIGZAG[i] as usize & 0x3f; q_block.data[i] = table.quantize(block.data[z], z); diff --git a/src/fdct.rs b/src/fdct.rs index d406e31..ff76a16 100644 --- a/src/fdct.rs +++ b/src/fdct.rs @@ -71,7 +71,7 @@ * scaled fixed-point arithmetic, with a minimal number of shifts. */ -use crate::encoder::Block; +use crate::encoder::AlignedBlock; const CONST_BITS: i32 = 13; const PASS1_BITS: i32 = 2; @@ -104,7 +104,7 @@ fn into_el(v: i32) -> i16 { #[allow(clippy::erasing_op)] #[allow(clippy::identity_op)] -pub fn fdct(data: &mut Block) { +pub fn fdct(data: &mut AlignedBlock) { let data = &mut data.data; /* Pass 1: process rows. */ @@ -242,7 +242,7 @@ mod tests { // Inputs and outputs are taken from libjpegs jpeg_fdct_islow for a typical image - use crate::encoder::Block; + use crate::encoder::AlignedBlock; use super::fdct; @@ -275,11 +275,11 @@ mod tests { #[test] pub fn test_fdct_libjpeg() { - let mut i1 = Block::new(INPUT1.clone()); + let mut i1 = AlignedBlock::new(INPUT1.clone()); fdct(&mut i1); assert_eq!(i1.data, OUTPUT1); - let mut i2 = Block::new(INPUT2.clone()); + let mut i2 = AlignedBlock::new(INPUT2.clone()); fdct(&mut i2); assert_eq!(i2.data, OUTPUT2); } diff --git a/src/image_buffer.rs b/src/image_buffer.rs index b176966..3c738ca 100644 --- a/src/image_buffer.rs +++ b/src/image_buffer.rs @@ -122,11 +122,11 @@ impl<'a> ImageBuffer for GrayImage<'a> { } #[inline(always)] -fn get_line(data: &[u8], y: u16, width:u16, num_colors: usize) -> &[u8] { - let width= usize::from(width); +fn get_line(data: &[u8], y: u16, width: u16, num_colors: usize) -> &[u8] { + let width = usize::from(width); let y = usize::from(y); - let start = y *width * num_colors; + let start = y * width * num_colors; let end = start + width * num_colors; &data[start..end] @@ -154,11 +154,7 @@ macro_rules! ycbcr_image { let line = get_line(self.0, y, self.width(), $num_colors); for pixel in line.chunks_exact($num_colors) { - let (y, cb, cr) = rgb_to_ycbcr( - pixel[$o1], - pixel[$o2], - pixel[$o3], - ); + let (y, cb, cr) = rgb_to_ycbcr(pixel[$o1], pixel[$o2], pixel[$o3]); buffers[0].push(y); buffers[1].push(cb); @@ -246,13 +242,7 @@ impl<'a> ImageBuffer for CmykAsYcckImage<'a> { let line = get_line(self.0, y, self.width(), 4); for pixel in line.chunks_exact(4) { - - let (y, cb, cr, k) = cmyk_to_ycck( - pixel[0], - pixel[1], - pixel[2], - pixel[3], - ); + let (y, cb, cr, k) = cmyk_to_ycck(pixel[0], pixel[1], pixel[2], pixel[3]); buffers[0].push(y); buffers[1].push(cb); @@ -281,7 +271,6 @@ impl<'a> ImageBuffer for YcckImage<'a> { let line = get_line(self.0, y, self.width(), 4); for pixel in line.chunks_exact(4) { - buffers[0].push(pixel[0]); buffers[1].push(pixel[1]); buffers[2].push(pixel[2]); @@ -301,7 +290,6 @@ mod tests { #[test] fn test_rgb_to_ycbcr() { - assert_rgb_to_ycbcr([0, 0, 0], [0, 128, 128]); assert_rgb_to_ycbcr([255, 255, 255], [255, 128, 128]); assert_rgb_to_ycbcr([255, 0, 0], [76, 85, 255]); diff --git a/src/writer.rs b/src/writer.rs index 876923f..23f6b43 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -1,4 +1,4 @@ -use crate::encoder::{Block, Component}; +use crate::encoder::{AlignedBlock, Component}; use crate::huffman::{CodingClass, HuffmanTable}; use crate::marker::{Marker, SOFType}; use crate::quantization::QuantizationTable; @@ -292,7 +292,7 @@ impl JfifWriter { pub fn write_block( &mut self, - block: &Block, + block: &AlignedBlock, prev_dc: i16, dc_table: &HuffmanTable, ac_table: &HuffmanTable, @@ -317,7 +317,7 @@ impl JfifWriter { pub fn write_ac_block( &mut self, - block: &Block, + block: &AlignedBlock, start: usize, end: usize, ac_table: &HuffmanTable, From 644411d4bba449f3271eb7cdd55ff1610a5667bf Mon Sep 17 00:00:00 2001 From: Kristof Date: Fri, 28 Nov 2025 17:42:56 +0100 Subject: [PATCH 3/4] fix avx2 --- src/avx2.rs | 4 ++-- src/avx2/fdct.rs | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/avx2.rs b/src/avx2.rs index a760490..f8eb103 100644 --- a/src/avx2.rs +++ b/src/avx2.rs @@ -1,7 +1,7 @@ mod fdct; mod ycbcr; -use crate::encoder::Operations; +use crate::encoder::{AlignedBlock, Operations}; pub use fdct::fdct_avx2; pub use ycbcr::*; @@ -9,7 +9,7 @@ pub(crate) struct AVX2Operations; impl Operations for AVX2Operations { #[inline(always)] - fn fdct(data: &mut Block) { + fn fdct(data: &mut AlignedBlock) { fdct_avx2(data); } } diff --git a/src/avx2/fdct.rs b/src/avx2/fdct.rs index 071127f..065dab9 100644 --- a/src/avx2/fdct.rs +++ b/src/avx2/fdct.rs @@ -25,6 +25,8 @@ use core::arch::x86_64::{ _mm256_unpacklo_epi16, _mm256_unpacklo_epi32, }; +use crate::encoder::AlignedBlock; + const CONST_BITS: i32 = 13; const PASS1_BITS: i32 = 2; @@ -57,14 +59,14 @@ const DESCALE_P1: i32 = CONST_BITS - PASS1_BITS; const DESCALE_P2: i32 = CONST_BITS + PASS1_BITS; #[inline(always)] -pub fn fdct_avx2(data: &mut Block) { +pub fn fdct_avx2(data: &mut AlignedBlock) { unsafe { fdct_avx2_internal(data); } } #[target_feature(enable = "avx2")] -unsafe fn fdct_avx2_internal(data: &mut Block) { +unsafe fn fdct_avx2_internal(data: &mut AlignedBlock) { #[allow(non_snake_case)] #[inline(always)] unsafe fn PW_F130_F054_MF130_F054() -> __m256i { @@ -412,7 +414,7 @@ unsafe fn fdct_avx2_internal(data: &mut Block) { (t1, t2, t3, t4) } - let in_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.as_mut_ptr()); + let in_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.data.as_mut_ptr()); let ymm4 = _mm256_loadu_si256(in_data); let ymm5 = _mm256_loadu_si256(in_data.add(1)); @@ -451,7 +453,7 @@ unsafe fn fdct_avx2_internal(data: &mut Block) { let ymm6 = _mm256_permute2x128_si256(ymm0, ymm4, 0x31); // ymm6=data4_5 let ymm7 = _mm256_permute2x128_si256(ymm2, ymm4, 0x21); // ymm7=data6_7 - let out_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.as_mut_ptr()); + let out_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.data.as_mut_ptr()); _mm256_storeu_si256(out_data, ymm3); _mm256_storeu_si256(out_data.add(1), ymm5); From 1c655d0d5bf55465d466f44bbfd62157f21c5bf5 Mon Sep 17 00:00:00 2001 From: Kristof Date: Sat, 29 Nov 2025 18:58:07 +0100 Subject: [PATCH 4/4] fix merge --- src/avx2/fdct.rs | 41 ++++++++++++++++++++++++++++++----------- src/avx2/ycbcr.rs | 4 +++- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/src/avx2/fdct.rs b/src/avx2/fdct.rs index 9c4abf8..061f098 100644 --- a/src/avx2/fdct.rs +++ b/src/avx2/fdct.rs @@ -66,7 +66,8 @@ pub fn fdct_avx2(data: &mut AlignedBlock) { } #[target_feature(enable = "avx2")] -unsafe fn fdct_avx2_internal(data: &mut AlignedBlock) { +fn fdct_avx2_internal(data: &mut AlignedBlock) { + #[target_feature(enable = "avx2")] #[allow(non_snake_case)] #[inline] fn PW_F130_F054_MF130_F054() -> __m256i { @@ -421,12 +422,12 @@ unsafe fn fdct_avx2_internal(data: &mut AlignedBlock) { (t1, t2, t3, t4) } - let in_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.data.as_mut_ptr()); + let data = &mut data.data; - let ymm4 = _mm256_loadu_si256(in_data); - let ymm5 = _mm256_loadu_si256(in_data.add(1)); - let ymm6 = _mm256_loadu_si256(in_data.add(2)); - let ymm7 = _mm256_loadu_si256(in_data.add(3)); + let ymm4 = avx_load(&data[0..16]); + let ymm5 = avx_load(&data[16..32]); + let ymm6 = avx_load(&data[32..48]); + let ymm7 = avx_load(&data[48..64]); // ---- Pass 1: process rows. // ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) @@ -460,10 +461,28 @@ unsafe fn fdct_avx2_internal(data: &mut AlignedBlock) { let ymm6 = _mm256_permute2x128_si256(ymm0, ymm4, 0x31); // ymm6=data4_5 let ymm7 = _mm256_permute2x128_si256(ymm2, ymm4, 0x21); // ymm7=data6_7 - let out_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.data.as_mut_ptr()); + avx_store(ymm3, &mut data[0..16]); + avx_store(ymm5, &mut data[16..32]); + avx_store(ymm6, &mut data[32..48]); + avx_store(ymm7, &mut data[48..64]); +} + +/// Safe wrapper for an unaligned AVX load +#[target_feature(enable = "avx2")] +#[inline] +fn avx_load(input: &[i16]) -> __m256i { + assert!(input.len() == 16); + assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>()); + // SAFETY: we've checked sizes above. The load is unaligned, so no alignment requirements. + unsafe { _mm256_loadu_si256(input.as_ptr() as *const __m256i) } +} - _mm256_storeu_si256(out_data, ymm3); - _mm256_storeu_si256(out_data.add(1), ymm5); - _mm256_storeu_si256(out_data.add(2), ymm6); - _mm256_storeu_si256(out_data.add(3), ymm7); +/// Safe wrapper for an unaligned AVX store +#[target_feature(enable = "avx2")] +#[inline] +fn avx_store(input: __m256i, output: &mut [i16]) { + assert!(output.len() == 16); + assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>()); + // SAFETY: we've checked sizes above. The load is unaligned, so no alignment requirements. + unsafe { _mm256_storeu_si256(output.as_mut_ptr() as *mut __m256i, input) } } diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs index 51f0328..b35d329 100644 --- a/src/avx2/ycbcr.rs +++ b/src/avx2/ycbcr.rs @@ -229,7 +229,9 @@ mod tests { for (i, pixel) in scalar_result.iter().copied().enumerate() { let avx_pixel: [u8; 3] = [buffers[0][i], buffers[1][i], buffers[2][i]]; if pixel != avx_pixel { - panic!("Mismatch at index {i}: scalar result is {pixel:?}, avx result is {avx_pixel:?}"); + panic!( + "Mismatch at index {i}: scalar result is {pixel:?}, avx result is {avx_pixel:?}" + ); } } }