From e6aa9c94dc774692760d102e3ec36ea572e43c94 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 1 Aug 2022 14:39:27 +0200 Subject: [PATCH 01/80] Bspline filters fixed for CUDA pipeline --- src/algorithm/APRConverter.hpp | 10 +- src/algorithm/ComputeGradient.hpp | 392 +++++++++------------------ src/algorithm/ComputeGradientCuda.cu | 133 +++++---- src/algorithm/bsplineXdir.cuh | 56 ++-- src/algorithm/bsplineYdir.cuh | 110 +++++--- src/algorithm/bsplineZdir.cuh | 55 ++-- src/algorithm/cudaMisc.cuh | 66 +++++ test/CMakeLists.txt | 2 +- test/ComputeGradientCudaTest.cpp | 155 +++++++++++ test/ComputeGradientTest.cpp | 166 +----------- test/TestTools.hpp | 10 +- 11 files changed, 589 insertions(+), 566 deletions(-) create mode 100644 src/algorithm/cudaMisc.cuh create mode 100644 test/ComputeGradientCudaTest.cpp diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp index d6728f5f..988bf337 100644 --- a/src/algorithm/APRConverter.hpp +++ b/src/algorithm/APRConverter.hpp @@ -233,8 +233,9 @@ void APRConverter::computeL(APR& aAPR,PixelData& input_image){ //////////////////////// fine_grained_timer.start_timer("offset image"); - //offset image by factor (this is required if there are zero areas in the background with uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) - // Warning both of these could result in over-flow (if your image is non zero, with a 'buffer' and has intensities up to uint16_t maximum value then set image_type = "", i.e. uncomment the following line) + // offset image by factor (this is required if there are zero areas in the background with + // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) + // Warning both of these could result in over-flow! if (std::is_same::value) { bspline_offset = 100; @@ -461,8 +462,9 @@ inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input ///////////////////////////////// /// Pipeline //////////////////////// - //offset image by factor (this is required if there are zero areas in the background with uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) - // Warning both of these could result in over-flow (if your image is non zero, with a 'buffer' and has intensities up to uint16_t maximum value then set image_type = "", i.e. uncomment the following line) + // offset image by factor (this is required if there are zero areas in the background with + // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) + // Warning both of these could result in over-flow! 
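A minimal host-side sketch of the failure mode this warning describes, assuming illustrative uint8_t/uint16_t image types (the concrete template arguments are elided in the hunk above); the helper name is hypothetical and not part of the patch:

#include <cstdint>
#include <limits>

// Hypothetical helper (not part of the patch): adding the constant bspline_offset keeps
// the smoothed background of an unsigned-integer image non-negative, but the shifted
// values wrap around if the brightest voxels already sit near the type's maximum.
template <typename ImageType>
bool offsetWouldOverflow(ImageType maxIntensity, float bspline_offset) {
    return static_cast<float>(maxIntensity) + bspline_offset >
           static_cast<float>(std::numeric_limits<ImageType>::max());
}

// offsetWouldOverflow<uint8_t>(200, 100)    -> true  (300 > 255)
// offsetWouldOverflow<uint16_t>(60000, 100) -> false (60100 <= 65535)
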
if (std::is_same::value) { bspline_offset = 100; diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp index 53c3d7cd..911013b1 100644 --- a/src/algorithm/ComputeGradient.hpp +++ b/src/algorithm/ComputeGradient.hpp @@ -65,6 +65,20 @@ class ComputeGradient { inline float impulse_resp_back(float k, float rho, float omg, float gamma, float c0); + typedef struct { + std::vector bc1_vec; + std::vector bc2_vec; + std::vector bc3_vec; + std::vector bc4_vec; + size_t k0; + float b1; + float b2; + float norm_factor; + size_t minLen; + } BsplineParams; + + BsplineParams prepareBSplineParams(size_t dimLen, float lambda, float tol, int maxFilterLen = -1); + }; template @@ -208,81 +222,45 @@ void ComputeGradient::get_smooth_bspline_3D(PixelData& input, float lambda) { inline float ComputeGradient::impulse_resp(float k,float rho,float omg){ // Impulse Response Function - return (pow(rho,(std::abs(k)))*sin((std::abs(k) + 1)*omg)) / sin(omg); + return (powf(rho,(std::abs(k)))*sinf((std::abs(k) + 1)*omg)) / sinf(omg); } inline float ComputeGradient::impulse_resp_back(float k,float rho,float omg,float gamma,float c0){ // Impulse Response Function (nominator eq. 4.8, denominator from eq. 4.7) - return c0*pow(rho,std::abs(k))*(cos(omg*std::abs(k)) + gamma*sin(omg*std::abs(k)))*(1.0/(pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2))); -} - - -/** - * floating point output -> no rounding or under-/overflow check - */ -template -std::enable_if_t::value, T> -round(float val, size_t &errCount) { - return val; -} - -/** - * integer output -> check for under-/overflow and round - */ -template -std::enable_if_t::value, T> -round(float val, size_t &errCount) { - - val = std::round(val); - - if(val < std::numeric_limits::min() || val > std::numeric_limits::max()) { - errCount++; - } - return val; + return c0*powf(rho,std::abs(k))*(cosf(omg*std::abs(k)) + gamma*sinf(omg*std::abs(k)))*(1.0/(powf((1 - 2.0*rho*cosf(omg) + pow(rho,2)),2))); } - - -template -void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float tol, int k0Len) { - // - // Bevan Cheeseman 2016 - // - // Recursive Filter Implimentation for Smoothing BSplines +ComputeGradient::BsplineParams ComputeGradient::prepareBSplineParams(size_t dimLen, float lambda, float tol, int maxFilterLen) { + // Recursive Filter Implementation for Smoothing BSplines // B-Spline Signal Processing: Part 11-Efficient Design and Applications, Unser 1993 - float xi = 1 - 96*lambda + 24*lambda*sqrt(3 + 144*lambda); // eq 4.6 - float rho = (24*lambda - 1 - sqrt(xi))/(24*lambda)*sqrt((1/xi)*(48*lambda + 24*lambda*sqrt(3 + 144*lambda))); // eq 4.5 - float omg = atan(sqrt((1/xi)*(144*lambda - 1))); // eq 4.6 + float xi = 1 - 96*lambda + 24*lambda * sqrtf(3 + 144*lambda); + float rho = (24*lambda - 1 - sqrtf(xi)) / (24*lambda) * sqrtf((1/xi) * (48*lambda + 24*lambda * sqrtf(3 + 144*lambda))); + float omg = atan(sqrtf((1/xi) * (144*lambda - 1))); + float c0 = (1 + powf(rho,2)) / (1-powf(rho,2)) * (1 - 2*rho * cosf(omg) + powf(rho,2)) / (1 + 2*rho*cosf(omg) + powf(rho,2)); + float gamma = (1 - powf(rho,2)) / (1+powf(rho,2)) * (1 / tan(omg)); - float c0 = (1+ pow(rho,2))/(1-pow(rho,2)) * (1 - 2*rho*cos(omg) + pow(rho,2))/(1 + 2*rho*cos(omg) + pow(rho,2)); // eq 4.8 - float gamma = (1-pow(rho,2))/(1+pow(rho,2)) * (1/tan(omg)); // eq 4.8 + const float b1 = 2*rho*cosf(omg); + const float b2 = -powf(rho,2.0); - const float b1 = 2*rho*cos(omg); - const float b2 = -pow(rho,2.0); + const size_t idealK0Len = ceil(std::abs(logf(tol) / logf(rho))); + const 
size_t k0 = maxFilterLen > 0 ? maxFilterLen : idealK0Len; + const size_t minLen = maxFilterLen > 0 ? maxFilterLen : std::min(idealK0Len, dimLen); - const size_t z_num = image.z_num; - const size_t x_num = image.x_num; - const size_t y_num = image.y_num; -// const size_t minLen = y_num; - const size_t minLen = k0Len > 0 ? k0Len : std::min((size_t)(ceil(std::abs(log(tol)/log(rho)))),y_num); - - const size_t k0 = k0Len > 0 ? k0Len : (size_t)(ceil(std::abs(log(tol)/log(rho)))); + const float norm_factor = powf((1 - 2.0*rho*cosf(omg) + powf(rho,2)),2); + std::cout << std::fixed << std::setprecision(9) << "CPU xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << std::endl; - const float norm_factor = pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2); -// std::cout << "CPUy xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl; // for boundaries - std::vector impulse_resp_vec_f(k0+3); //forward - for (size_t k = 0; k < (k0+3); ++k) { - impulse_resp_vec_f[k] = impulse_resp(k,rho,omg); + std::vector impulse_resp_vec_f(k0+1); //forward + for (size_t k = 0; k < (k0+1); ++k) { + impulse_resp_vec_f[k] = impulse_resp(k, rho, omg); } - - std::vector impulse_resp_vec_b(k0+3); //backward - for (size_t k = 0; k < (k0+3); ++k) { - impulse_resp_vec_b[k] = impulse_resp_back(k,rho,omg,gamma,c0); + std::vector impulse_resp_vec_b(k0+1); //backward + for (size_t k = 0; k < (k0+1); ++k) { + impulse_resp_vec_b[k] = impulse_resp_back(k, rho, omg, gamma, c0); } std::vector bc1_vec(k0, 0); //forward @@ -291,9 +269,8 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (size_t k = 0; k < k0; ++k) { bc1_vec[k] += impulse_resp_vec_f[k+1]; } - //assumes a constant value at the end of the filter when the required ghost is bigger then the image - for(size_t k = (minLen); k < k0;k++){ + for (size_t k = minLen; k < k0; k++) { bc1_vec[minLen-1] += bc1_vec[k]; } @@ -302,8 +279,7 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (size_t k = 0; k < k0; ++k) { bc2_vec[k] = impulse_resp_vec_f[k]; } - - for(size_t k = (minLen); k < k0;k++){ + for (size_t k = minLen; k < k0; k++) { bc2_vec[minLen-1] += bc2_vec[k]; } @@ -313,8 +289,7 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (size_t k = 0; k < (k0-1); ++k) { bc3_vec[k+1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k+2]; } - - for(size_t k = (minLen); k < k0;k++){ + for (size_t k = minLen; k < k0;k++) { bc3_vec[minLen-1] += bc3_vec[k]; } @@ -324,11 +299,64 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (size_t k = 1; k < k0; ++k) { bc4_vec[k] += 2*impulse_resp_vec_b[k]; } - - for(size_t k = (minLen); k < k0;k++){ + for (size_t k = minLen; k < k0; k++) { bc4_vec[minLen-1] += bc4_vec[k]; } + return BsplineParams { + std::move(bc1_vec), + std::move(bc2_vec), + std::move(bc3_vec), + std::move(bc4_vec), + k0, + b1, + b2, + norm_factor, + minLen + }; +} + +/** + * floating point output -> no rounding or under-/overflow check + */ +template +std::enable_if_t::value, T> +round(float val, size_t &errCount) { + return val; +} + +/** + * integer output -> check for under-/overflow and round + */ +template +std::enable_if_t::value, T> +round(float val, size_t &errCount) { + + val = std::round(val); + + if(val < 
std::numeric_limits::min() || val > std::numeric_limits::max()) { + errCount++; + std::cout << val << " " << (float)std::numeric_limits::min() << " " << (float)std::numeric_limits::max() << std::endl; + } + return val; +} + + + +template +void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float tol, int k0Len) { + // + // Bevan Cheeseman 2016 + // + // Recursive Filter Implementation for Smoothing BSplines + // B-Spline Signal Processing: Part 11-Efficient Design and Applications, Unser 1993 + + const size_t z_num = image.z_num; + const size_t x_num = image.x_num; + const size_t y_num = image.y_num; + + auto p = prepareBSplineParams(y_num, lambda, tol, k0Len); + APRTimer btime; btime.verbose_flag = false; @@ -350,37 +378,35 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float const size_t iynum = x * y_num; //boundary conditions - for (size_t k = 0; k < minLen; ++k) { - temp1 += bc1_vec[k]*image.mesh[jxnumynum + iynum + k]; - temp2 += bc2_vec[k]*image.mesh[jxnumynum + iynum + k]; + for (size_t k = 0; k < p.minLen; ++k) { + temp1 += p.bc1_vec[k]*image.mesh[jxnumynum + iynum + k]; + temp2 += p.bc2_vec[k]*image.mesh[jxnumynum + iynum + k]; } //boundary conditions - for (size_t k = 0; k < minLen; ++k) { - temp3 += bc3_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k]; - temp4 += bc4_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k]; + for (size_t k = 0; k < p.minLen; ++k) { + temp3 += p.bc3_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k]; + temp4 += p.bc4_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k]; } //initialize the sequence - image.mesh[jxnumynum + iynum + 0] = temp2; - image.mesh[jxnumynum + iynum + 1] = temp1; + image.mesh[jxnumynum + iynum + 0] = round(temp2, error_count); + image.mesh[jxnumynum + iynum + 1] = round(temp1, error_count); for (auto it = (image.mesh.begin()+jxnumynum + iynum + 2); it != (image.mesh.begin()+jxnumynum + iynum + y_num); ++it) { - float temp = temp1*b1 + temp2*b2 + *it; + + float temp = temp1*p.b1 + temp2*p.b2 + *it; *it = round(temp, error_count); temp2 = temp1; temp1 = temp; } - image.mesh[jxnumynum + iynum + y_num - 2] = round(temp3*norm_factor, error_count); - image.mesh[jxnumynum + iynum + y_num - 1] = round(temp4*norm_factor, error_count); - - + image.mesh[jxnumynum + iynum + y_num - 2] = round(temp3*p.norm_factor, error_count); + image.mesh[jxnumynum + iynum + y_num - 1] = round(temp4*p.norm_factor, error_count); } } btime.stop_timer(); - btime.start_timer("backward_loop_y"); #ifdef HAVE_OPENMP #pragma omp parallel for default(shared) reduction(+: error_count) @@ -391,13 +417,12 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (int64_t i = x_num - 1; i >= 0; --i) { const size_t iynum = i * y_num; - float temp2 = image.mesh[jxnumynum + iynum + y_num - 1]/norm_factor; - float temp1 = image.mesh[jxnumynum + iynum + y_num - 2]/norm_factor; + float temp2 = image.mesh[jxnumynum + iynum + y_num - 1]/p.norm_factor; + float temp1 = image.mesh[jxnumynum + iynum + y_num - 2]/p.norm_factor; for (auto it = (image.mesh.begin()+jxnumynum + iynum + y_num-3); it != (image.mesh.begin()+jxnumynum + iynum-1); --it) { - float temp = temp1*b1 + temp2*b2 + *it; - - *it = round(temp*norm_factor, error_count); + float temp = temp1*p.b1 + temp2*p.b2 + *it; + *it = round(temp*p.norm_factor, error_count); temp2 = temp1; temp1 = temp; @@ -417,90 +442,13 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float // // Bevan Cheeseman 2016 // - // Recursive Filter 
Implimentation for Smoothing BSplines - - float xi = 1 - 96*lambda + 24*lambda*sqrt(3 + 144*lambda); - float rho = (24*lambda - 1 - sqrt(xi))/(24*lambda)*sqrt((1/xi)*(48*lambda + 24*lambda*sqrt(3 + 144*lambda))); - float omg = atan(sqrt((1/xi)*(144*lambda - 1))); - float c0 = (1+ pow(rho,2))/(1-pow(rho,2)) * (1 - 2*rho*cos(omg) + pow(rho,2))/(1 + 2*rho*cos(omg) + pow(rho,2)); - float gamma = (1-pow(rho,2))/(1+pow(rho,2)) * (1/tan(omg)); - - const float b1 = 2*rho*cos(omg); - const float b2 = -pow(rho,2.0); + // Recursive Filter Implementation for Smoothing BSplines const size_t z_num = image.z_num; const size_t x_num = image.x_num; const size_t y_num = image.y_num; - //const size_t minLen = std::min(z_num, std::min(x_num, y_num)); - //const size_t minLen = z_num; - - const size_t minLen = k0Len > 0 ? k0Len : std::min((size_t)(ceil(std::abs(log(tol)/log(rho)))), z_num); - const size_t k0 = k0Len > 0 ? k0Len :(size_t)(ceil(std::abs(log(tol)/log(rho)))); - - const float norm_factor = pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2); -// std::cout << "CPUz xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl; - - ////////////////////////////////////////////////////////////// - // - // Setting up boundary conditions - // - ////////////////////////////////////////////////////////////// - - std::vector impulse_resp_vec_f(k0+3); //forward - for (size_t k = 0; k < (k0+3);k++){ - impulse_resp_vec_f[k] = impulse_resp(k,rho,omg); - } - - std::vector impulse_resp_vec_b(k0+3); //backward - for (size_t k = 0; k < (k0+3);k++){ - impulse_resp_vec_b[k] = impulse_resp_back(k,rho,omg,gamma,c0); - } - - std::vector bc1_vec(k0, 0); //forward - //y(1) init - bc1_vec[1] = impulse_resp_vec_f[0]; - for(size_t k = 0; k < k0; k++){ - bc1_vec[k] += impulse_resp_vec_f[k+1]; - } - - //assumes a constant value at the end of the filter when the required ghost is bigger then the image - for(size_t k = (minLen); k < k0;k++){ - bc1_vec[minLen-1] += bc1_vec[k]; - } - - - std::vector bc2_vec(k0, 0); //backward - //y(0) init - for(size_t k = 0; k < k0; k++){ - bc2_vec[k] = impulse_resp_vec_f[k]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc2_vec[minLen-1] += bc2_vec[k]; - } - - std::vector bc3_vec(k0, 0); //forward - //y(N-1) init - bc3_vec[0] = impulse_resp_vec_b[1]; - for(size_t k = 0; k < (k0-1); k++){ - bc3_vec[k+1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k+2]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc3_vec[minLen-1] += bc3_vec[k]; - } - - std::vector bc4_vec(k0, 0); //backward - //y(N) init - bc4_vec[0] = impulse_resp_vec_b[0]; - for(size_t k = 1; k < k0; k++){ - bc4_vec[k] += 2*impulse_resp_vec_b[k]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc4_vec[minLen-1] += bc4_vec[k]; - } + auto p = prepareBSplineParams(z_num, lambda, tol, k0Len); //forwards direction std::vector temp_vec1(y_num,0); @@ -523,18 +471,18 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float size_t iynum = i * y_num; - for (size_t j = 0; j < minLen; ++j) { + for (size_t j = 0; j < p.minLen; ++j) { size_t index = j * x_num * y_num + iynum; #ifdef HAVE_OPENMP #pragma omp simd #endif for (int64_t k = y_num - 1; k >= 0; k--) { //forwards boundary condition - temp_vec1[k] += bc1_vec[j] * image.mesh[index + k]; - temp_vec2[k] += bc2_vec[j] * image.mesh[index + k]; + temp_vec1[k] += p.bc1_vec[j] * image.mesh[index + k]; + temp_vec2[k] += p.bc2_vec[j] * image.mesh[index + k]; //backwards boundary condition 
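The Y-, X- and Z-direction filters all run the same two-pass recursion along their own axis, seeded by the bc1..bc4 boundary sums accumulated in loops like the one below. A minimal standalone 1D sketch of that recursion (not code from the patch; Params is assumed to look like the BsplineParams struct introduced in ComputeGradient.hpp above):

#include <cstddef>
#include <vector>

// Standalone 1D illustration of the causal + anti-causal recursion used by
// bspline_filt_rec_{y,x,z}; 'p' carries bc1_vec..bc4_vec, b1, b2, norm_factor, minLen.
template <typename Params>
void bsplineFilt1D(std::vector<float> &data, const Params &p) {
    const size_t n = data.size();
    if (n < 3) return;                               // too short for the recursion
    float t1 = 0, t2 = 0, t3 = 0, t4 = 0;
    for (size_t k = 0; k < p.minLen; ++k) {          // boundary conditions
        t1 += p.bc1_vec[k] * data[k];                //   forward  y(1)
        t2 += p.bc2_vec[k] * data[k];                //   forward  y(0)
        t3 += p.bc3_vec[k] * data[n - 1 - k];        //   backward y(N-1)
        t4 += p.bc4_vec[k] * data[n - 1 - k];        //   backward y(N)
    }
    data[0] = t2;  data[1] = t1;
    for (size_t i = 2; i < n; ++i) {                 // causal (forward) pass
        float t = t1 * p.b1 + t2 * p.b2 + data[i];
        data[i] = t;  t2 = t1;  t1 = t;
    }
    data[n - 1] = t4 * p.norm_factor;                // seed the anti-causal pass
    data[n - 2] = t3 * p.norm_factor;
    t2 = t4;  t1 = t3;
    for (size_t i = n - 2; i-- > 0; ) {              // anti-causal (backward) pass
        float t = t1 * p.b1 + t2 * p.b2 + data[i];
        data[i] = t * p.norm_factor;  t2 = t1;  t1 = t;
    }
}
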
- temp_vec3[k] += bc3_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k]; - temp_vec4[k] += bc4_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k]; + temp_vec3[k] += p.bc3_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k]; + temp_vec4[k] += p.bc4_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k]; } } @@ -557,7 +505,7 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float #pragma omp simd #endif for (size_t k = 0; k < y_num; ++k) { - temp_vec2[k] = round(1.0f*image.mesh[index + k] + b1*temp_vec1[k] + b2*temp_vec2[k], error_count); + temp_vec2[k] = round(image.mesh[index + k] + p.b1*temp_vec1[k] + p.b2*temp_vec2[k], error_count); } std::swap(temp_vec1, temp_vec2); @@ -568,12 +516,12 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float //initialization for (int64_t k = y_num - 1; k >= 0; --k) { //y(N) - image.mesh[(z_num - 1)*x_num*y_num + iynum + k] = round(temp_vec4[k]*norm_factor, error_count); + image.mesh[(z_num - 1)*x_num*y_num + iynum + k] = round(temp_vec4[k]*p.norm_factor, error_count); } for (int64_t k = y_num - 1; k >= 0; --k) { //y(N-1) - image.mesh[(z_num - 2)*x_num*y_num + iynum + k] = round(temp_vec3[k]*norm_factor, error_count); + image.mesh[(z_num - 2)*x_num*y_num + iynum + k] = round(temp_vec3[k]*p.norm_factor, error_count); } //main loop @@ -584,8 +532,8 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float #pragma omp simd #endif for (int64_t k = y_num - 1; k >= 0; --k) { - float temp = (image.mesh[index + k] + b1*temp_vec3[k] + b2*temp_vec4[k]); - image.mesh[index + k] = round(temp*norm_factor, error_count); + float temp = (image.mesh[index + k] + p.b1*temp_vec3[k] + p.b2*temp_vec4[k]); + image.mesh[index + k] = round(temp*p.norm_factor, error_count); temp_vec4[k] = temp_vec3[k]; temp_vec3[k] = temp; } @@ -605,85 +553,11 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& image,float lambda,float // // Recursive Filter Implimentation for Smoothing BSplines - float xi = 1 - 96*lambda + 24*lambda*sqrt(3 + 144*lambda); - float rho = (24*lambda - 1 - sqrt(xi))/(24*lambda)*sqrt((1/xi)*(48*lambda + 24*lambda*sqrt(3 + 144*lambda))); - float omg = atan(sqrt((1/xi)*(144*lambda - 1))); - float c0 = (1+ pow(rho,2))/(1-pow(rho,2)) * (1 - 2*rho*cos(omg) + pow(rho,2))/(1 + 2*rho*cos(omg) + pow(rho,2)); - float gamma = (1-pow(rho,2))/(1+pow(rho,2)) * (1/tan(omg)); - - const float b1 = 2*rho*cos(omg); - const float b2 = -pow(rho,2.0); - const size_t z_num = image.z_num; const size_t x_num = image.x_num; const size_t y_num = image.y_num; -// const size_t minLen = x_num; - const size_t minLen = k0Len > 0 ? k0Len : std::min((size_t)(ceil(std::abs(log(tol)/log(rho)))), x_num); - const size_t k0 = k0Len > 0 ? 
k0Len : ((size_t)(ceil(std::abs(log(tol)/log(rho))))); - const float norm_factor = pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2); - -// std::cout << "CPUx xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl; - - ////////////////////////////////////////////////////////////// - // - // Setting up boundary conditions - // - ////////////////////////////////////////////////////////////// - - std::vector impulse_resp_vec_f(k0+3); //forward - for (size_t k = 0; k < (k0+3);k++){ - impulse_resp_vec_f[k] = impulse_resp(k,rho,omg); - } - - std::vector impulse_resp_vec_b(k0+3); //backward - for (size_t k = 0; k < (k0+3);k++){ - impulse_resp_vec_b[k] = impulse_resp_back(k,rho,omg,gamma,c0); - } - - std::vector bc1_vec(k0, 0); //forward - //y(1) init - bc1_vec[1] = impulse_resp_vec_f[0]; - for(size_t k = 0; k < k0;k++){ - bc1_vec[k] += impulse_resp_vec_f[k+1]; - } - - //assumes a constant value at the end of the filter when the required ghost is bigger then the image - for(size_t k = (minLen); k < k0;k++){ - bc1_vec[minLen-1] += bc1_vec[k]; - } - - std::vector bc2_vec(k0, 0); //backward - //y(0) init - for(size_t k = 0; k < k0;k++){ - bc2_vec[k] = impulse_resp_vec_f[k]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc2_vec[minLen-1] += bc2_vec[k]; - } - - std::vector bc3_vec(k0, 0); //forward - //y(N-1) init - bc3_vec[0] = impulse_resp_vec_b[1]; - for(size_t k = 0; k < (k0-1);k++){ - bc3_vec[k+1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k+2]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc3_vec[minLen-1] += bc3_vec[k]; - } - - std::vector bc4_vec(k0, 0); //backward - //y(N) init - bc4_vec[0] = impulse_resp_vec_b[0]; - for(size_t k = 1; k < k0;k++){ - bc4_vec[k] += 2*impulse_resp_vec_b[k]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc4_vec[minLen-1] += bc4_vec[k]; - } + auto p = prepareBSplineParams(x_num, lambda, tol, k0Len); //forwards direction @@ -705,15 +579,15 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& image,float lambda,float size_t jxnumynum = j * y_num * x_num; - for (size_t i = 0; i < minLen; ++i) { + for (size_t i = 0; i < p.minLen; ++i) { for (size_t k = 0; k < y_num; ++k) { //forwards boundary condition - temp_vec1[k] += bc1_vec[i]*image.mesh[jxnumynum + i*y_num + k]; - temp_vec2[k] += bc2_vec[i]*image.mesh[jxnumynum + i*y_num + k]; + temp_vec1[k] += p.bc1_vec[i]*image.mesh[jxnumynum + i*y_num + k]; + temp_vec2[k] += p.bc2_vec[i]*image.mesh[jxnumynum + i*y_num + k]; //backwards boundary condition - temp_vec3[k] += bc3_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k]; - temp_vec4[k] += bc4_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k]; + temp_vec3[k] += p.bc3_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k]; + temp_vec4[k] += p.bc4_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k]; } } @@ -735,7 +609,7 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& image,float lambda,float #pragma omp simd #endif for (int64_t k = y_num - 1; k >= 0; k--) { - temp_vec2[k] = round(image.mesh[index + k] + b1*temp_vec1[k] + b2*temp_vec2[k], error_count); + temp_vec2[k] = round(image.mesh[index + k] + p.b1*temp_vec1[k] + p.b2*temp_vec2[k], error_count); } std::swap(temp_vec1, temp_vec2); @@ -748,12 +622,12 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& image,float lambda,float //initialization for (int64_t k = y_num - 1; k >= 0; --k) { //y(N) - image.mesh[jxnumynum + (x_num - 1)*y_num + k] = round(temp_vec4[k]*norm_factor, 
error_count); + image.mesh[jxnumynum + (x_num - 1)*y_num + k] = round(temp_vec4[k]*p.norm_factor, error_count); } for (int64_t k = y_num - 1; k >= 0; --k) { //y(N-1) - image.mesh[jxnumynum + (x_num - 2)*y_num + k] = round(temp_vec3[k]*norm_factor, error_count); + image.mesh[jxnumynum + (x_num - 2)*y_num + k] = round(temp_vec3[k]*p.norm_factor, error_count); } //main loop @@ -764,8 +638,8 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& image,float lambda,float #pragma omp simd #endif for (int64_t k = y_num - 1; k >= 0; k--){ - float temp = (image.mesh[index + k] + b1*temp_vec3[ k]+ b2*temp_vec4[ k]); - image.mesh[index + k] = round(temp*norm_factor, error_count); + float temp = (image.mesh[index + k] + p.b1*temp_vec3[ k]+ p.b2*temp_vec4[ k]); + image.mesh[index + k] = round(temp*p.norm_factor, error_count); temp_vec4[k] = temp_vec3[k]; temp_vec3[k] = temp; } diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index cf636d5f..982e649c 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -1,28 +1,26 @@ -#include "ComputeGradientCuda.hpp" -#include "APRParameters.hpp" #include -#include +#include +#include +#include #include -#include +#include "ComputeGradientCuda.hpp" +#include "APRParameters.hpp" #include "data_structures/Mesh/PixelData.hpp" -#include "dsGradient.cuh" - -#include "invBspline.cuh" -#include -#include -#include "bsplineXdir.cuh" -#include "bsplineYdir.cuh" -#include "bsplineZdir.cuh" #include "data_structures/Mesh/downsample.cuh" #include "algorithm/ComputePullingScheme.cuh" -#include "algorithm/LocalIntensityScaleCuda.h" #include "algorithm/LocalIntensityScale.cuh" #include "misc/CudaTools.cuh" #include "misc/CudaMemory.cuh" -#include -#include + +#include "dsGradient.cuh" +#include "invBspline.cuh" +#include "bsplineXdir.cuh" +#include "bsplineYdir.cuh" +#include "bsplineZdir.cuh" + + namespace { typedef struct { @@ -38,45 +36,42 @@ namespace { float impulse_resp(float k, float rho, float omg) { // Impulse Response Function - return (pow(rho, (std::abs(k))) * sin((std::abs(k) + 1) * omg)) / sin(omg); + return (powf(rho, (std::abs(k))) * sinf((std::abs(k) + 1) * omg)) / sinf(omg); } float impulse_resp_back(float k, float rho, float omg, float gamma, float c0) { // Impulse Response Function (nominator eq. 4.8, denominator from eq. 
4.7) - return c0 * pow(rho, std::abs(k)) * (cos(omg * std::abs(k)) + gamma * sin(omg * std::abs(k))) * - (1.0 / (pow((1 - 2.0 * rho * cos(omg) + pow(rho, 2)), 2))); + return c0 * powf(rho, std::abs(k)) * (cosf(omg * std::abs(k)) + gamma * sinf(omg * std::abs(k))) * + (1.0 / (powf((1 - 2.0 * rho * cosf(omg) + powf(rho, 2)), 2))); } - template - BsplineParams prepareBsplineStuff(const PixelData &image, float lambda, float tol, int maxFilterLen = -1) { + BsplineParams prepareBsplineStuff(size_t dimLen, float lambda, float tol, int maxFilterLen = -1) { // Recursive Filter Implimentation for Smoothing BSplines // B-Spline Signal Processing: Part II - Efficient Design and Applications, Unser 1993 - float xi = 1 - 96 * lambda + 24 * lambda * sqrt(3 + 144 * lambda); // eq 4.6 - float rho = (24 * lambda - 1 - sqrt(xi)) / (24 * lambda) * - sqrt((1 / xi) * (48 * lambda + 24 * lambda * sqrt(3 + 144 * lambda))); // eq 4.5 - float omg = atan(sqrt((1 / xi) * (144 * lambda - 1))); // eq 4.6 + float xi = 1 - 96 * lambda + 24 * lambda * sqrtf(3 + 144 * lambda); // eq 4.6 + float rho = (24 * lambda - 1 - sqrtf(xi)) / (24 * lambda) * + sqrtf((1 / xi) * (48 * lambda + 24 * lambda * sqrtf(3 + 144 * lambda))); // eq 4.5 - float c0 = (1 + pow(rho, 2)) / (1 - pow(rho, 2)) * (1 - 2 * rho * cos(omg) + pow(rho, 2)) / - (1 + 2 * rho * cos(omg) + pow(rho, 2)); // eq 4.8 - float gamma = (1 - pow(rho, 2)) / (1 + pow(rho, 2)) * (1 / tan(omg)); // eq 4.8 + float omg = atan(sqrtf((1 / xi) * (144 * lambda - 1))); // eq 4.6 - const float b1 = 2 * rho * cos(omg); - const float b2 = -pow(rho, 2.0); + float c0 = (1 + powf(rho, 2)) / (1 - powf(rho, 2)) * (1 - 2 * rho * cosf(omg) + powf(rho, 2)) / + (1 + 2 * rho * cosf(omg) + powf(rho, 2)); // eq 4.8 + float gamma = (1 - powf(rho, 2)) / (1 + powf(rho, 2)) * (1 / tan(omg)); // eq 4.8 - const size_t idealK0Len = ceil(std::abs(log(tol) / log(rho))); - const size_t minDimension = std::min(image.z_num, std::min(image.x_num, image.y_num)); - const size_t k0 = maxFilterLen > 0 ? maxFilterLen : std::min(idealK0Len, minDimension); + const float b1 = 2 * rho * cosf(omg); + const float b2 = -powf(rho, 2.0); - const float norm_factor = pow((1 - 2.0 * rho * cos(omg) + pow(rho, 2)), 2); - std::cout << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 - << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl; + const size_t idealK0Len = ceil(std::abs(logf(tol) / logf(rho))); + const size_t k0 = maxFilterLen > 0 ? maxFilterLen : idealK0Len; + const size_t minLen = maxFilterLen > 0 ? 
maxFilterLen : std::min(idealK0Len, dimLen); - // ------- Calculating boundary conditions + const float norm_factor = powf((1 - 2.0 * rho * cosf(omg) + powf(rho, 2)), 2); + + //std::cout << std::fixed << std::setprecision(9) << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 + // << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << std::endl; - // forward boundaries - std::vector impulse_resp_vec_f(k0 + 1); - for (size_t k = 0; k < impulse_resp_vec_f.size(); ++k) impulse_resp_vec_f[k] = impulse_resp(k, rho, omg); + // ------- Calculating boundary conditions size_t boundaryLen = sizeof(float) * k0; PinnedMemoryUniquePtr bc1{(float*)getPinnedMemory(boundaryLen)}; @@ -84,11 +79,19 @@ namespace { PinnedMemoryUniquePtr bc3{(float*)getPinnedMemory(boundaryLen)}; PinnedMemoryUniquePtr bc4{(float*)getPinnedMemory(boundaryLen)}; + // forward boundaries + std::vector impulse_resp_vec_f(k0 + 1); + for (size_t k = 0; k < impulse_resp_vec_f.size(); ++k) impulse_resp_vec_f[k] = impulse_resp(k, rho, omg); + //y(0) init for (size_t k = 0; k < k0; ++k) bc1[k] = impulse_resp_vec_f[k]; + for (size_t k = minLen; k < k0; ++k) bc1[minLen - 1] += bc1[k]; + //y(1) init + for (size_t k = 0; k < k0; ++k) bc2[k] = 0; bc2[1] = impulse_resp_vec_f[0]; for (size_t k = 0; k < k0; ++k) bc2[k] += impulse_resp_vec_f[k + 1]; + for (size_t k = minLen; k < k0; ++k) bc2[minLen - 1] += bc2[k]; // backward boundaries std::vector impulse_resp_vec_b(k0 + 1); @@ -96,11 +99,16 @@ namespace { impulse_resp_vec_b[k] = impulse_resp_back(k, rho, omg, gamma, c0); //y(N-1) init + for (size_t k = 0; k < k0; ++k) bc3[k] = 0; bc3[0] = impulse_resp_vec_b[1]; for (size_t k = 0; k < (k0 - 1); ++k) bc3[k + 1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k + 2]; + for (size_t k = minLen; k < k0; ++k) bc3[minLen - 1] += bc3[k]; + //y(N) init + for (size_t k = 0; k < k0; ++k) bc4[k] = 0; bc4[0] = impulse_resp_vec_b[0]; for (size_t k = 1; k < k0; ++k) bc4[k] += 2 * impulse_resp_vec_b[k]; + for (size_t k = minLen; k < k0; ++k) bc4[minLen - 1] += bc4[k]; return BsplineParams{ std::move(bc1), @@ -166,9 +174,9 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc runThresholdImg(cudaImage, image.x_num, image.y_num, image.z_num, par.Ip_th + bspline_offset, aStream); - runBsplineYdir(cudaImage, image.x_num, image.y_num, image.z_num, bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, boundary, aStream); - runBsplineXdir(cudaImage, image.x_num, image.y_num, image.z_num, bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); - runBsplineZdir(cudaImage, image.x_num, image.y_num, image.z_num, bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); + runBsplineYdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, boundary, aStream); + runBsplineXdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); + runBsplineZdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); runKernelGradient(cudaImage, cudaGrad, image.x_num, image.y_num, image.z_num, local_scale_temp.x_num, local_scale_temp.y_num, par.dx, par.dy, par.dz, aStream); @@ -249,7 +257,9 @@ public: iParameters(parameters), iBsplineOffset(bspline_offset), iMaxLevel(maxLevel), - params(prepareBsplineStuff(image, parameters.lambda, tolerance)), + // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. 
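A sketch of the per-dimension setup this TODO calls for, mirroring what cudaFilterBsplineFull() does further down in this patch; the names come from the surrounding code, but the exact wiring into GpuProcessingTask is an assumption:

// Hypothetical fix sketch for the TODO above: one BsplineParams per filtered axis,
// as cudaFilterBsplineFull() below already does for the standalone filter path.
BsplineParams pY = prepareBsplineStuff((size_t)image.y_num, parameters.lambda, tolerance);
BsplineParams pX = prepareBsplineStuff((size_t)image.x_num, parameters.lambda, tolerance);
BsplineParams pZ = prepareBsplineStuff((size_t)image.z_num, parameters.lambda, tolerance);
// each parameter set's bc1..bc4 (length pY.k0 / pX.k0 / pZ.k0) would then be copied to
// the device and passed to the matching runBsplineYdir / runBsplineXdir / runBsplineZdir call
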
+ // Should be fixed when other parts of pipeline are ready. + params(prepareBsplineStuff((size_t)image.x_num, parameters.lambda, tolerance)), bc1(params.bc1.get(), params.k0, iStream), bc2(params.bc2.get(), params.k0, iStream), bc3(params.bc3.get(), params.k0, iStream), @@ -336,29 +346,44 @@ template class GpuProcessingTask; // explicit instantiation of handled types template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); +template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); +template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); +template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); + template void cudaFilterBsplineFull(PixelData &input, float lambda, float tolerance, TypeOfRecBsplineFlags flags, int maxFilterLen) { cudaStream_t aStream = 0; - BsplineParams p = prepareBsplineStuff(input, lambda, tolerance, maxFilterLen); - ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); - ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); - ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); - ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); + ScopedCudaMemHandler, D2H | H2D> cudaInput(input); - APRTimer timer(true); + APRTimer timer(false); timer.start_timer("GpuDeviceTimeFull"); if (flags & BSPLINE_Y_DIR) { + BsplineParams p = prepareBsplineStuff((size_t)input.y_num, lambda, tolerance, maxFilterLen); + ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); + ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); + ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); + ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * input.x_num * input.z_num; ScopedCudaMemHandler boundary(nullptr, boundaryLen); // allocate memory on device - runBsplineYdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, boundary.get(), aStream); + runBsplineYdir(cudaInput.get(), input.getDimension(), bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, boundary.get(), aStream); } if (flags & BSPLINE_X_DIR) { - runBsplineXdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream); + BsplineParams p = prepareBsplineStuff((size_t)input.x_num, lambda, tolerance, maxFilterLen); + ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); + ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); + ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); + ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); + runBsplineXdir(cudaInput.get(), input.getDimension(), bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream); } if (flags & BSPLINE_Z_DIR) { - runBsplineZdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream); + BsplineParams p = prepareBsplineStuff((size_t)input.z_num, lambda, tolerance, maxFilterLen); + ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); + ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); + ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); + ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); + runBsplineZdir(cudaInput.get(), input.getDimension(), bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream); } timer.stop_timer(); } @@ -404,7 +429,9 @@ void getGradient(PixelData &image, PixelData &grad_temp, Pixel ScopedCudaMemHandler, D2H> 
cudalocal_scale_temp2(local_scale_temp2); float tolerance = 0.0001; - BsplineParams p = prepareBsplineStuff(image, par.lambda, tolerance); + // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. + // Should be fixed when other parts of pipeline are ready. + BsplineParams p = prepareBsplineStuff(image.x_num, par.lambda, tolerance); ScopedCudaMemHandler bc1 (p.bc1.get(), p.k0); ScopedCudaMemHandler bc2 (p.bc2.get(), p.k0); diff --git a/src/algorithm/bsplineXdir.cuh b/src/algorithm/bsplineXdir.cuh index be0a5f78..6ee3c755 100644 --- a/src/algorithm/bsplineXdir.cuh +++ b/src/algorithm/bsplineXdir.cuh @@ -5,9 +5,10 @@ #include #include #include +#include "cudaMisc.cuh" /** - * Runs bspline recursive filter in X direction. Each processed 2D patch consist of number of workes + * Runs bspline recursive filter in X direction. Each processed 2D patch consist of number of workers * (distributed in Y direction) and each of them is handling the whole row in X-dir. * Next patches are build on a top of first (like patch1 in example below) and they cover * whole y-dimension. Such a setup should be run for every plane in z-direction. @@ -59,22 +60,24 @@ * @param norm_factor - filter norm factor */ template -__global__ void bsplineXdir(T *image, size_t x_num, size_t y_num, +__global__ void bsplineXdir(T *image, PixelDataDim dim, const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, - float b1, float b2, float norm_factor) { + float b1, float b2, float norm_factor, bool *error) { const int yDirOffset = blockIdx.y * blockDim.y + threadIdx.y; - const size_t zDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * x_num * y_num; - const size_t nextElementXdirOffset = y_num; - const size_t dirLen = x_num; + const size_t zDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * dim.x * dim.y; + const size_t nextElementXdirOffset = dim.y; + const size_t dirLen = dim.x; + const size_t minLen = min(dirLen, k0); - if (yDirOffset < y_num) { + if (yDirOffset < dim.y) { float temp1 = 0; float temp2 = 0; float temp3 = 0; float temp4 = 0; + // calculate boundary values - for (int k = 0; k < k0; ++k) { + for (int k = 0; k < minLen; ++k) { T val = image[zDirOffset + k * nextElementXdirOffset + yDirOffset]; temp1 += bc1[k] * val; temp2 += bc2[k] * val; @@ -83,18 +86,20 @@ __global__ void bsplineXdir(T *image, size_t x_num, size_t y_num, temp4 += bc4[k] * val; } + size_t errorCnt = 0; + // set boundary values in two first and two last points processed direction - image[zDirOffset + 0 * nextElementXdirOffset + yDirOffset] = temp1; - image[zDirOffset + 1 * nextElementXdirOffset + yDirOffset] = temp2; - image[zDirOffset + (dirLen - 2) * nextElementXdirOffset + yDirOffset] = temp3 * norm_factor; - image[zDirOffset + (dirLen - 1) * nextElementXdirOffset + yDirOffset] = temp4 * norm_factor; + image[zDirOffset + 0 * nextElementXdirOffset + yDirOffset] = round(temp1, errorCnt); + image[zDirOffset + 1 * nextElementXdirOffset + yDirOffset] = round(temp2, errorCnt); + image[zDirOffset + (dirLen - 2) * nextElementXdirOffset + yDirOffset] = round(temp3 * norm_factor, errorCnt); + image[zDirOffset + (dirLen - 1) * nextElementXdirOffset + yDirOffset] = round(temp4 * norm_factor, errorCnt); // Causal Filter loop int64_t offset = zDirOffset + 2 * nextElementXdirOffset + yDirOffset; int64_t offsetLimit = zDirOffset + (dirLen - 2) * nextElementXdirOffset; while (offset < offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - 
const float temp = temp1 * b2 + temp2 * b1 + image[offset]; + const float temp = round(image[offset] + b1 * temp2 + b2 * temp1, errorCnt); image[offset] = temp; temp1 = temp2; temp2 = temp; @@ -107,13 +112,15 @@ __global__ void bsplineXdir(T *image, size_t x_num, size_t y_num, offsetLimit = zDirOffset; while (offset >= offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = temp3 * b1 + temp4 * b2 + image[offset]; - image[offset] = temp * norm_factor; + const float temp = image[offset] + b1 * temp3 + b2 * temp4; + image[offset] = round(temp * norm_factor, errorCnt); temp4 = temp3; temp3 = temp; offset -= nextElementXdirOffset; } + + if (errorCnt > 0) *error = true; } } @@ -121,15 +128,26 @@ __global__ void bsplineXdir(T *image, size_t x_num, size_t y_num, * Function for launching a kernel */ template -void runBsplineXdir(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, +void runBsplineXdir(T *cudaImage, PixelDataDim dim, const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, float b1, float b2, float norm_factor, cudaStream_t aStream) { constexpr int numOfWorkersYdir = 128; dim3 threadsPerBlockX(1, numOfWorkersYdir, 1); dim3 numBlocksX(1, - (y_num + threadsPerBlockX.y - 1) / threadsPerBlockX.y, - (z_num + threadsPerBlockX.z - 1) / threadsPerBlockX.z); - bsplineXdir <<>> (cudaImage, x_num, y_num, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor); + (dim.y + threadsPerBlockX.y - 1) / threadsPerBlockX.y, + (dim.z + threadsPerBlockX.z - 1) / threadsPerBlockX.z); + // In case of error this will be set to true by one of the kernels (CUDA does not guarantee which kernel will set global variable if more then one kernel + // access it but this is enough for us to know that somewhere in one on more kernels overflow was detected. + bool isErrorDetected = false; + { + ScopedCudaMemHandler error(&isErrorDetected, 1); + bsplineXdir <<>>(cudaImage, dim, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor, error.get()); + } + + if (isErrorDetected) { + throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineXdir - " + "try squashing the input image to a narrower range or use APRConverter"); + } } #endif diff --git a/src/algorithm/bsplineYdir.cuh b/src/algorithm/bsplineYdir.cuh index b9dc2f25..a1026704 100644 --- a/src/algorithm/bsplineYdir.cuh +++ b/src/algorithm/bsplineYdir.cuh @@ -5,12 +5,14 @@ #include #include #include +#include "cudaMisc.cuh" + /** * Runs bspline recursive filter in Y direction - divided into two phases: * 1. calculate boundary conditions * 2. run recursive filter as a set of 2D patches: - * Each processed 2D patch consist of number of workes + * Each processed 2D patch consist of number of workers * (distributed in Y direction) and each of them is handling the whole row in Y-dir. * Next patches are build on next to it in the x-dir to cover whole x * z domain. 
* @@ -57,22 +59,25 @@ template -__global__ void bsplineYdirBoundary(T *image, size_t x_num, size_t y_num, size_t z_num, +__global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, const float *bc1_vec, const float *bc2_vec, const float *bc3_vec, const float *bc4_vec, - size_t k0, float *boundary) { + size_t k0, float norm_factor, float *boundary, bool *error) { const int xzIndexOfWorker = (blockIdx.x * blockDim.x) + threadIdx.x; const int xzIndexOfBlock = (blockIdx.x * blockDim.x); const int numOfWorkers = blockDim.x; const int currentWorkerId = threadIdx.x; - const size_t workersOffset = xzIndexOfBlock * y_num; // per each (x,z) coordinate we have y-row + const size_t workersOffset = xzIndexOfBlock * dim.y; // per each (x,z) coordinate we have y-row + + const int64_t maxXZoffset = dim.x * dim.z; - const int64_t maxXZoffset = x_num * z_num; + const size_t dirLen = dim.y; + const size_t minLen = min(dirLen, k0); extern __shared__ float sharedMem[]; float *bc1_vec2 = &sharedMem[0]; float *bc2_vec2 = &bc1_vec2[k0]; - T *cache = (T*)&bc2_vec2[k0]; + float *cache = (float*)&bc2_vec2[k0]; // Read from global mem to cache for (int i = currentWorkerId; i < k0 * numOfWorkers; i += numOfWorkers) { @@ -83,18 +88,18 @@ __global__ void bsplineYdirBoundary(T *image, size_t x_num, size_t y_num, size_t int offs = i % k0; int work = i / k0; if (work + xzIndexOfBlock < maxXZoffset) { - cache[work * k0 + offs] = image[workersOffset + y_num * work + offs]; + cache[work * k0 + offs] = image[workersOffset + dim.y * work + offs]; } } __syncthreads(); //forwards direction - if (xzIndexOfWorker < x_num * z_num) { + if (xzIndexOfWorker < dim.x * dim.z) { float temp1 = 0; float temp2 = 0; - for (size_t k = 0; k < k0; ++k) { - temp1 += bc1_vec2[k] * cache[currentWorkerId * k0 + k]; - temp2 += bc2_vec2[k] * cache[currentWorkerId * k0 + k]; + for (size_t k = 0; k < minLen; ++k) { + temp1 += bc1_vec2[k] * (T)cache[currentWorkerId * k0 + k]; + temp2 += bc2_vec2[k] * (T)cache[currentWorkerId * k0 + k]; } boundary[xzIndexOfWorker*4 + 0] = temp1; boundary[xzIndexOfWorker*4 + 1] = temp2; @@ -111,49 +116,54 @@ __global__ void bsplineYdirBoundary(T *image, size_t x_num, size_t y_num, size_t int offs = i % k0; int work = i / k0; if (work + xzIndexOfBlock < maxXZoffset) { - cache[work * k0 + offs] = image[workersOffset + y_num * work + y_num - 1 - offs]; + cache[work * k0 + offs] = image[workersOffset + dim.y * work + dim.y - 1 - offs]; } } __syncthreads(); + size_t errorCnt = 0; + //forwards direction - if (xzIndexOfWorker < x_num * z_num) { + if (xzIndexOfWorker < dim.x * dim.z) { float temp3 = 0; float temp4 = 0; - for (size_t k = 0; k < k0; ++k) { - temp3 += bc1_vec2[k] * cache[currentWorkerId * k0 + k]; - temp4 += bc2_vec2[k] * cache[currentWorkerId * k0 + k]; + for (size_t k = 0; k < minLen; ++k) { + temp3 += bc1_vec2[k] * (T)cache[currentWorkerId * k0 + k]; + temp4 += bc2_vec2[k] * (T)cache[currentWorkerId * k0 + k]; } - boundary[xzIndexOfWorker*4 + 2] = temp3; - boundary[xzIndexOfWorker*4 + 3] = temp4; + boundary[xzIndexOfWorker*4 + 2] = round(temp3 * norm_factor, errorCnt); + boundary[xzIndexOfWorker*4 + 3] = round(temp4 * norm_factor, errorCnt); } + + if (errorCnt > 0) *error = true; } constexpr int blockWidth = 32; constexpr int numOfThreads = 32; extern __shared__ char sharedMemProcess[]; template -__global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_num, const size_t z_num, size_t k0, - const float b1, const float b2, const float norm_factor, float *boundary) { +__global__ void 
bsplineYdirProcess(T *image, const PixelDataDim dim, size_t k0, + const float b1, const float b2, const float norm_factor, float *boundary, bool *error) { const int numOfWorkers = blockDim.x; const int currentWorkerId = threadIdx.x; const int xzOffset = blockIdx.x * blockDim.x; - const int64_t maxXZoffset = x_num * z_num; - const int64_t workersOffset = xzOffset * y_num; + const int64_t maxXZoffset = dim.x * dim.z; + const int64_t workersOffset = xzOffset * dim.y; - T (*cache)[blockWidth + 0] = (T (*)[blockWidth + 0]) &sharedMemProcess[0]; + float (*cache)[blockWidth + 0] = (float (*)[blockWidth + 0]) &sharedMemProcess[0]; float temp1, temp2; + size_t errorCnt = 0; // ---------------- forward direction ------------------------------------------- - for (int yBlockBegin = 0; yBlockBegin < y_num - 2; yBlockBegin += blockWidth) { + for (int yBlockBegin = 0; yBlockBegin < dim.y - 2; yBlockBegin += blockWidth) { // Read from global mem to cache for (int i = currentWorkerId; i < blockWidth * numOfWorkers; i += numOfWorkers) { int offs = i % blockWidth; int work = i / blockWidth; - if (offs + yBlockBegin < (y_num - 2) && work + xzOffset < maxXZoffset) { - cache[work][(offs + work)%blockWidth] = image[workersOffset + y_num * work + offs + yBlockBegin]; + if (offs + yBlockBegin < (dim.y - 2) && work + xzOffset < maxXZoffset) { + cache[work][(offs + work)%blockWidth] = image[workersOffset + dim.y * work + offs + yBlockBegin]; } } __syncthreads(); @@ -166,8 +176,8 @@ __global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_ cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = temp1; cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = temp2; } - for (size_t k = yBlockBegin == 0 ? 2 : 0; k < blockWidth && k + yBlockBegin < y_num - 2; ++k) { - float temp = temp1*b2 + temp2*b1 + cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; + for (size_t k = yBlockBegin == 0 ? 
2 : 0; k < blockWidth && k + yBlockBegin < dim.y - 2; ++k) { + float temp = temp2*b1 + temp1*b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp; temp1 = temp2; temp2 = temp; @@ -179,36 +189,37 @@ __global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_ for (int i = currentWorkerId; i < blockWidth * numOfWorkers; i += numOfWorkers) { int offs = i % blockWidth; int work = i / blockWidth; - if (offs + yBlockBegin < (y_num - 2) && work + xzOffset < maxXZoffset) { - image[workersOffset + y_num * work + offs + yBlockBegin] = cache[work][(offs + work)%blockWidth]; + if (offs + yBlockBegin < (dim.y - 2) && work + xzOffset < maxXZoffset) { + image[workersOffset + dim.y * work + offs + yBlockBegin] = round(cache[work][(offs + work)%blockWidth], errorCnt); } } __syncthreads(); } // ---------------- backward direction ------------------------------------------- - for (int yBlockBegin = y_num - 1; yBlockBegin >= 0; yBlockBegin -= blockWidth) { + for (int yBlockBegin = dim.y - 1; yBlockBegin >= 0; yBlockBegin -= blockWidth) { // Read from global mem to cache for (int i = currentWorkerId; i < blockWidth * numOfWorkers; i += numOfWorkers) { int offs = i % blockWidth; int work = i / blockWidth; if (yBlockBegin - offs >= 0 && work + xzOffset < maxXZoffset) { - cache[work][(offs + work)%blockWidth] = image[workersOffset + y_num * work - offs + yBlockBegin]; + cache[work][(offs + work)%blockWidth] = image[workersOffset + dim.y * work - offs + yBlockBegin]; } } __syncthreads(); // Do operations if (xzOffset + currentWorkerId < maxXZoffset) { - if (yBlockBegin == y_num - 1) { - temp1 = boundary[(xzOffset + currentWorkerId) * 4 + 3]; - temp2 = boundary[(xzOffset + currentWorkerId) * 4 + 2]; + if (yBlockBegin == dim.y - 1) { + temp1 = boundary[(xzOffset + currentWorkerId) * 4 + 3] / norm_factor; + temp2 = boundary[(xzOffset + currentWorkerId) * 4 + 2] / norm_factor; cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = norm_factor * temp1; cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = norm_factor * temp2; } - for (int64_t k = yBlockBegin == y_num - 1 ? 2 : 0; k < blockWidth && yBlockBegin - k >= 0; ++k) { - float temp = temp2*b1 + temp1*b2 + cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; + int64_t k2 = yBlockBegin == dim.y - 1 ? 2 : 0; + for (int64_t k = yBlockBegin == dim.y - 1 ? 
2 : 0; k < blockWidth && yBlockBegin - k >= 0; ++k) { + float temp = temp2*b1 + temp1*b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp * norm_factor; temp1 = temp2; temp2 = temp; @@ -221,25 +232,36 @@ __global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_ int offs = i % blockWidth; int work = i / blockWidth; if (yBlockBegin - offs >= 0 && work + xzOffset < maxXZoffset) { - image[workersOffset + y_num * work - offs + yBlockBegin] = cache[work][(offs + work)%blockWidth]; + image[workersOffset + dim.y * work - offs + yBlockBegin] = round(cache[work][(offs + work)%blockWidth], errorCnt); } } __syncthreads(); } + + if (errorCnt > 0) *error = true; } /** * Function for launching a kernel */ template -void runBsplineYdir(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, +void runBsplineYdir(T *cudaImage, PixelDataDim dim, const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, float b1, float b2, float norm_factor, float *boundary, cudaStream_t aStream) { dim3 threadsPerBlock(numOfThreads); - dim3 numBlocks((x_num * z_num + threadsPerBlock.x - 1) / threadsPerBlock.x); - size_t sharedMemSize = (2 /*bc vectors*/) * (k0) * sizeof(float) + numOfThreads * (k0) * sizeof(T); - bsplineYdirBoundary <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>> (cudaImage, x_num, y_num, z_num, bc1, bc2, bc3, bc4, k0, boundary); - sharedMemSize = numOfThreads * blockWidth * sizeof(T); - bsplineYdirProcess <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>> (cudaImage, x_num, y_num, z_num, k0, b1, b2, norm_factor, boundary); + dim3 numBlocks((dim.x * dim.z + threadsPerBlock.x - 1) / threadsPerBlock.x); + size_t sharedMemSize = (2 /*bc vectors*/) * (k0) * sizeof(float) + numOfThreads * (k0) * sizeof(float); + bool isErrorDetected = false; + { + ScopedCudaMemHandler error(&isErrorDetected, 1); + bsplineYdirBoundary <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, bc1, bc2, bc3,bc4, k0, norm_factor, boundary, error.get()); + sharedMemSize = numOfThreads * blockWidth * sizeof(float); + bsplineYdirProcess <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, k0, b1, b2, norm_factor, boundary, error.get()); + } + + if (isErrorDetected) { + throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineYdir - " + "try squashing the input image to a narrower range or use APRConverter"); + } } #endif diff --git a/src/algorithm/bsplineZdir.cuh b/src/algorithm/bsplineZdir.cuh index 33a5b420..cd59f0fb 100644 --- a/src/algorithm/bsplineZdir.cuh +++ b/src/algorithm/bsplineZdir.cuh @@ -2,10 +2,12 @@ #define BSPLINE_Z_DIR_H +#include "cudaMisc.cuh" #include #include #include + /** * Runs bspline recursive filter in Z direction. Each processed 2D patch consist of number of workes * (distributed in Y direction) and each of them is handling the whole row in Z-dir. 
@@ -60,22 +62,24 @@ * @param norm_factor - filter norm factor */ template -__global__ void bsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_num, +__global__ void bsplineZdir(T *image, PixelDataDim dim, const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, - float b1, float b2, float norm_factor) { + float b1, float b2, float norm_factor, bool *error) { const int yDirOffset = blockIdx.y * blockDim.y + threadIdx.y; - const size_t xDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * y_num; // x is in 'z' to have good memory coalescing - const size_t nextElementZdirOffset = x_num * y_num; - const size_t dirLen = z_num; + const size_t xDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * dim.y; // x is in 'z' to have good memory coalescing + const size_t nextElementZdirOffset = dim.x * dim.y; + const size_t dirLen = dim.z; + const size_t minLen = min(dirLen, k0); - if (yDirOffset < y_num) { + if (yDirOffset < dim.y) { float temp1 = 0; float temp2 = 0; float temp3 = 0; float temp4 = 0; + // calculate boundary values - for (int k = 0; k < k0; ++k) { + for (int k = 0; k < minLen; ++k) { T val = image[xDirOffset + k * nextElementZdirOffset + yDirOffset]; temp1 += bc1[k] * val; temp2 += bc2[k] * val; @@ -84,18 +88,20 @@ __global__ void bsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_num, temp4 += bc4[k] * val; } + size_t errorCnt = 0; + // set boundary values in two first and two last points processed direction - image[xDirOffset + 0 * nextElementZdirOffset + yDirOffset] = temp1; - image[xDirOffset + 1 * nextElementZdirOffset + yDirOffset] = temp2; - image[xDirOffset + (dirLen - 2) * nextElementZdirOffset + yDirOffset] = temp3 * norm_factor; - image[xDirOffset + (dirLen - 1) * nextElementZdirOffset + yDirOffset] = temp4 * norm_factor; + image[xDirOffset + 0 * nextElementZdirOffset + yDirOffset] = round(temp1, errorCnt); + image[xDirOffset + 1 * nextElementZdirOffset + yDirOffset] = round(temp2, errorCnt); + image[xDirOffset + (dirLen - 2) * nextElementZdirOffset + yDirOffset] = round(temp3 * norm_factor, errorCnt); + image[xDirOffset + (dirLen - 1) * nextElementZdirOffset + yDirOffset] = round(temp4 * norm_factor, errorCnt); // Causal Filter loop int64_t offset = xDirOffset + 2 * nextElementZdirOffset + yDirOffset; int64_t offsetLimit = xDirOffset + (dirLen - 2) * nextElementZdirOffset; while (offset < offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = temp1 * b2 + temp2 * b1 + image[offset]; + const float temp = round(image[offset] + b1 * temp2 + b2 * temp1, errorCnt); image[offset] = temp; temp1 = temp2; temp2 = temp; @@ -108,13 +114,15 @@ __global__ void bsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_num, offsetLimit = xDirOffset; while (offset >= offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = temp3 * b1 + temp4 * b2 + image[offset]; - image[offset] = temp * norm_factor; + const float temp = image[offset] + b1 * temp3 + b2 * temp4; + image[offset] = round(temp * norm_factor, errorCnt); temp4 = temp3; temp3 = temp; offset -= nextElementZdirOffset; } + + if (errorCnt > 0) *error = true; } } @@ -122,15 +130,26 @@ __global__ void bsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_num, * Function for launching a kernel */ template -void runBsplineZdir(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, +void runBsplineZdir(T *cudaImage, PixelDataDim dim, const float *bc1, const float *bc2, const float 
*bc3, const float *bc4, size_t k0, float b1, float b2, float norm_factor, cudaStream_t aStream) { constexpr int numOfWorkersYdir = 128; dim3 threadsPerBlockZ(1, numOfWorkersYdir, 1); dim3 numBlocksZ(1, - (y_num + threadsPerBlockZ.y - 1) / threadsPerBlockZ.y, - (x_num + threadsPerBlockZ.x - 1) / threadsPerBlockZ.x); - bsplineZdir <<>> (cudaImage, x_num, y_num, z_num, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor); + (dim.y + threadsPerBlockZ.y - 1) / threadsPerBlockZ.y, + (dim.x + threadsPerBlockZ.x - 1) / threadsPerBlockZ.x); + // In case of error this will be set to true by one of the kernels (CUDA does not guarantee which kernel will set global variable if more then one kernel + // access it but this is enough for us to know that somewhere in one on more kernels overflow was detected. + bool isErrorDetected = false; + { + ScopedCudaMemHandler error(&isErrorDetected, 1); + bsplineZdir <<>> (cudaImage, dim, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor, error.get()); + } + + if (isErrorDetected) { + throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineZdir - " + "try squashing the input image to a narrower range or use APRConverter"); + } } #endif diff --git a/src/algorithm/cudaMisc.cuh b/src/algorithm/cudaMisc.cuh new file mode 100644 index 00000000..7442c60b --- /dev/null +++ b/src/algorithm/cudaMisc.cuh @@ -0,0 +1,66 @@ +#ifndef CUDAMISC_CUH +#define CUDAMISC_CUH + + +#include + + +/** + * floating point output -> no rounding or under-/overflow check + */ +template +__device__ std::enable_if_t::value, T> round(float val, size_t &errCount) { + return val; +} + +/** + * integer output -> check for under-/overflow and round + * + * CUDA is not supporting std::numeric_limits so this results in belows manual checking of different + * data types range. In theory we could use --expt-relaxed-constexpr flag but since it is experimental + * and without guarantee of long existence for now it is better to stick to belows definitions. 
+ */ +template +__device__ std::enable_if_t::value, uint8_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < 0 || val > 255) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, int8_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < -128 || val > 127) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, uint16_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < 0 || val > 65535) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, int16_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < -32768 || val > 32767) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, uint32_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < 0 || val > 4294967295) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, int32_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < -2147483648 || val > 2147483647) { errCount++; } + return val; +} + + +#endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 7bc8f6cc..ba468743 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -14,9 +14,9 @@ buildTarget(testPullingScheme PullingSchemeTest.cpp) #APR GPU Tests if(APR_USE_CUDA) buildTarget(testAPRCuda APRTestCuda.cpp) + buildTarget(testComputeGradientCuda ComputeGradientCudaTest.cpp) endif() - if(APR_BUILD_EXAMPLES) buildTarget(testExamples ExamplesTest.cpp) endif() diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp new file mode 100644 index 00000000..e20678fd --- /dev/null +++ b/test/ComputeGradientCudaTest.cpp @@ -0,0 +1,155 @@ + +#include + +#include "data_structures/Mesh/PixelData.hpp" +#include "algorithm/ComputeGradient.hpp" +#include "algorithm/ComputeGradientCuda.hpp" +#include "TestTools.hpp" + +namespace { + +#ifdef APR_USE_CUDA + + template + class BsplineTest : public testing::Test {}; + TYPED_TEST_SUITE_P(BsplineTest); + + TYPED_TEST_P(BsplineTest, testBsplineInXdirCUDA) { + APRTimer timer(true); + + std::vector> yzSizes = {{1, 1}, + {32, 32}, + {33, 33}, + {44, 35}, + {35, 44}, + {255, 129}}; + + for (auto &p: yzSizes) { + int yLen = p.first; + int zLen = p.second; + // Run test with dimension in range much shorter than filter length to longer than filter length + // (for lambda=3 and tolerance=0.00001 expected filter length k0=18) + for (int xLen = 2; xLen < 22; ++xLen) { + // Generate random mesh + using ImgType = TypeParam; + PixelData m = getRandInitializedMesh(yLen, xLen, zLen, 30, 10); + + // Filter parameters + const float lambda = 3; + const float tolerance = 0.0001; + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU bspline"); + ComputeGradient().bspline_filt_rec_x(mCpu, lambda, tolerance); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU bspline"); + cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_X_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + } + } + } + + TYPED_TEST_P(BsplineTest, testBsplineInZdirCUDA) { + APRTimer timer(true); + + std::vector> xySizes = {{1, 1}, + {32, 32}, + {33, 33}, + {44, 35}, + {35, 44}, + {255, 129}}; + + for (auto &p : xySizes) { + int xLen = p.first; + int yLen = p.second; + // Run test with dimension in range much shorter than filter length to longer than filter length + // (for 
lambda=3 and tolerance=0.00001 expected filter length k0=18) + for (int zLen = 2; zLen < 22; ++zLen) { + // Generate random mesh + using ImgType = TypeParam; + PixelData m = getRandInitializedMesh(yLen, xLen, zLen, 30, 10); + + // Filter parameters + const float lambda = 3; + const float tolerance = 0.0001; + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU bspline"); + ComputeGradient().bspline_filt_rec_z(mCpu, lambda, tolerance); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU bspline"); + cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Z_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + } + } + } + + TYPED_TEST_P(BsplineTest, testBsplineInYdirCUDA) { + APRTimer timer(false); + + std::vector> xzSizes = {{1, 1}, + {32, 32}, + {33, 33}, + {44, 35}, + {35, 44}, + {255, 129}}; + + for (auto &p : xzSizes) { + int xLen = p.first; + int zLen = p.second; + // Run test with dimension in range much shorter than filter length to longer than filter length + // (for lambda=3 and tolerance=0.00001 expected filter length k0=18) + for (int yLen = 2; yLen < 22; ++yLen) { + // Generate random mesh + using ImgType = TypeParam; + PixelData m = getRandInitializedMesh(yLen, xLen, zLen, 30, 10); + + // Filter parameters + const float lambda = 3; + const float tolerance = 0.0001; + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU bspline"); + ComputeGradient().bspline_filt_rec_y(mCpu, lambda, tolerance); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU bspline"); + cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Y_DIR); + timer.stop_timer(); + + //Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0001, 2), 0); + } + } + } + + REGISTER_TYPED_TEST_SUITE_P(BsplineTest, testBsplineInXdirCUDA, testBsplineInZdirCUDA, testBsplineInYdirCUDA); + using ImgTypes = ::testing::Types< float, uint16_t, int16_t, uint8_t>; + INSTANTIATE_TYPED_TEST_SUITE_P(Testing, BsplineTest, ImgTypes); + + +#endif // APR_USE_CUDA + +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index 0b2fc17e..c2f14805 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -9,92 +9,9 @@ #include "algorithm/ComputeGradientCuda.hpp" #include #include "algorithm/APRConverter.hpp" +#include "TestTools.hpp" namespace { - /** - * Compares mesh with provided data - * @param mesh - * @param data - data with [Z][Y][X] structure - * @return true if same - */ - template - bool compare(PixelData &mesh, const float *data, const float epsilon) { - size_t dataIdx = 0; - for (int z = 0; z < mesh.z_num; ++z) { - for (int y = 0; y < mesh.y_num; ++y) { - for (int x = 0; x < mesh.x_num; ++x) { - bool v = std::abs(mesh(y, x, z) - data[dataIdx]) < epsilon; - if (v == false) { - std::cerr << "Mesh and expected data differ. 
First place at (Y, X, Z) = " << y << ", " << x - << ", " << z << ") " << mesh(y, x, z) << " vs " << data[dataIdx] << std::endl; - return false; - } - ++dataIdx; - } - } - } - return true; - } - - /** - * Compares two meshes - * @param expected - * @param tested - * @param maxNumOfErrPrinted - how many error values should be printed (-1 for all) - * @return number of errors detected - */ - template - int compareMeshes(const PixelData &expected, const PixelData &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { - int cnt = 0; - for (size_t i = 0; i < expected.mesh.size(); ++i) { - if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError || std::isnan(expected.mesh[i]) || - std::isnan(tested.mesh[i])) { - if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "ERROR expected vs tested mesh: " << expected.mesh[i] << " vs " << tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; - } - cnt++; - } - } - std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl; - return cnt; - } - - /** - * Generates mesh with provided dims with random values in range [0, 1] * multiplier - * @param y - * @param x - * @param z - * @param multiplier - * @return - */ - template - PixelData getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, bool useIdxNumbers = false) { - PixelData m(y, x, z); - std::cout << "Mesh info: " << m << std::endl; - std::random_device rd; - std::mt19937 mt(rd()); - std::uniform_real_distribution dist(0.0, 1.0); - for (size_t i = 0; i < m.mesh.size(); ++i) { - m.mesh[i] = useIdxNumbers ? i : dist(mt) * multiplier; - } - return m; - } - - template - bool initFromZYXarray(PixelData &mesh, const float *data) { - size_t dataIdx = 0; - for (int z = 0; z < mesh.z_num; ++z) { - for (int y = 0; y < mesh.y_num; ++y) { - for (int x = 0; x < mesh.x_num; ++x) { - mesh(y, x, z) = data[dataIdx]; - ++dataIdx; - } - } - } - return true; - } - - TEST(ComputeGradientTest, 2D_XY) { { // Corner points @@ -801,87 +718,6 @@ namespace { EXPECT_EQ(compareMeshes(grad, gradCuda), 0); } - TEST(ComputeBspineTest, BSPLINE_Y_DIR_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(129,127,128); - - // Filter parameters - const float lambda = 3; - const float tolerance = 0.0001; - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU bspline"); - ComputeGradient().bspline_filt_rec_y(mCpu, lambda, tolerance); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU bspline"); - cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Y_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeBspineTest, BSPLINE_X_DIR_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(129,127,128); - - // Filter parameters - const float lambda = 3; - const float tolerance = 0.0001; - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU bspline"); - ComputeGradient().bspline_filt_rec_x(mCpu, lambda, tolerance); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU bspline"); - cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_X_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeBspineTest, BSPLINE_Z_DIR_CUDA) { - 
APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(129,127,128); - - // Filter parameters - const float lambda = 3; - const float tolerance = 0.0001; - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU bspline"); - ComputeGradient().bspline_filt_rec_z(mCpu, lambda, tolerance); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU bspline"); - cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Z_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - TEST(ComputeBspineTest, BSPLINE_FULL_XYZ_DIR_CUDA) { APRTimer timer(true); diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 14a71814..5a1d1ca8 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -8,6 +8,8 @@ #include "data_structures/Mesh/PixelData.hpp" #include +#include "data_structures/APR/particles/ParticleData.hpp" + std::string get_source_directory_apr(){ // returns path to the directory where utils.cpp is stored @@ -102,25 +104,27 @@ inline int64_t compareParticles(const ParticleData &expected, const ParticleD /** - * Generates mesh with provided dims with random values in range [0, 1] * multiplier + * Generates mesh with provided dims with random values in range [0, 1] * multiplier + offset * @param y * @param x * @param z * @param multiplier + * @param offset * @return */ template -inline PixelData getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, bool useIdxNumbers = false) { +inline PixelData getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, float offset=0.0, bool useIdxNumbers = false) { PixelData m(y, x, z); std::cout << "Mesh info: " << m << std::endl; std::random_device rd; std::mt19937 mt(rd()); std::uniform_real_distribution dist(0.0, 1.0); + #ifdef HAVE_OPENMP #pragma omp parallel for default(shared) #endif for (size_t i = 0; i < m.mesh.size(); ++i) { - m.mesh[i] = useIdxNumbers ? i : dist(mt) * multiplier; + m.mesh[i] = useIdxNumbers ? 
i : dist(mt) * multiplier + offset; } return m; } From b563da41f7ee1a9cbad96fbe6819b1cf15aa2da7 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 1 Aug 2022 15:02:35 +0200 Subject: [PATCH 02/80] Debug messages turned off --- src/algorithm/ComputeGradient.hpp | 2 +- test/ComputeGradientCudaTest.cpp | 29 +++++++++++++++++++++++++++-- test/ComputeGradientTest.cpp | 27 --------------------------- test/TestTools.hpp | 4 ++-- 4 files changed, 30 insertions(+), 32 deletions(-) diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp index 911013b1..d7876248 100644 --- a/src/algorithm/ComputeGradient.hpp +++ b/src/algorithm/ComputeGradient.hpp @@ -251,7 +251,7 @@ ComputeGradient::BsplineParams ComputeGradient::prepareBSplineParams(size_t dimL const float norm_factor = powf((1 - 2.0*rho*cosf(omg) + powf(rho,2)),2); - std::cout << std::fixed << std::setprecision(9) << "CPU xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << std::endl; + // std::cout << std::fixed << std::setprecision(9) << "CPU xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << std::endl; // for boundaries std::vector impulse_resp_vec_f(k0+1); //forward diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index e20678fd..690fd4d6 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ -15,7 +15,7 @@ namespace { TYPED_TEST_SUITE_P(BsplineTest); TYPED_TEST_P(BsplineTest, testBsplineInXdirCUDA) { - APRTimer timer(true); + APRTimer timer(false); std::vector> yzSizes = {{1, 1}, {32, 32}, @@ -57,7 +57,7 @@ namespace { } TYPED_TEST_P(BsplineTest, testBsplineInZdirCUDA) { - APRTimer timer(true); + APRTimer timer(false); std::vector> xySizes = {{1, 1}, {32, 32}, @@ -144,7 +144,32 @@ namespace { using ImgTypes = ::testing::Types< float, uint16_t, int16_t, uint8_t>; INSTANTIATE_TYPED_TEST_SUITE_P(Testing, BsplineTest, ImgTypes); + TEST(ComputeBspineTest, BSPLINE_FULL_XYZ_DIR_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(127, 128, 129, 100, 10); + // Filter parameters + const float lambda = 3; + const float tolerance = 0.0001; // as defined in get_smooth_bspline_3D + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU bspline"); + ComputeGradient().get_smooth_bspline_3D(mCpu, lambda); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU bspline"); + cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_ALL_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + } #endif // APR_USE_CUDA } diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index c2f14805..4de049fa 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -718,33 +718,6 @@ namespace { EXPECT_EQ(compareMeshes(grad, gradCuda), 0); } - TEST(ComputeBspineTest, BSPLINE_FULL_XYZ_DIR_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(127, 128, 129); - - // Filter parameters - const float lambda = 3; - const float tolerance = 0.0001; // as defined in get_smooth_bspline_3D - - // Calculate bspline on CPU - PixelData mCpu(m, 
true); - timer.start_timer("CPU bspline"); - ComputeGradient().get_smooth_bspline_3D(mCpu, lambda); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU bspline"); - cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_ALL_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_CUDA) { using ImgType = float; diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 5a1d1ca8..6d6cd440 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -77,7 +77,7 @@ inline int compareMeshes(const PixelData &expected, const PixelData &teste cnt++; } } - std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl; + if (cnt != 0) std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl; return cnt; } @@ -115,7 +115,7 @@ inline int64_t compareParticles(const ParticleData &expected, const ParticleD template inline PixelData getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, float offset=0.0, bool useIdxNumbers = false) { PixelData m(y, x, z); - std::cout << "Mesh info: " << m << std::endl; +// std::cout << "Mesh info: " << m << std::endl; std::random_device rd; std::mt19937 mt(rd()); std::uniform_real_distribution dist(0.0, 1.0); From 3db510fba42d80aeed1434620e8dc63e05590d54 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 1 Aug 2022 17:13:54 +0200 Subject: [PATCH 03/80] Fixed Inv Bspline in X direction (CUDA pipeline) --- src/algorithm/invBspline.cuh | 10 +++++++--- test/ComputeGradientCudaTest.cpp | 24 ++++++++++++++++++++++++ test/ComputeGradientTest.cpp | 23 ----------------------- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/src/algorithm/invBspline.cuh b/src/algorithm/invBspline.cuh index d422abf1..c912b054 100644 --- a/src/algorithm/invBspline.cuh +++ b/src/algorithm/invBspline.cuh @@ -49,21 +49,25 @@ __global__ void invBsplineXdir(T *image, size_t x_num, size_t y_num, size_t z_nu const int workerIdx = blockIdx.y * blockDim.y + threadIdx.y ; const int nextElementOffset = y_num; + const float a1 = 1.0/6.0; + const float a2 = 4.0/6.0; + const float a3 = 1.0/6.0; + if (workerIdx < y_num) { int currElementOffset = 0; T v1 = image[workerOffset + currElementOffset]; T v2 = image[workerOffset + currElementOffset + nextElementOffset]; - image[workerOffset + currElementOffset] = (2 * v2 + 4 * v1) / 6.0; + image[workerOffset + currElementOffset] = (a1 * v2 + a2 * v1 + a3 * v2); for (int x = 2; x < x_num; ++x) { T v3 = image[workerOffset + currElementOffset + 2 * nextElementOffset]; - image[workerOffset + currElementOffset + nextElementOffset] = (v1 + 4 * v2 + v3) / 6.0; + image[workerOffset + currElementOffset + nextElementOffset] = (a1 * v1 + a2 * v2 + a3 * v3); v1 = v2; v2 = v3; currElementOffset += nextElementOffset; } - image[workerOffset + currElementOffset + nextElementOffset] = (2 * v1 + 4 * v2) / 6.0; + image[workerOffset + currElementOffset + nextElementOffset] = (a1 + a3) * v1 + a2 * v2; } } diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index 690fd4d6..c63900cd 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ -170,6 +170,30 @@ namespace { // Compare GPU vs CPU EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); } + + TEST(ComputeInverseBspline, CALC_INV_BSPLINE_X_RND_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + 
PixelData m = getRandInitializedMesh(127, 61, 66, 100, 10); + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU inv bspline"); + ComputeGradient().calc_inv_bspline_x(mCpu); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU inv bspline"); + cudaInverseBspline(mGpu, INV_BSPLINE_X_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); + } + #endif // APR_USE_CUDA } diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index 4de049fa..2a59d1cd 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -785,29 +785,6 @@ namespace { ASSERT_TRUE(compare(m, expect, 0.01)); } - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_X_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(127, 61, 66); - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU inv bspline"); - ComputeGradient().calc_inv_bspline_x(mCpu); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU inv bspline"); - cudaInverseBspline(mGpu, INV_BSPLINE_X_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Z_RND_CUDA) { APRTimer timer(true); From 18fce44baf89cebf242e7d0b3db3a194181337af Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 2 Aug 2022 16:29:11 +0200 Subject: [PATCH 04/80] Inverse Bspline pipeline for CUDA fixed --- src/algorithm/ComputeGradient.hpp | 6 +- src/algorithm/invBspline.cuh | 26 ++++--- test/ComputeGradientCudaTest.cpp | 82 +++++++++++++++++++++- test/ComputeGradientTest.cpp | 111 ------------------------------ test/TestTools.hpp | 4 +- 5 files changed, 102 insertions(+), 127 deletions(-) diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp index d7876248..529af089 100644 --- a/src/algorithm/ComputeGradient.hpp +++ b/src/algorithm/ComputeGradient.hpp @@ -687,8 +687,7 @@ void ComputeGradient::calc_inv_bspline_y(PixelData& input){ } //LHS boundary condition - input.mesh[j*x_num*y_num + i*y_num] = a2*temp_vec[0]; - input.mesh[j*x_num*y_num + i*y_num] += (a1+a3)*temp_vec[1]; + input.mesh[j*x_num*y_num + i*y_num] = a1*temp_vec[1] + a2*temp_vec[0] + a3 * temp_vec[1]; for (int64_t k = 1; k < (y_num-1);k++){ const int64_t idx = j * x_num * y_num + i * y_num + k; @@ -696,8 +695,7 @@ void ComputeGradient::calc_inv_bspline_y(PixelData& input){ } //RHS boundary condition - input.mesh[j*x_num*y_num + i*y_num + y_num - 1] = (a1+a3)*temp_vec[y_num - 2]; - input.mesh[j*x_num*y_num + i*y_num + y_num - 1] += a2*temp_vec[y_num - 1]; + input.mesh[j*x_num*y_num + i*y_num + y_num - 1] = a1*temp_vec[y_num - 2] + a2*temp_vec[y_num - 1] + a3*temp_vec[y_num - 2]; } } } diff --git a/src/algorithm/invBspline.cuh b/src/algorithm/invBspline.cuh index c912b054..7c27d853 100644 --- a/src/algorithm/invBspline.cuh +++ b/src/algorithm/invBspline.cuh @@ -9,14 +9,18 @@ __global__ void invBsplineYdir(T *image, size_t x_num, size_t y_num, size_t z_nu int workerOffset = workerIdx; int loopNum = 0; - T p = 0; - T v = 0; + const float a1 = 1.0/6.0; + const float a2 = 4.0/6.0; + const float a3 = 1.0/6.0; + + float p = 0; + float v = 0; bool notLastInRow = true; while (workerOffset < y_num) { if (notLastInRow) v = image[workersOffset + workerOffset]; - T temp = __shfl_sync(active, v, workerIdx + 
blockDim.y - 1, blockDim.y); + float temp = __shfl_sync(active, v, workerIdx + blockDim.y - 1, blockDim.y); p = notLastInRow ? temp : p; - T n = __shfl_sync(active, v, workerIdx + 1, blockDim.y); + float n = __shfl_sync(active, v, workerIdx + 1, blockDim.y); // handle boundary (reflective mode) if (workerOffset == 0) p = n; @@ -24,7 +28,7 @@ __global__ void invBsplineYdir(T *image, size_t x_num, size_t y_num, size_t z_nu notLastInRow = (workerIdx + 1 + loopNum) % blockDim.y != 0; if (notLastInRow) { - v = (p + v * 4 + n) / 6.0; + v = a1 * p + a2 * v + a3 * n; image[workersOffset + workerOffset] = v; workerOffset += blockDim.y; } @@ -58,7 +62,7 @@ __global__ void invBsplineXdir(T *image, size_t x_num, size_t y_num, size_t z_nu T v1 = image[workerOffset + currElementOffset]; T v2 = image[workerOffset + currElementOffset + nextElementOffset]; - image[workerOffset + currElementOffset] = (a1 * v2 + a2 * v1 + a3 * v2); + image[workerOffset + currElementOffset] = a1 * v2 + a2 * v1 + a3 * v2; for (int x = 2; x < x_num; ++x) { T v3 = image[workerOffset + currElementOffset + 2 * nextElementOffset]; @@ -87,21 +91,25 @@ __global__ void invBsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_nu const int workerIdx = blockIdx.y * blockDim.y + threadIdx.y ; const int nextElementOffset = x_num * y_num; + const float a1 = 1.0/6.0; + const float a2 = 4.0/6.0; + const float a3 = 1.0/6.0; + if (workerIdx < y_num) { int currElementOffset = 0; T v1 = image[workerOffset + currElementOffset]; T v2 = image[workerOffset + currElementOffset + nextElementOffset]; - image[workerOffset + currElementOffset] = (2 * v2 + 4 * v1) / 6.0; + image[workerOffset + currElementOffset] = a1 * v2 + a2 * v1 + a1 * v2; for (int x = 2; x < z_num; ++x) { T v3 = image[workerOffset + currElementOffset + 2 * nextElementOffset]; - image[workerOffset + currElementOffset + nextElementOffset] = (v1 + 4 * v2 + v3) / 6.0; + image[workerOffset + currElementOffset + nextElementOffset] = a1 * v1 + a2 * v2 + a3 * v3; v1 = v2; v2 = v3; currElementOffset += nextElementOffset; } - image[workerOffset + currElementOffset + nextElementOffset] = (2 * v1 + 4 * v2) / 6.0; + image[workerOffset + currElementOffset + nextElementOffset] = (a1 + a3) * v1 + a2 * v2; } } diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index c63900cd..81320e80 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ -10,6 +10,11 @@ namespace { #ifdef APR_USE_CUDA + + // ======================================================================== + // BSPLINE tests + // ======================================================================== + template class BsplineTest : public testing::Test {}; TYPED_TEST_SUITE_P(BsplineTest); @@ -168,9 +173,14 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); } + + // ======================================================================== + // INV. 
BSPLINE tests + // ======================================================================== + TEST(ComputeInverseBspline, CALC_INV_BSPLINE_X_RND_CUDA) { APRTimer timer(false); @@ -194,6 +204,76 @@ namespace { EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); } + TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Z_RND_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(128, 61, 66, 100, 10); + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU inv bspline"); + ComputeGradient().calc_inv_bspline_z(mCpu); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU inv bspline"); + cudaInverseBspline(mGpu, INV_BSPLINE_Z_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); + } + + TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_RND_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(127, 61, 71, 100, 10); + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU inv bspline"); + ComputeGradient().calc_inv_bspline_y(mCpu); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU inv bspline"); + cudaInverseBspline(mGpu, INV_BSPLINE_Y_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); + } + + TEST(ComputeInverseBspline, CALC_INV_BSPLINE_FULL_XYZ_DIR_RND_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(32,32,32,100, 10); + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU inv bspline"); + ComputeGradient().calc_inv_bspline_y(mCpu); + ComputeGradient().calc_inv_bspline_x(mCpu); + ComputeGradient().calc_inv_bspline_z(mCpu); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU inv bspline"); + cudaInverseBspline(mGpu, INV_BSPLINE_ALL_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); + } #endif // APR_USE_CUDA } diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index 2a59d1cd..a03d5746 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -718,120 +718,9 @@ namespace { EXPECT_EQ(compareMeshes(grad, gradCuda), 0); } - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_CUDA) { - using ImgType = float; - - ImgType init[] = {1.00, 0.00, 0.00, - 1.00, 0.00, 6.00, - 0.00, 6.00, 0.00, - 6.00, 0.00, 0.00}; - - ImgType expect[] = {1.00, 0.00, 2.00, - 0.83, 1.00, 4.00, - 1.17, 4.00, 1.00, - 4.00, 2.00, 0.00}; - PixelData m(4, 3, 1); - initFromZYXarray(m, init); - - // Calculate and compare - m.printMesh(4,2); - cudaInverseBspline(m, INV_BSPLINE_Y_DIR); - m.printMesh(4,2); - ASSERT_TRUE(compare(m, expect, 0.01)); - } - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_RND_CUDA) { - APRTimer timer(true); - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(127, 33, 31); - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU inv bspline"); - ComputeGradient().calc_inv_bspline_y(mCpu); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU inv bspline"); - cudaInverseBspline(mGpu, INV_BSPLINE_Y_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - 
EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_X_CUDA) { - using ImgType = float; - - ImgType init[] = {0.00, 6.00, 0.00, - 1.00, 0.00, 0.00, - 0.00, 0.00, 1.00}; - - ImgType expect[] = {2.00, 4.00, 2.00, - 0.67, 0.16, 0.00, - 0.00, 0.16, 0.67}; - - PixelData m(3, 3, 1); - initFromZYXarray(m, init); - - // Calculate and compare - m.printMesh(4,2); - cudaInverseBspline(m, INV_BSPLINE_X_DIR); - m.printMesh(4,2); - ASSERT_TRUE(compare(m, expect, 0.01)); - } - - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Z_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(127, 61, 66); - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU inv bspline"); - ComputeGradient().calc_inv_bspline_z(mCpu); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU inv bspline"); - cudaInverseBspline(mGpu, INV_BSPLINE_Z_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_FULL_XYZ_DIR_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(3,3,3,100); - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU inv bspline"); - ComputeGradient().calc_inv_bspline_y(mCpu); - ComputeGradient().calc_inv_bspline_x(mCpu); - ComputeGradient().calc_inv_bspline_z(mCpu); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU inv bspline"); - cudaInverseBspline(mGpu, INV_BSPLINE_ALL_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } TEST(ComputeThreshold, CALC_THRESHOLD_RND_CUDA) { APRTimer timer(true); diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 6d6cd440..d8b99fc7 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -72,7 +72,7 @@ inline int compareMeshes(const PixelData &expected, const PixelData &teste for (size_t i = 0; i < expected.mesh.size(); ++i) { if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError) { if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; + std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] << " error = " << (float)expected.mesh[i] - (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; } cnt++; } @@ -93,7 +93,7 @@ inline int64_t compareParticles(const ParticleData &expected, const ParticleD for (size_t i = 0; i < expected.size(); ++i) { if (std::abs(expected[i] - tested[i]) > maxError) { if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "ERROR expected vs tested particle: " << (float)expected[i] << " vs " << (float)tested[i] << " IDX:" << i << std::endl; + std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested particle: " << (float)expected[i] << " vs " << (float)tested[i] << " IDX:" << i << std::endl; } cnt++; } From ad5f194006661569bd62a0078b4b0d274ccc47a2 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 3 Aug 2022 16:39:23 +0200 Subject: [PATCH 05/80] Downsample and downsample gradient corrected to match GPU --- src/algorithm/ComputeGradient.hpp | 20 
+++++--- src/algorithm/ComputeGradientCuda.cu | 8 +-- src/algorithm/bsplineYdir.cuh | 1 - src/algorithm/dsGradient.cuh | 29 ++++++----- test/ComputeGradientCudaTest.cpp | 31 ++++++++++++ test/ComputeGradientTest.cpp | 74 ---------------------------- test/MeshDataTest.cpp | 51 ++----------------- 7 files changed, 68 insertions(+), 146 deletions(-) diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp index 529af089..80b72a36 100644 --- a/src/algorithm/ComputeGradient.hpp +++ b/src/algorithm/ComputeGradient.hpp @@ -887,11 +887,15 @@ void ComputeGradient::calc_bspline_fd_ds_mag(const PixelData &input, PixelDat //compute the boundary values if (y_num >= 2) { - temp[0] = sqrt(pow((right[0] - left[0]) / (2 * hx), 2.0) + pow((down[0] - up[0]) / (2 * hz), 2.0) + - pow((center[1] - center[0 /* boundary */]) / (2 * hy), 2.0)); - temp[y_num - 1] = sqrt(pow((right[y_num - 1] - left[y_num - 1]) / (2 * hx), 2.0) + - pow((down[y_num - 1] - up[y_num - 1]) / (2 * hz), 2.0) + - pow((center[y_num - 1 /* boundary */] - center[y_num - 2]) / (2 * hy), 2.0)); + float dx = (right[0] - left[0]) / (2 * hx); + float dz = (down[0] - up[0]) / (2 * hz); + float dy = (center[1] - center[0 /* boundary */]) / (2 * hy); + temp[0] = sqrtf(dx*dx + dz*dz + dy*dy); + + dx = (right[y_num - 1] - left[y_num - 1]) / (2 * hx); + dz = (down[y_num - 1] - up[y_num - 1]) / (2 * hz); + dy = (center[y_num - 1 /* boundary */] - center[y_num - 2]) / (2 * hy); + temp[y_num - 1] = sqrtf(dx*dx + dz*dz + dy*dy); } else { temp[0] = 0; // same values minus same values in x/y/z } @@ -901,8 +905,10 @@ void ComputeGradient::calc_bspline_fd_ds_mag(const PixelData &input, PixelDat #pragma omp simd #endif for (size_t y = 1; y < y_num - 1; ++y) { - temp[y] = sqrt(pow((right[y] - left[y]) / (2 * hx), 2.0) + pow((down[y] - up[y]) / (2 * hz), 2.0) + - pow((center[y + 1] - center[y - 1]) / (2 * hy), 2.0)); + float dx = (right[y] - left[y]) / (2 * hx); + float dz = (down[y] - up[y]) / (2 * hz); + float dy = (center[y + 1] - center[y - 1]) / (2 * hy); + temp[y] = sqrtf(dx*dx + dz*dz + dy*dy); } // Set as a downsampled gradient maximum from 2x2x2 gradient cubes diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 982e649c..97dcd5b0 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -172,13 +172,13 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc BsplineParams &p, float *bc1, float *bc2, float *bc3, float *bc4, float *boundary, float bspline_offset, const APRParameters &par, cudaStream_t aStream) { - runThresholdImg(cudaImage, image.x_num, image.y_num, image.z_num, par.Ip_th + bspline_offset, aStream); + //runThresholdImg(cudaImage, image.x_num, image.y_num, image.z_num, par.Ip_th + bspline_offset, aStream); runBsplineYdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, boundary, aStream); runBsplineXdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); runBsplineZdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); - runKernelGradient(cudaImage, cudaGrad, image.x_num, image.y_num, image.z_num, local_scale_temp.x_num, local_scale_temp.y_num, par.dx, par.dy, par.dz, aStream); + runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, 
aStream); @@ -186,7 +186,7 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - runThreshold(cudalocal_scale_temp, cudaGrad, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, par.Ip_th, aStream); + //runThreshold(cudalocal_scale_temp, cudaGrad, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, par.Ip_th, aStream); } class CurrentTime { @@ -468,5 +468,5 @@ void cudaDownsampledGradient(PixelData &input, PixelData &grad, co ScopedCudaMemHandler, H2D | D2H> cudaInput(input); ScopedCudaMemHandler, D2H> cudaGrad(grad); - runKernelGradient(cudaInput.get(), cudaGrad.get(), input.x_num, input.y_num, input.z_num, grad.x_num, grad.y_num, hx, hy, hz, 0); + runKernelGradient(cudaInput.get(), cudaGrad.get(), input.getDimension(), grad.getDimension(), hx, hy, hz, 0); } diff --git a/src/algorithm/bsplineYdir.cuh b/src/algorithm/bsplineYdir.cuh index a1026704..1a0986d1 100644 --- a/src/algorithm/bsplineYdir.cuh +++ b/src/algorithm/bsplineYdir.cuh @@ -217,7 +217,6 @@ __global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, size_t k0, cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = norm_factor * temp1; cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = norm_factor * temp2; } - int64_t k2 = yBlockBegin == dim.y - 1 ? 2 : 0; for (int64_t k = yBlockBegin == dim.y - 1 ? 2 : 0; k < blockWidth && yBlockBegin - k >= 0; ++k) { float temp = temp2*b1 + temp1*b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp * norm_factor; diff --git a/src/algorithm/dsGradient.cuh b/src/algorithm/dsGradient.cuh index de4a2c77..8e2efc84 100644 --- a/src/algorithm/dsGradient.cuh +++ b/src/algorithm/dsGradient.cuh @@ -5,11 +5,14 @@ template __global__ void -gradient(const T *input, size_t x_num, size_t y_num, size_t z_num, T *grad, size_t x_num_ds, size_t y_num_ds, - float hx, float hy, float hz) { +gradient(const T *input, PixelDataDim inputDim, T *grad, PixelDataDim gradDim, float hx, float hy, float hz) { const int xi = ((blockIdx.x * blockDim.x) + threadIdx.x) * 2; const int yi = ((blockIdx.y * blockDim.y) + threadIdx.y) * 2; const int zi = ((blockIdx.z * blockDim.z) + threadIdx.z) * 2; + const auto x_num = inputDim.x; + const auto y_num = inputDim.y; + const auto z_num = inputDim.z; + if (xi >= x_num || yi >= y_num || zi >= z_num) return; const size_t xnumynum = x_num * y_num; @@ -33,28 +36,28 @@ gradient(const T *input, size_t x_num, size_t y_num, size_t z_num, T *grad, size for (int y = 1; y <= 2; ++y) { float xd = (temp[z][x - 1][y] - temp[z][x + 1][y]) / (2 * hx); xd = xd * xd; - float yd = (temp[z - 1][x][y] - temp[z + 1][x][y]) / (2 * hy); - yd = yd * yd; - float zd = (temp[z][x][y - 1] - temp[z][x][y + 1]) / (2 * hz); + float zd = (temp[z - 1][x][y] - temp[z + 1][x][y]) / (2 * hz); zd = zd * zd; - float gm = __fsqrt_rn(xd + yd + zd); + float yd = (temp[z][x][y - 1] - temp[z][x][y + 1]) / (2 * hy); + yd = yd * yd; + float gm = sqrtf(xd + zd + yd); if (gm > maxGrad) maxGrad = gm; } - const size_t idx = zi / 2 * x_num_ds * y_num_ds + xi / 2 * y_num_ds + yi / 2; + const size_t idx = zi / 2 * gradDim.x * gradDim.y + xi / 2 * gradDim.y + yi / 2; grad[idx] = maxGrad; } template void runKernelGradient(const T *cudaInput, T 
*cudaGrad, - size_t xLenInput, size_t yLenInput, size_t zLenInput, - size_t xLenGradient, size_t yLenGradient, + PixelDataDim inputDim, + PixelDataDim gradDim, float hx, float hy, float hz, cudaStream_t aStream) { dim3 threadsPerBlock(1, 64, 1); - dim3 numBlocks((xLenInput + threadsPerBlock.x - 1) / threadsPerBlock.x, - (yLenInput + threadsPerBlock.y - 1) / threadsPerBlock.y, - (zLenInput + threadsPerBlock.z - 1) / threadsPerBlock.z); - gradient <<>> (cudaInput, xLenInput, yLenInput, zLenInput, cudaGrad, xLenGradient, yLenGradient, hx, hy, hz); + dim3 numBlocks((inputDim.x + threadsPerBlock.x - 1) / threadsPerBlock.x, + (inputDim.y + threadsPerBlock.y - 1) / threadsPerBlock.y, + (inputDim.z + threadsPerBlock.z - 1) / threadsPerBlock.z); + gradient <<>> (cudaInput, inputDim, cudaGrad, gradDim, hx, hy, hz); } diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index 81320e80..d7fc6e62 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ -274,6 +274,37 @@ namespace { // Compare GPU vs CPU EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); } + + // ======================================================================== + // Downsampled gradient + // ======================================================================== + + TEST(ComputeGradientTest, GPU_VS_CPU_ON_RANDOM_VALUES) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(31, 32, 33, 100); + + // Calculate gradient on CPU + PixelData grad; + grad.initDownsampled(m, 0); + timer.start_timer("CPU gradient"); + ComputeGradient().calc_bspline_fd_ds_mag(m, grad, 1, 1, 1); + timer.stop_timer(); + + // Calculate gradient on GPU + PixelData gradCuda; + gradCuda.initDownsampled(m, 0); + timer.start_timer("GPU gradient"); + cudaDownsampledGradient(m, gradCuda, 1, 1, 1); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(grad, gradCuda, 0.0000001), 0); + } + + #endif // APR_USE_CUDA } diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index a03d5746..d94f74c0 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -648,80 +648,6 @@ namespace { #ifdef APR_USE_CUDA - TEST(ComputeGradientTest, 2D_XY_CUDA) { - // Corner points - PixelData m(6, 6, 1, 0); - // expect gradient is 3x3 X/Y plane - float expect[] = {1.41, 0, 4.24, - 0, 0, 0, - 2.82, 0, 5.65}; - // put values in corners - m(0, 0, 0) = 2; - m(5, 0, 0) = 4; - m(0, 5, 0) = 6; - m(5, 5, 0) = 8; - PixelData grad; - grad.initDownsampled(m, 0); - cudaDownsampledGradient(m, grad, 1, 1, 1); - ASSERT_TRUE(compare(grad, expect, 0.01)); - } - - TEST(ComputeGradientTest, Corners3D_CUDA) { - PixelData m(6, 6, 4, 0); - // expect gradient is 3x3x2 X/Y/Z plane - float expect[] = {1.73, 0, 5.19, - 0, 0, 0, - 3.46, 0, 6.92, - - 8.66, 0, 12.12, - 0, 0, 0, - 10.39, 0, 13.85}; - // put values in corners - m(0, 0, 0) = 2; - m(5, 0, 0) = 4; - m(0, 5, 0) = 6; - m(5, 5, 0) = 8; - m(0, 0, 3) = 10; - m(5, 0, 3) = 12; - m(0, 5, 3) = 14; - m(5, 5, 3) = 16; - - PixelData grad; - grad.initDownsampled(m, 0); - cudaDownsampledGradient(m, grad, 1, 1, 1); - ASSERT_TRUE(compare(grad, expect, 0.01)); - } - - TEST(ComputeGradientTest, GPU_VS_CPU_ON_RANDOM_VALUES) { - // Generate random mesh - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(33, 31, 3); - - APRTimer timer(true); - - // Calculate gradient on CPU - PixelData grad; - grad.initDownsampled(m, 0); - timer.start_timer("CPU gradient"); 
- ComputeGradient().calc_bspline_fd_ds_mag(m, grad, 1, 1, 1); - timer.stop_timer(); - - // Calculate gradient on GPU - PixelData gradCuda; - gradCuda.initDownsampled(m, 0); - timer.start_timer("GPU gradient"); - cudaDownsampledGradient(m, gradCuda, 1, 1, 1); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(grad, gradCuda), 0); - } - - - - - TEST(ComputeThreshold, CALC_THRESHOLD_RND_CUDA) { APRTimer timer(true); diff --git a/test/MeshDataTest.cpp b/test/MeshDataTest.cpp index 869229e3..f9c9bf4b 100644 --- a/test/MeshDataTest.cpp +++ b/test/MeshDataTest.cpp @@ -5,6 +5,7 @@ #include "data_structures/Mesh/PixelData.hpp" #include "data_structures/Mesh/PixelDataCuda.h" #include +#include "TestTools.hpp" namespace { @@ -675,51 +676,7 @@ namespace { } #ifdef APR_USE_CUDA -namespace { - /** - * Compares two meshes - * @param expected - * @param tested - * @param maxNumOfErrPrinted - how many error values should be printed (-1 for all) - * @return number of errors detected - */ - template - int compareMeshes(const PixelData &expected, const PixelData &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { - int cnt = 0; - for (size_t i = 0; i < expected.mesh.size(); ++i) { - if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError || std::isnan(expected.mesh[i]) || - std::isnan(tested.mesh[i])) { - if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "ERROR expected vs tested mesh: " << expected.mesh[i] << " vs " << tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; - } - cnt++; - } - } - std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl; - return cnt; - } - /** - * Generates mesh with provided dims with random values in range [0, 1] * multiplier - * @param y - * @param x - * @param z - * @param multiplier - * @return - */ - template - PixelData getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, bool useIdxNumbers = false) { - PixelData m(y, x, z); - std::cout << "Mesh info: " << m << std::endl; - std::random_device rd; - std::mt19937 mt(rd()); - std::uniform_real_distribution dist(0.0, 1.0); - for (size_t i = 0; i < m.mesh.size(); ++i) { - m.mesh[i] = useIdxNumbers ? 
i : dist(mt) * multiplier; - } - return m; - } -} TEST(MeshDataSimpleTest, DownSampleCuda) { { // reduce/constant_operator calculate maximum value when downsampling PixelData m(5, 6, 4); @@ -773,10 +730,10 @@ TEST(MeshDataSimpleTest, DownSampleCuda) { EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); } { - APRTimer timer(true); + APRTimer timer(false); // reduce/constant_operator calculate average value of pixels when downsampling - PixelData m = getRandInitializedMesh(33, 22, 21); + PixelData m = getRandInitializedMesh(33, 22, 21, 100, 5); for (size_t i = 0; i < m.mesh.size(); ++i) m.mesh[i] = 27 - i; PixelData mCpu; mCpu.initDownsampled(m); @@ -792,7 +749,7 @@ TEST(MeshDataSimpleTest, DownSampleCuda) { downsampleMeanCuda(m, mGpu); timer.stop_timer(); - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); } } #endif From 557eff32759a4c5b5013607082e9ffe01ef8bcb5 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 9 Aug 2022 17:25:58 +0200 Subject: [PATCH 06/80] GPU pipeline fixes - Full Gradient test is working now --- src/algorithm/ComputeGradient.hpp | 1 - src/algorithm/ComputeGradientCuda.cu | 109 ++++++++++++++++-------- src/algorithm/bsplineParams.h | 19 +++++ src/algorithm/bsplineXdir.cuh | 31 +++---- src/algorithm/bsplineYdir.cuh | 79 +++++++++-------- src/algorithm/bsplineZdir.cuh | 33 ++++--- src/data_structures/Mesh/PixelData.hpp | 11 +-- src/data_structures/Mesh/downsample.cuh | 4 +- src/misc/CudaTools.cuh | 42 +++++++-- test/ComputeGradientCudaTest.cpp | 56 +++++++++++- test/ComputeGradientTest.cpp | 55 +----------- 11 files changed, 260 insertions(+), 180 deletions(-) create mode 100644 src/algorithm/bsplineParams.h diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp index 80b72a36..ee5aeec8 100644 --- a/src/algorithm/ComputeGradient.hpp +++ b/src/algorithm/ComputeGradient.hpp @@ -129,7 +129,6 @@ inline void ComputeGradient::get_gradient(PixelData &image_temp, Pixe timer.stop_timer(); } } - } diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 97dcd5b0..99f28558 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -16,6 +16,7 @@ #include "dsGradient.cuh" #include "invBspline.cuh" +#include "bsplineParams.h" #include "bsplineXdir.cuh" #include "bsplineYdir.cuh" #include "bsplineZdir.cuh" @@ -34,6 +35,13 @@ namespace { float norm_factor; } BsplineParams; + struct BsplineParamsCudaMemoryHandlers { + ScopedCudaMemHandler bc1; + ScopedCudaMemHandler bc2; + ScopedCudaMemHandler bc3; + ScopedCudaMemHandler bc4; + }; + float impulse_resp(float k, float rho, float omg) { // Impulse Response Function return (powf(rho, (std::abs(k))) * sinf((std::abs(k) + 1) * omg)) / sinf(omg); @@ -169,14 +177,12 @@ void runThresholdImg(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, flo template void getGradientCuda(const PixelData &image, PixelData &local_scale_temp, ImgType *cudaImage, ImgType *cudaGrad, float *cudalocal_scale_temp, - BsplineParams &p, float *bc1, float *bc2, float *bc3, float *bc4, float *boundary, + BsplineParamsCuda &px, BsplineParamsCuda &py, BsplineParamsCuda &pz, float *boundary, float bspline_offset, const APRParameters &par, cudaStream_t aStream) { - //runThresholdImg(cudaImage, image.x_num, image.y_num, image.z_num, par.Ip_th + bspline_offset, aStream); - - runBsplineYdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, boundary, aStream); - runBsplineXdir(cudaImage, 
image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); - runBsplineZdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); + runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream); + runBsplineXdir(cudaImage, image.getDimension(), px, aStream); + runBsplineZdir(cudaImage, image.getDimension(), pz, aStream); runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); @@ -185,8 +191,6 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - - //runThreshold(cudalocal_scale_temp, cudaGrad, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, par.Ip_th, aStream); } class CurrentTime { @@ -292,9 +296,12 @@ public: void processOnGpu() { CurrentTime ct; uint64_t start = ct.microseconds(); - getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), - params, bc1.get(), bc2.get(), bc3.get(), bc4.get(), boundary.get(), - iBsplineOffset, iParameters, iStream); + + // TODO: Need to be fixed !!!!!!!!!!1 + +// getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), +// params, bc1.get(), bc2.get(), bc3.get(), bc4.get(), boundary.get(), +// iBsplineOffset, iParameters, iStream); std::cout << "1: " << ct.microseconds() - start << std::endl; runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream); std::cout << "2: " << ct.microseconds() - start << std::endl; @@ -350,6 +357,33 @@ template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfR template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); +auto transferSpline(BsplineParams &aParams) { + ScopedCudaMemHandler bc1(aParams.bc1.get(), aParams.k0); + ScopedCudaMemHandler bc2(aParams.bc2.get(), aParams.k0); + ScopedCudaMemHandler bc3(aParams.bc3.get(), aParams.k0); + ScopedCudaMemHandler bc4(aParams.bc4.get(), aParams.k0); + + return std::pair { + BsplineParamsCuda { + bc1.get(), + bc2.get(), + bc3.get(), + bc4.get(), + aParams.k0, + aParams.b1, + aParams.b2, + aParams.norm_factor + }, + + BsplineParamsCudaMemoryHandlers { + std::move(bc1), + std::move(bc2), + std::move(bc3), + std::move(bc4) + } + }; +} + template void cudaFilterBsplineFull(PixelData &input, float lambda, float tolerance, TypeOfRecBsplineFlags flags, int maxFilterLen) { cudaStream_t aStream = 0; @@ -361,29 +395,23 @@ void cudaFilterBsplineFull(PixelData &input, float lambda, float tolera timer.start_timer("GpuDeviceTimeFull"); if (flags & BSPLINE_Y_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.y_num, lambda, tolerance, maxFilterLen); - ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); - ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); - ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); - ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); + auto cuda = transferSpline(p); + auto splineCuda = cuda.first; int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * input.x_num * 
input.z_num; ScopedCudaMemHandler boundary(nullptr, boundaryLen); // allocate memory on device - runBsplineYdir(cudaInput.get(), input.getDimension(), bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, boundary.get(), aStream); + runBsplineYdir(cudaInput.get(), input.getDimension(), splineCuda, boundary.get(), aStream); } if (flags & BSPLINE_X_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.x_num, lambda, tolerance, maxFilterLen); - ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); - ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); - ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); - ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); - runBsplineXdir(cudaInput.get(), input.getDimension(), bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream); + auto cuda = transferSpline(p); + auto splineCuda = cuda.first; + runBsplineXdir(cudaInput.get(), input.getDimension(), splineCuda, aStream); } if (flags & BSPLINE_Z_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.z_num, lambda, tolerance, maxFilterLen); - ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); - ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); - ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); - ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); - runBsplineZdir(cudaInput.get(), input.getDimension(), bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream); + auto cuda = transferSpline(p); + auto splineCuda = cuda.first; + runBsplineZdir(cudaInput.get(), input.getDimension(), splineCuda, aStream); } timer.stop_timer(); } @@ -421,6 +449,8 @@ void computeLevelsCuda(const PixelData &grad_temp, PixelData & // explicit instantiation of handled types template void getGradient(PixelData &, PixelData &, PixelData &, PixelData &, float, const APRParameters &); +template void getGradient(PixelData &, PixelData &, PixelData &, PixelData &, float, const APRParameters &); + template void getGradient(PixelData &image, PixelData &grad_temp, PixelData &local_scale_temp, PixelData &local_scale_temp2, float bspline_offset, const APRParameters &par) { ScopedCudaMemHandler, D2H | H2D> cudaImage(image); @@ -428,21 +458,30 @@ void getGradient(PixelData &image, PixelData &grad_temp, Pixel ScopedCudaMemHandler, D2H> cudalocal_scale_temp(local_scale_temp); ScopedCudaMemHandler, D2H> cudalocal_scale_temp2(local_scale_temp2); + int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * image.x_num * image.z_num; + ScopedCudaMemHandler boundary(nullptr, boundaryLen); + float tolerance = 0.0001; + + // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. // Should be fixed when other parts of pipeline are ready. - BsplineParams p = prepareBsplineStuff(image.x_num, par.lambda, tolerance); - ScopedCudaMemHandler bc1 (p.bc1.get(), p.k0); - ScopedCudaMemHandler bc2 (p.bc2.get(), p.k0); - ScopedCudaMemHandler bc3 (p.bc3.get(), p.k0); - ScopedCudaMemHandler bc4 (p.bc4.get(), p.k0); - int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * image.x_num * image.z_num; - ScopedCudaMemHandler boundary(nullptr, boundaryLen); + // FIX BSPLINE PARAMS !!!!!!!! 
to get full gradient pipeline test working !!!!!!!!!!!!!!!!!!!!!!!!!1 + + + BsplineParams px = prepareBsplineStuff(image.x_num, par.lambda, tolerance); + auto cudax = transferSpline(px); + auto splineCudaX = cudax.first; + BsplineParams py = prepareBsplineStuff(image.y_num, par.lambda, tolerance); + auto cuday = transferSpline(py); + auto splineCudaY = cuday.first; + BsplineParams pz = prepareBsplineStuff(image.z_num, par.lambda, tolerance); + auto cudaz = transferSpline(pz); + auto splineCudaZ = cudaz.first; getGradientCuda(image, local_scale_temp, cudaImage.get(), cudaGrad.get(), cudalocal_scale_temp.get(), - p, bc1.get(), bc2.get(), bc3.get(), bc4.get(), boundary.get(), - bspline_offset, par, 0); + splineCudaX, splineCudaY, splineCudaZ, boundary.get(), bspline_offset, par, 0); } // explicit instantiation of handled types diff --git a/src/algorithm/bsplineParams.h b/src/algorithm/bsplineParams.h new file mode 100644 index 00000000..44dbd1c1 --- /dev/null +++ b/src/algorithm/bsplineParams.h @@ -0,0 +1,19 @@ +#ifndef APR_BSPLINEPARAMS_H +#define APR_BSPLINEPARAMS_H + + +#include + + +struct BsplineParamsCuda { + float *bc1; + float *bc2; + float *bc3; + float *bc4; + size_t k0; + float b1; + float b2; + float norm_factor; +}; + +#endif //APR_BSPLINEPARAMS_H diff --git a/src/algorithm/bsplineXdir.cuh b/src/algorithm/bsplineXdir.cuh index 6ee3c755..89fd3fc6 100644 --- a/src/algorithm/bsplineXdir.cuh +++ b/src/algorithm/bsplineXdir.cuh @@ -6,6 +6,7 @@ #include #include #include "cudaMisc.cuh" +#include "bsplineParams.h" /** * Runs bspline recursive filter in X direction. Each processed 2D patch consist of number of workers @@ -60,15 +61,13 @@ * @param norm_factor - filter norm factor */ template -__global__ void bsplineXdir(T *image, PixelDataDim dim, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, - float b1, float b2, float norm_factor, bool *error) { +__global__ void bsplineXdir(T *image, PixelDataDim dim, BsplineParamsCuda p, bool *error) { const int yDirOffset = blockIdx.y * blockDim.y + threadIdx.y; const size_t zDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * dim.x * dim.y; const size_t nextElementXdirOffset = dim.y; const size_t dirLen = dim.x; - const size_t minLen = min(dirLen, k0); + const size_t minLen = min(dirLen, p.k0); if (yDirOffset < dim.y) { float temp1 = 0; @@ -79,11 +78,11 @@ __global__ void bsplineXdir(T *image, PixelDataDim dim, // calculate boundary values for (int k = 0; k < minLen; ++k) { T val = image[zDirOffset + k * nextElementXdirOffset + yDirOffset]; - temp1 += bc1[k] * val; - temp2 += bc2[k] * val; + temp1 += p.bc1[k] * val; + temp2 += p.bc2[k] * val; val = image[zDirOffset + (dirLen - 1 - k) * nextElementXdirOffset + yDirOffset]; - temp3 += bc3[k] * val; - temp4 += bc4[k] * val; + temp3 += p.bc3[k] * val; + temp4 += p.bc4[k] * val; } size_t errorCnt = 0; @@ -91,15 +90,15 @@ __global__ void bsplineXdir(T *image, PixelDataDim dim, // set boundary values in two first and two last points processed direction image[zDirOffset + 0 * nextElementXdirOffset + yDirOffset] = round(temp1, errorCnt); image[zDirOffset + 1 * nextElementXdirOffset + yDirOffset] = round(temp2, errorCnt); - image[zDirOffset + (dirLen - 2) * nextElementXdirOffset + yDirOffset] = round(temp3 * norm_factor, errorCnt); - image[zDirOffset + (dirLen - 1) * nextElementXdirOffset + yDirOffset] = round(temp4 * norm_factor, errorCnt); + image[zDirOffset + (dirLen - 2) * nextElementXdirOffset + yDirOffset] = round(temp3 * p.norm_factor, errorCnt); + 
image[zDirOffset + (dirLen - 1) * nextElementXdirOffset + yDirOffset] = round(temp4 * p.norm_factor, errorCnt); // Causal Filter loop int64_t offset = zDirOffset + 2 * nextElementXdirOffset + yDirOffset; int64_t offsetLimit = zDirOffset + (dirLen - 2) * nextElementXdirOffset; while (offset < offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = round(image[offset] + b1 * temp2 + b2 * temp1, errorCnt); + const float temp = round(image[offset] + p.b1 * temp2 + p.b2 * temp1, errorCnt); image[offset] = temp; temp1 = temp2; temp2 = temp; @@ -112,8 +111,8 @@ __global__ void bsplineXdir(T *image, PixelDataDim dim, offsetLimit = zDirOffset; while (offset >= offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = image[offset] + b1 * temp3 + b2 * temp4; - image[offset] = round(temp * norm_factor, errorCnt); + const float temp = image[offset] + p.b1 * temp3 + p.b2 * temp4; + image[offset] = round(temp * p.norm_factor, errorCnt); temp4 = temp3; temp3 = temp; @@ -128,9 +127,7 @@ __global__ void bsplineXdir(T *image, PixelDataDim dim, * Function for launching a kernel */ template -void runBsplineXdir(T *cudaImage, PixelDataDim dim, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, - size_t k0, float b1, float b2, float norm_factor, cudaStream_t aStream) { +void runBsplineXdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaStream_t aStream) { constexpr int numOfWorkersYdir = 128; dim3 threadsPerBlockX(1, numOfWorkersYdir, 1); dim3 numBlocksX(1, @@ -141,7 +138,7 @@ void runBsplineXdir(T *cudaImage, PixelDataDim dim, bool isErrorDetected = false; { ScopedCudaMemHandler error(&isErrorDetected, 1); - bsplineXdir <<>>(cudaImage, dim, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor, error.get()); + bsplineXdir <<>>(cudaImage, dim, p, error.get()); } if (isErrorDetected) { diff --git a/src/algorithm/bsplineYdir.cuh b/src/algorithm/bsplineYdir.cuh index 1a0986d1..e8aa5bdf 100644 --- a/src/algorithm/bsplineYdir.cuh +++ b/src/algorithm/bsplineYdir.cuh @@ -6,6 +6,7 @@ #include #include #include "cudaMisc.cuh" +#include "bsplineParams.h" /** @@ -59,9 +60,7 @@ template -__global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, - const float *bc1_vec, const float *bc2_vec, const float *bc3_vec, const float *bc4_vec, - size_t k0, float norm_factor, float *boundary, bool *error) { +__global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, BsplineParamsCuda p, float *boundary, bool *error) { const int xzIndexOfWorker = (blockIdx.x * blockDim.x) + threadIdx.x; const int xzIndexOfBlock = (blockIdx.x * blockDim.x); @@ -72,23 +71,23 @@ __global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, const int64_t maxXZoffset = dim.x * dim.z; const size_t dirLen = dim.y; - const size_t minLen = min(dirLen, k0); + const size_t minLen = min(dirLen, p.k0); extern __shared__ float sharedMem[]; float *bc1_vec2 = &sharedMem[0]; - float *bc2_vec2 = &bc1_vec2[k0]; - float *cache = (float*)&bc2_vec2[k0]; + float *bc2_vec2 = &bc1_vec2[p.k0]; + float *cache = (float*)&bc2_vec2[p.k0]; // Read from global mem to cache - for (int i = currentWorkerId; i < k0 * numOfWorkers; i += numOfWorkers) { - if (i < k0) { - bc1_vec2[i] = bc1_vec[i]; - bc2_vec2[i] = bc2_vec[i]; + for (int i = currentWorkerId; i < p.k0 * numOfWorkers; i += numOfWorkers) { + if (i < p.k0) { + bc1_vec2[i] = p.bc1[i]; + bc2_vec2[i] = p.bc2[i]; } - int offs = i % k0; - int work = i / k0; + int offs = i % p.k0; + int work = 
i / p.k0; if (work + xzIndexOfBlock < maxXZoffset) { - cache[work * k0 + offs] = image[workersOffset + dim.y * work + offs]; + cache[work * p.k0 + offs] = image[workersOffset + dim.y * work + offs]; } } __syncthreads(); @@ -98,8 +97,8 @@ __global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, float temp1 = 0; float temp2 = 0; for (size_t k = 0; k < minLen; ++k) { - temp1 += bc1_vec2[k] * (T)cache[currentWorkerId * k0 + k]; - temp2 += bc2_vec2[k] * (T)cache[currentWorkerId * k0 + k]; + temp1 += bc1_vec2[k] * (T)cache[currentWorkerId * p.k0 + k]; + temp2 += bc2_vec2[k] * (T)cache[currentWorkerId * p.k0 + k]; } boundary[xzIndexOfWorker*4 + 0] = temp1; boundary[xzIndexOfWorker*4 + 1] = temp2; @@ -108,15 +107,15 @@ __global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, // ----------------- second end __syncthreads(); - for (int i = currentWorkerId; i < k0 * numOfWorkers; i += numOfWorkers) { - if (i < k0) { - bc1_vec2[i] = bc3_vec[i]; - bc2_vec2[i] = bc4_vec[i]; + for (int i = currentWorkerId; i < p.k0 * numOfWorkers; i += numOfWorkers) { + if (i < p.k0) { + bc1_vec2[i] = p.bc3[i]; + bc2_vec2[i] = p.bc4[i]; } - int offs = i % k0; - int work = i / k0; + int offs = i % p.k0; + int work = i / p.k0; if (work + xzIndexOfBlock < maxXZoffset) { - cache[work * k0 + offs] = image[workersOffset + dim.y * work + dim.y - 1 - offs]; + cache[work * p.k0 + offs] = image[workersOffset + dim.y * work + dim.y - 1 - offs]; } } __syncthreads(); @@ -128,11 +127,11 @@ __global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, float temp3 = 0; float temp4 = 0; for (size_t k = 0; k < minLen; ++k) { - temp3 += bc1_vec2[k] * (T)cache[currentWorkerId * k0 + k]; - temp4 += bc2_vec2[k] * (T)cache[currentWorkerId * k0 + k]; + temp3 += bc1_vec2[k] * (T)cache[currentWorkerId * p.k0 + k]; + temp4 += bc2_vec2[k] * (T)cache[currentWorkerId * p.k0 + k]; } - boundary[xzIndexOfWorker*4 + 2] = round(temp3 * norm_factor, errorCnt); - boundary[xzIndexOfWorker*4 + 3] = round(temp4 * norm_factor, errorCnt); + boundary[xzIndexOfWorker*4 + 2] = round(temp3 * p.norm_factor, errorCnt); + boundary[xzIndexOfWorker*4 + 3] = round(temp4 * p.norm_factor, errorCnt); } if (errorCnt > 0) *error = true; @@ -142,8 +141,7 @@ constexpr int blockWidth = 32; constexpr int numOfThreads = 32; extern __shared__ char sharedMemProcess[]; template -__global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, size_t k0, - const float b1, const float b2, const float norm_factor, float *boundary, bool *error) { +__global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, BsplineParamsCuda p, float *boundary, bool *error) { const int numOfWorkers = blockDim.x; const int currentWorkerId = threadIdx.x; const int xzOffset = blockIdx.x * blockDim.x; @@ -177,7 +175,7 @@ __global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, size_t k0, cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = temp2; } for (size_t k = yBlockBegin == 0 ? 
2 : 0; k < blockWidth && k + yBlockBegin < dim.y - 2; ++k) { - float temp = temp2*b1 + temp1*b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; + float temp = temp2*p.b1 + temp1*p.b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp; temp1 = temp2; temp2 = temp; @@ -212,14 +210,14 @@ __global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, size_t k0, // Do operations if (xzOffset + currentWorkerId < maxXZoffset) { if (yBlockBegin == dim.y - 1) { - temp1 = boundary[(xzOffset + currentWorkerId) * 4 + 3] / norm_factor; - temp2 = boundary[(xzOffset + currentWorkerId) * 4 + 2] / norm_factor; - cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = norm_factor * temp1; - cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = norm_factor * temp2; + temp1 = boundary[(xzOffset + currentWorkerId) * 4 + 3] / p.norm_factor; + temp2 = boundary[(xzOffset + currentWorkerId) * 4 + 2] / p.norm_factor; + cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = p.norm_factor * temp1; + cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = p.norm_factor * temp2; } for (int64_t k = yBlockBegin == dim.y - 1 ? 2 : 0; k < blockWidth && yBlockBegin - k >= 0; ++k) { - float temp = temp2*b1 + temp1*b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; - cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp * norm_factor; + float temp = temp2*p.b1 + temp1*p.b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; + cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp * p.norm_factor; temp1 = temp2; temp2 = temp; } @@ -244,18 +242,17 @@ __global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, size_t k0, * Function for launching a kernel */ template -void runBsplineYdir(T *cudaImage, PixelDataDim dim, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, - size_t k0, float b1, float b2, float norm_factor, float *boundary, cudaStream_t aStream) { +void runBsplineYdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, float *boundary, cudaStream_t aStream) { + dim3 threadsPerBlock(numOfThreads); dim3 numBlocks((dim.x * dim.z + threadsPerBlock.x - 1) / threadsPerBlock.x); - size_t sharedMemSize = (2 /*bc vectors*/) * (k0) * sizeof(float) + numOfThreads * (k0) * sizeof(float); + size_t sharedMemSize = (2 /*bc vectors*/) * (p.k0) * sizeof(float) + numOfThreads * (p.k0) * sizeof(float); bool isErrorDetected = false; { ScopedCudaMemHandler error(&isErrorDetected, 1); - bsplineYdirBoundary <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, bc1, bc2, bc3,bc4, k0, norm_factor, boundary, error.get()); + bsplineYdirBoundary <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get()); sharedMemSize = numOfThreads * blockWidth * sizeof(float); - bsplineYdirProcess <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, k0, b1, b2, norm_factor, boundary, error.get()); + bsplineYdirProcess <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get()); } if (isErrorDetected) { diff --git a/src/algorithm/bsplineZdir.cuh b/src/algorithm/bsplineZdir.cuh index cd59f0fb..c8ba6688 100644 --- a/src/algorithm/bsplineZdir.cuh +++ b/src/algorithm/bsplineZdir.cuh @@ -2,10 +2,11 @@ #define BSPLINE_Z_DIR_H -#include "cudaMisc.cuh" #include #include #include +#include "cudaMisc.cuh" +#include "bsplineParams.h" /** @@ -62,15 +63,13 @@ * 
@param norm_factor - filter norm factor */ template -__global__ void bsplineZdir(T *image, PixelDataDim dim, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, - float b1, float b2, float norm_factor, bool *error) { +__global__ void bsplineZdir(T *image, PixelDataDim dim, BsplineParamsCuda p, bool *error) { const int yDirOffset = blockIdx.y * blockDim.y + threadIdx.y; const size_t xDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * dim.y; // x is in 'z' to have good memory coalescing const size_t nextElementZdirOffset = dim.x * dim.y; const size_t dirLen = dim.z; - const size_t minLen = min(dirLen, k0); + const size_t minLen = min(dirLen, p.k0); if (yDirOffset < dim.y) { float temp1 = 0; @@ -81,11 +80,11 @@ __global__ void bsplineZdir(T *image, PixelDataDim dim, // calculate boundary values for (int k = 0; k < minLen; ++k) { T val = image[xDirOffset + k * nextElementZdirOffset + yDirOffset]; - temp1 += bc1[k] * val; - temp2 += bc2[k] * val; + temp1 += p.bc1[k] * val; + temp2 += p.bc2[k] * val; val = image[xDirOffset + (dirLen - 1 - k) * nextElementZdirOffset + yDirOffset]; - temp3 += bc3[k] * val; - temp4 += bc4[k] * val; + temp3 += p.bc3[k] * val; + temp4 += p.bc4[k] * val; } size_t errorCnt = 0; @@ -93,15 +92,15 @@ __global__ void bsplineZdir(T *image, PixelDataDim dim, // set boundary values in two first and two last points processed direction image[xDirOffset + 0 * nextElementZdirOffset + yDirOffset] = round(temp1, errorCnt); image[xDirOffset + 1 * nextElementZdirOffset + yDirOffset] = round(temp2, errorCnt); - image[xDirOffset + (dirLen - 2) * nextElementZdirOffset + yDirOffset] = round(temp3 * norm_factor, errorCnt); - image[xDirOffset + (dirLen - 1) * nextElementZdirOffset + yDirOffset] = round(temp4 * norm_factor, errorCnt); + image[xDirOffset + (dirLen - 2) * nextElementZdirOffset + yDirOffset] = round(temp3 * p.norm_factor, errorCnt); + image[xDirOffset + (dirLen - 1) * nextElementZdirOffset + yDirOffset] = round(temp4 * p.norm_factor, errorCnt); // Causal Filter loop int64_t offset = xDirOffset + 2 * nextElementZdirOffset + yDirOffset; int64_t offsetLimit = xDirOffset + (dirLen - 2) * nextElementZdirOffset; while (offset < offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = round(image[offset] + b1 * temp2 + b2 * temp1, errorCnt); + const float temp = round(image[offset] + p.b1 * temp2 + p.b2 * temp1, errorCnt); image[offset] = temp; temp1 = temp2; temp2 = temp; @@ -114,8 +113,8 @@ __global__ void bsplineZdir(T *image, PixelDataDim dim, offsetLimit = xDirOffset; while (offset >= offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = image[offset] + b1 * temp3 + b2 * temp4; - image[offset] = round(temp * norm_factor, errorCnt); + const float temp = image[offset] + p.b1 * temp3 + p.b2 * temp4; + image[offset] = round(temp * p.norm_factor, errorCnt); temp4 = temp3; temp3 = temp; @@ -130,9 +129,7 @@ __global__ void bsplineZdir(T *image, PixelDataDim dim, * Function for launching a kernel */ template -void runBsplineZdir(T *cudaImage, PixelDataDim dim, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, - size_t k0, float b1, float b2, float norm_factor, cudaStream_t aStream) { +void runBsplineZdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaStream_t aStream) { constexpr int numOfWorkersYdir = 128; dim3 threadsPerBlockZ(1, numOfWorkersYdir, 1); dim3 numBlocksZ(1, @@ -143,7 +140,7 @@ void 
runBsplineZdir(T *cudaImage, PixelDataDim dim, bool isErrorDetected = false; { ScopedCudaMemHandler error(&isErrorDetected, 1); - bsplineZdir <<>> (cudaImage, dim, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor, error.get()); + bsplineZdir <<>> (cudaImage, dim, p, error.get()); } if (isErrorDetected) { diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index 931b95a3..13264ec4 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -950,15 +950,16 @@ void downsample(const PixelData &aInput, PixelData &aOutput, R reduce, C c const size_t shy = std::min(2*y + 1, y_num - 1); const size_t idx = z * x_num_ds * y_num_ds + x * y_num_ds + y; outMesh[idx] = constant_operator( - reduce(reduce(reduce(reduce(reduce(reduce(reduce( // inMesh coordinates + reduce(reduce(reduce(reduce( // inMesh coordinates inMesh[2*z * x_num * y_num + 2*x * y_num + 2*y], // z, x, y - inMesh[2*z * x_num * y_num + 2*x * y_num + shy]), // z, x, y+1 inMesh[2*z * x_num * y_num + shx * y_num + 2*y]), // z, x+1, y - inMesh[2*z * x_num * y_num + shx * y_num + shy]), // z, x+1, y+1 inMesh[shz * x_num * y_num + 2*x * y_num + 2*y]), // z+1, x, y - inMesh[shz * x_num * y_num + 2*x * y_num + shy]), // z+1, x, y+1 inMesh[shz * x_num * y_num + shx * y_num + 2*y]), // z+1, x+1, y - inMesh[shz * x_num * y_num + shx * y_num + shy]) // z+1, x+1, y+1 + reduce(reduce(reduce( + inMesh[2*z * x_num * y_num + 2*x * y_num + shy], // z, x, y+1 + inMesh[2*z * x_num * y_num + shx * y_num + shy]), // z, x+1, y+1 + inMesh[shz * x_num * y_num + 2*x * y_num + shy]), // z+1, x, y+1 + inMesh[shz * x_num * y_num + shx * y_num + shy])) // z+1, x+1, y+1 ); } } diff --git a/src/data_structures/Mesh/downsample.cuh b/src/data_structures/Mesh/downsample.cuh index 947db945..a6548a52 100644 --- a/src/data_structures/Mesh/downsample.cuh +++ b/src/data_structures/Mesh/downsample.cuh @@ -24,14 +24,14 @@ __global__ void downsampleMean(const T *input, S *output, size_t x_num, size_t y size_t idx = (zi * x_num + xi) * y_num + yi; // Go through all elements in 2x2 - T v = input[idx]; + S v = input[idx]; v += input[idx + xs * y_num]; v += input[idx + zs * x_num * y_num]; v += input[idx + xs * y_num + zs * x_num * y_num]; // Get data from odd thread to even one const int workerIdx = threadIdx.y; - T a = __shfl_sync(__activemask(), v, workerIdx + 1); + S a = __shfl_sync(__activemask(), v, workerIdx + 1); // downsampled dimensions twice smaller (rounded up) diff --git a/src/misc/CudaTools.cuh b/src/misc/CudaTools.cuh index 3f9b5fca..558f730a 100644 --- a/src/misc/CudaTools.cuh +++ b/src/misc/CudaTools.cuh @@ -8,16 +8,25 @@ #include #include -//#include #include -//#include - - #include #include + #include "data_structures/Mesh/PixelData.hpp" +#define checkCuda(ans) { cudaAssert((ans), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ +#if defined(DEBUG) || defined(_DEBUG) + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: (%d) %s %s %d\n", code, cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +#endif +} + inline void waitForCuda() { cudaDeviceSynchronize(); cudaError_t err = cudaGetLastError(); @@ -211,6 +220,17 @@ public: initialize(); } + ScopedCudaMemHandler (ScopedCudaMemHandler &&obj) { + iData = obj.iData; + obj.iData = nullptr; + iSize = obj.iSize; + obj.iSize = 0; + iBytes = obj.iBytes; + obj.iBytes = 0; + iStream = obj.iStream; + obj.iStream = nullptr; + iCudaMemory = 
std::move(obj.iCudaMemory); + } ~ScopedCudaMemHandler() { if (DIRECTION & D2H) { @@ -223,15 +243,21 @@ public: size_t getNumOfBytes() const {return iBytes; } void copyH2D() { - cudaMemcpyAsync(iCudaMemory.get(), iData, iBytes, cudaMemcpyHostToDevice, iStream); + if (iData != nullptr) { + checkCuda(cudaMemcpyAsync(iCudaMemory.get(), iData, iBytes, cudaMemcpyHostToDevice, iStream)); + } } void copyH2D(const size_t numElements) { - cudaMemcpyAsync(iCudaMemory.get(), iData, numElements*DataSize, cudaMemcpyHostToDevice, iStream); + if (iData != nullptr) { + checkCuda(cudaMemcpyAsync(iCudaMemory.get(), iData, numElements*DataSize, cudaMemcpyHostToDevice, iStream)); + } } void copyD2H() { - cudaMemcpyAsync((void*)iData, iCudaMemory.get(), iBytes, cudaMemcpyDeviceToHost, iStream); + if (iData != nullptr) { + checkCuda(cudaMemcpyAsync((void *) iData, iCudaMemory.get(), iBytes, cudaMemcpyDeviceToHost, iStream)); + } } private: @@ -240,7 +266,7 @@ private: void initialize() { ElementType *mem = nullptr; - cudaMalloc(&mem, iBytes); + checkCuda(cudaMalloc(&mem, iBytes)); iCudaMemory.reset(mem); if (DIRECTION & H2D) { copyH2D(); diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index d7fc6e62..8bb06106 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ -279,7 +279,7 @@ namespace { // Downsampled gradient // ======================================================================== - TEST(ComputeGradientTest, GPU_VS_CPU_ON_RANDOM_VALUES) { + TEST(ComputeGradientTest, GPU_VS_CPU_DOWNSAMPLE_GRADIENT_ON_RANDOM_VALUES) { APRTimer timer(false); // Generate random mesh @@ -305,6 +305,60 @@ namespace { } + // ======================================================================== + // Full pipeline/gradient tests + // ======================================================================== + + TEST(ComputeThreshold, FULL_GRADIENT_TEST) { + APRTimer timer(false); + + // Generate random mesh + using ImageType = uint16_t; + PixelData input_image = getRandInitializedMesh(11, 13, 15, 15, 20); + PixelData &image_temp = input_image; + + PixelData grad_temp; // should be a down-sampled image + grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); + PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + + PixelData grad_temp_GPU; // should be a down-sampled image + grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); + PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, true); + PixelData local_scale_temp2_GPU; + local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate bspline on CPU + PixelData mCpuImage(image_temp, true); + + ComputeGradient computeGradient; + + timer.start_timer(">>>>>>>>>>>>>>>>> CPU gradient"); + computeGradient.get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + 
timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpuImage(image_temp, true); + timer.start_timer(">>>>>>>>>>>>>>>>> GPU gradient"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpuImage, mGpuImage, 0.0000001), 0); + EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0.0000001), 0); + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0.0000001), 0); + } + #endif // APR_USE_CUDA } diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index d94f74c0..527815f0 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -701,58 +701,9 @@ namespace { EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); } - // TODO: These two test will be fixed as soon as CUDA pipeline is updated. - // Currently turning them off to have testable rest of CUDA impl. -// TEST(ComputeThreshold, FULL_GRADIENT_TEST) { -// APRTimer timer(true); -// -// // Generate random mesh -// using ImageType = float; -// PixelData input_image = getRandInitializedMesh(310, 330, 13, 25); -// PixelData &image_temp = input_image; -// -// PixelData grad_temp; // should be a down-sampled image -// grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); -// PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors -// local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// PixelData local_scale_temp2; -// local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// -// PixelData grad_temp_GPU; // should be a down-sampled image -// grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); -// PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors -// local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, true); -// PixelData local_scale_temp2_GPU; -// local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// -// APRParameters par; -// par.lambda = 3; -// par.Ip_th = 10; -// par.dx = 1; -// par.dy = 1; -// par.dz = 1; -// -// // Calculate bspline on CPU -// PixelData mCpuImage(image_temp, true); -// -// ComputeGradient computeGradient; -// -// timer.start_timer(">>>>>>>>>>>>>>>>> CPU gradient"); -// computeGradient.get_gradient(mCpuImage, grad_temp, local_scale_temp, par); -// timer.stop_timer(); -// -// // Calculate bspline on GPU -// PixelData mGpuImage(image_temp, true); -// timer.start_timer(">>>>>>>>>>>>>>>>> GPU gradient"); -// getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); -// timer.stop_timer(); -// -// // Compare GPU vs CPU -// EXPECT_EQ(compareMeshes(mCpuImage, mGpuImage), 0); -// EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0.1), 0); -// EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU), 0); -// } -// + + // TODO: This test will be fixed as soon as CUDA pipeline is updated. + // Currently turning it off to have testable rest of CUDA impl. 
// TEST(ComputeThreshold, FULL_PIPELINE_TEST) { // APRTimer timer(true); // From d958161cf3f31289f1f83fab86195bd0aa2ae2ec Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 10 Aug 2022 10:53:44 +0200 Subject: [PATCH 07/80] GPU and CPU give same results in Release mode - turned off unsafe optimizations --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c4912458..4513e07f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,14 +170,14 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 ") if(CMAKE_COMPILER_IS_GNUCC) - set(CMAKE_CXX_FLAGS_RELEASE "-O4 -ffast-math") + set(CMAKE_CXX_FLAGS_RELEASE "-O4 -ffast-math -fno-unsafe-math-optimizations") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -pedantic") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Bdynamic") if(NOT WIN32) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ldl -lz") endif() elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -ffast-math") + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -ffast-math -fno-unsafe-math-optimizations") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -pedantic") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lz") endif() @@ -207,8 +207,8 @@ if(APR_USE_CUDA) message(STATUS "APR: Building CUDA for APR") set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_RUNTIME_LIBRARY "Static") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --default-stream per-thread -Xptxas -v -DAPR_USE_CUDA") - set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math") # -lineinfo for profiling + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=false --default-stream per-thread -Xptxas -v -DAPR_USE_CUDA") + set(CMAKE_CUDA_FLAGS_RELEASE "-O3") # -lineinfo for profiling set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g -G") if(APR_BENCHMARK) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DAPR_BENCHMARK") From 4ace2385c4d3be68c6cab93da3d597cf3d3e618b Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 10 Aug 2022 16:10:57 +0200 Subject: [PATCH 08/80] Quick fix of processOnGpu() - now it gets correct bspline data for each direction --- src/algorithm/ComputeGradientCuda.cu | 75 ++++++++++++++++------------ 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 99f28558..0a6e5507 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -129,6 +129,33 @@ namespace { norm_factor }; } + + auto transferSpline(BsplineParams &aParams) { + ScopedCudaMemHandler bc1(aParams.bc1.get(), aParams.k0); + ScopedCudaMemHandler bc2(aParams.bc2.get(), aParams.k0); + ScopedCudaMemHandler bc3(aParams.bc3.get(), aParams.k0); + ScopedCudaMemHandler bc4(aParams.bc4.get(), aParams.k0); + + return std::pair { + BsplineParamsCuda { + bc1.get(), + bc2.get(), + bc3.get(), + bc4.get(), + aParams.k0, + aParams.b1, + aParams.b2, + aParams.norm_factor + }, + + BsplineParamsCudaMemoryHandlers { + std::move(bc1), + std::move(bc2), + std::move(bc3), + std::move(bc4) + } + }; + } } /** @@ -297,11 +324,22 @@ public: CurrentTime ct; uint64_t start = ct.microseconds(); - // TODO: Need to be fixed !!!!!!!!!!1 - -// getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), -// params, bc1.get(), bc2.get(), bc3.get(), bc4.get(), boundary.get(), -// iBsplineOffset, iParameters, iStream); + // TODO: temporarily bspline params are generated here + // In principle this is OK and correct but would be faster (for processing
series of same size images) if + // they would be calculated in constructor of GpuProcessingTaskImpl class (once). + BsplineParams px = prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance); + auto cudax = transferSpline(px); + auto splineCudaX = cudax.first; + BsplineParams py = prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance); + auto cuday = transferSpline(py); + auto splineCudaY = cuday.first; + BsplineParams pz = prepareBsplineStuff(iCpuImage.z_num, iParameters.lambda, tolerance); + auto cudaz = transferSpline(pz); + auto splineCudaZ = cudaz.first; + + getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), + splineCudaX, splineCudaY, splineCudaZ, boundary.get(), + iBsplineOffset, iParameters, iStream); std::cout << "1: " << ct.microseconds() - start << std::endl; runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream); std::cout << "2: " << ct.microseconds() - start << std::endl; @@ -357,32 +395,7 @@ template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfR template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); -auto transferSpline(BsplineParams &aParams) { - ScopedCudaMemHandler bc1(aParams.bc1.get(), aParams.k0); - ScopedCudaMemHandler bc2(aParams.bc2.get(), aParams.k0); - ScopedCudaMemHandler bc3(aParams.bc3.get(), aParams.k0); - ScopedCudaMemHandler bc4(aParams.bc4.get(), aParams.k0); - - return std::pair { - BsplineParamsCuda { - bc1.get(), - bc2.get(), - bc3.get(), - bc4.get(), - aParams.k0, - aParams.b1, - aParams.b2, - aParams.norm_factor - }, - - BsplineParamsCudaMemoryHandlers { - std::move(bc1), - std::move(bc2), - std::move(bc3), - std::move(bc4) - } - }; -} + template void cudaFilterBsplineFull(PixelData &input, float lambda, float tolerance, TypeOfRecBsplineFlags flags, int maxFilterLen) { From b050e07d706f73f84b0452307693f42ebb1a39cb Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 14 Nov 2022 13:34:50 +0100 Subject: [PATCH 09/80] Added new test file for LIS CUDA, GPU now handles boundary (without padding), still float number differences between CPU and GPU --- src/algorithm/LocalIntensityScale.cu | 97 +++++-- src/algorithm/LocalIntensityScaleCuda.h | 3 +- test/CMakeLists.txt | 1 + test/LocalIntensityScaleCudaTest.cpp | 370 ++++++++++++++++++++++++ test/LocalIntensityScaleTest.cpp | 214 -------------- 5 files changed, 445 insertions(+), 240 deletions(-) create mode 100644 test/LocalIntensityScaleCudaTest.cpp diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 5539baef..a0a05c0e 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -37,7 +37,7 @@ * @param z_num */ template -__global__ void meanYdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num) { +__global__ void meanYdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num, bool boundaryReflect) { // NOTE: Block size in x/z direction must be 1 const size_t workersOffset = (blockIdx.z * x_num + blockIdx.x) * y_num; const int numOfWorkers = blockDim.y; @@ -53,20 +53,39 @@ __global__ void meanYdir(T *image, int offset, size_t x_num, size_t y_num, size_ while(workerOffset < y_num) { if (!waitForNextLoop) v = image[workersOffset + workerOffset]; bool waitForNextValues = (workerIdx + offsetInTheLoop) % numOfWorkers >= (numOfWorkers - 
offset); + + // Check if current value is one of the mirrored elements (boundary condition) + int numberOfMirrorLeft = offset - workerOffset; + int numberOfMirrorRight = workerOffset + offset - (y_num - 1); + if (boundaryReflect) { + if (numberOfMirrorLeft > 0 && workerOffset >= 1 && workerOffset <= numberOfMirrorLeft) {sum += v; ++countNumOfSumElements;} + if (numberOfMirrorRight > 0 && workerOffset < (y_num - 1) && workerOffset >= (y_num - 1 - numberOfMirrorRight)) {sum += v; ++countNumOfSumElements;} + } for (int off = 1; off <= offset; ++off) { T prevElement = __shfl_sync(active, v, workerIdx + blockDim.y - off, blockDim.y); T nextElement = __shfl_sync(active, v, workerIdx + off, blockDim.y); // LHS boundary check + don't add previous values if they were added in a previous loop execution if (workerOffset >= off && !waitForNextLoop) {sum += prevElement; ++countNumOfSumElements;} + // RHS boundary check + don't read next values since they are not read yet - if (!waitForNextValues && workerOffset + off < y_num) {sum += nextElement; ++countNumOfSumElements;} + if (!waitForNextValues && (workerOffset + off) < y_num) {sum += nextElement; ++countNumOfSumElements;} + + // boundary condition (mirroring) + if (boundaryReflect) { + int element = workerOffset + off; + if (numberOfMirrorLeft > 0 && element >= 1 && element <= numberOfMirrorLeft) {sum += nextElement; ++countNumOfSumElements;} + if (numberOfMirrorRight > 0 && element < (y_num - 1) && element >= (y_num - 1 - numberOfMirrorRight)) {sum += nextElement; ++countNumOfSumElements;} + element = workerOffset - off; + if (numberOfMirrorLeft > 0 && element >= 1 && element <= numberOfMirrorLeft) {sum += prevElement; ++countNumOfSumElements;} + if (numberOfMirrorRight > 0 && element < (y_num - 1) && element >= (y_num - 1 - numberOfMirrorRight)) {sum += prevElement; ++countNumOfSumElements;} + } } waitForNextLoop = waitForNextValues; if (!waitForNextLoop) { sum += v; image[workersOffset + workerOffset] = sum / countNumOfSumElements; - // workere is done with current element - move to next one + // worker is done with current element - move to next one sum = 0; countNumOfSumElements = 1; workerOffset += numOfWorkers; @@ -93,7 +112,7 @@ constexpr int NumberOfWorkers = 32; // Cannot be greater than 32 since there is * read/write operations for given element. 
*/ template -__global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num) { +__global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num, bool boundaryReflect = false) { const size_t workerOffset = blockIdx.y * blockDim.y + threadIdx.y + (blockIdx.z * blockDim.z + threadIdx.z) * y_num * x_num; const int workerYoffset = blockIdx.y * blockDim.y + threadIdx.y ; const int workerIdx = threadIdx.y; @@ -113,13 +132,19 @@ __global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_ // saturate cache with #offset elements since it will allow to calculate first element value on LHS float sum = 0; int count = 0; - while (count < offset) { + while (count <= offset) { T v = image[workerOffset + currElementOffset]; sum += v; data[count][workerIdx] = v; + if (boundaryReflect && count > 0) {data[2 * offset - count + 1][workerIdx] = v; sum += v;} currElementOffset += nextElementOffset; ++count; } + currElementOffset -= nextElementOffset; + --count; + if (boundaryReflect) { + count = divisor; + } // Pointer in circular buffer int beginPtr = offset; @@ -147,9 +172,17 @@ __global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_ } // Handle last #offset elements on RHS + int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; + while (saveElementOffset < currElementOffset) { - count = count - 1; + if (!boundaryReflect) count = count - 1; sum -= data[beginPtr][workerIdx]; + + if (boundaryReflect) { + sum += data[boundaryPtr][workerIdx]; + boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor; + } + image[workerOffset + saveElementOffset] = sum / count; beginPtr = (beginPtr + 1) % divisor; saveElementOffset += nextElementOffset; @@ -173,7 +206,7 @@ __global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_ * read/write operations for given element. 
*/ template -__global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num) { +__global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num, bool boundaryReflect = false) { const size_t workerOffset = blockIdx.y * blockDim.y + threadIdx.y + (blockIdx.z * blockDim.z + threadIdx.z) * y_num; // *.z is 'x' const int workerYoffset = blockIdx.y * blockDim.y + threadIdx.y ; const int workerIdx = threadIdx.y; @@ -193,13 +226,19 @@ __global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_ // saturate cache with #offset elements since it will allow to calculate first element value on LHS float sum = 0; int count = 0; - while (count < offset) { + while (count <= offset) { T v = image[workerOffset + currElementOffset]; sum += v; data[count][workerIdx] = v; + if (boundaryReflect && count > 0) {data[2 * offset - count + 1][workerIdx] = v; sum += v;} currElementOffset += nextElementOffset; ++count; } + currElementOffset -= nextElementOffset; + --count; + if (boundaryReflect) { + count = divisor; + } // Pointer in circular buffer int beginPtr = offset; @@ -227,9 +266,17 @@ __global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_ } // Handle last #offset elements on RHS + int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; + while (saveElementOffset < currElementOffset) { - count = count - 1; + if (!boundaryReflect) count = count - 1; sum -= data[beginPtr][workerIdx]; + + if (boundaryReflect) { + sum += data[boundaryPtr][workerIdx]; + boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor; + } + image[workerOffset + saveElementOffset] = sum / count; beginPtr = (beginPtr + 1) % divisor; saveElementOffset += nextElementOffset; @@ -238,48 +285,48 @@ __global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_ } template -void runMeanYdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) { +void runMeanYdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream, bool boundaryReflect) { dim3 threadsPerBlock(1, NumberOfWorkers, 1); dim3 numBlocks((x_num + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (z_num + threadsPerBlock.z - 1)/threadsPerBlock.z); - meanYdir<<>>(cudaImage, offset, x_num, y_num, z_num); + meanYdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); } template -void runMeanXdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) { +void runMeanXdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream, bool boundaryReflect) { dim3 threadsPerBlock(1, NumberOfWorkers, 1); dim3 numBlocks(1, (y_num + threadsPerBlock.y - 1) / threadsPerBlock.y, (z_num + threadsPerBlock.z - 1) / threadsPerBlock.z); // Shared memory size - it is able to keep filter len elements for each worker. 
const int sharedMemorySize = (offset * 2 + 1) * sizeof(float) * NumberOfWorkers; - meanXdir<<>>(cudaImage, offset, x_num, y_num, z_num); + meanXdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); } template -void runMeanZdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) { +void runMeanZdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream, bool boundaryReflect) { dim3 threadsPerBlock(1, NumberOfWorkers, 1); dim3 numBlocks(1, (y_num + threadsPerBlock.y - 1) / threadsPerBlock.y, (x_num + threadsPerBlock.x - 1) / threadsPerBlock.x); // intentionally here for better memory readings // Shared memory size - it is able to keep filter len elements for each worker. const int sharedMemorySize = (offset * 2 + 1) * sizeof(float) * NumberOfWorkers; - meanZdir<<>>(cudaImage, offset, x_num, y_num, z_num); + meanZdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); } template -void runMean(T *cudaImage, const PixelData &image, int offsetX, int offsetY, int offsetZ, TypeOfMeanFlags flags, cudaStream_t aStream) { +void runMean(T *cudaImage, const PixelData &image, int offsetX, int offsetY, int offsetZ, TypeOfMeanFlags flags, cudaStream_t aStream, bool boundaryReflect = false) { if (flags & MEAN_Y_DIR) { - runMeanYdir(cudaImage, offsetY, image.x_num, image.y_num, image.z_num, aStream); + runMeanYdir(cudaImage, offsetY, image.x_num, image.y_num, image.z_num, aStream, boundaryReflect); } if (flags & MEAN_X_DIR) { - runMeanXdir(cudaImage, offsetX, image.x_num, image.y_num, image.z_num, aStream); + runMeanXdir(cudaImage, offsetX, image.x_num, image.y_num, image.z_num, aStream, boundaryReflect); } if (flags & MEAN_Z_DIR) { - runMeanZdir(cudaImage, offsetZ, image.x_num, image.y_num, image.z_num, aStream); + runMeanZdir(cudaImage, offsetZ, image.x_num, image.y_num, image.z_num, aStream, boundaryReflect); } } @@ -347,9 +394,9 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete // --------- CUDA ---------------- runCopy1D(cudaImage, cudaTemp, image.mesh.size(), aStream); - runMean(cudaImage, image, win_x, win_y, win_z, MEAN_ALL_DIR, aStream); + runMean(cudaImage, image, win_x, win_y, win_z, MEAN_ALL_DIR, aStream, par.reflect_bc_lis); runAbsDiff1D(cudaImage, cudaTemp, image.mesh.size(), aStream); - runMean(cudaImage, image, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream); + runMean(cudaImage, image, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream, par.reflect_bc_lis); runRescaleAndThreshold(cudaImage, image.mesh.size(), var_rescale, par.sigma_th, par.sigma_th_max, aStream); } @@ -360,17 +407,17 @@ template void runLocalIntensityScalePipeline(const PixelData // =================================================== TEST helpers // TODO: should be moved somewhere template -void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags) { +void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags, bool boundaryReflect) { ScopedCudaMemHandler, H2D | D2H> cudaImage(image); APRTimer timer(true); timer.start_timer("GpuDeviceTimeFull"); - runMean(cudaImage.get(), image, offset, offset, offset, flags, 0); + runMean(cudaImage.get(), image, offset, offset, offset, flags, 0, boundaryReflect); timer.stop_timer(); } // explicit instantiation of handled types -template void calcMean(PixelData&, int, TypeOfMeanFlags); -template void calcMean(PixelData&, int, TypeOfMeanFlags); +template void calcMean(PixelData&, int, TypeOfMeanFlags, bool); +template void calcMean(PixelData&, int, 
TypeOfMeanFlags, bool); template diff --git a/src/algorithm/LocalIntensityScaleCuda.h b/src/algorithm/LocalIntensityScaleCuda.h index a635a156..135e5927 100644 --- a/src/algorithm/LocalIntensityScaleCuda.h +++ b/src/algorithm/LocalIntensityScaleCuda.h @@ -15,8 +15,9 @@ constexpr TypeOfMeanFlags MEAN_X_DIR = 0x02; constexpr TypeOfMeanFlags MEAN_Z_DIR = 0x04; constexpr TypeOfMeanFlags MEAN_ALL_DIR = MEAN_Y_DIR | MEAN_X_DIR | MEAN_Z_DIR; +// TODO: remember to revert by default boundaryReflect=true (or check with CPU code what is current 'default'). template -void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags = MEAN_ALL_DIR); +void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags = MEAN_ALL_DIR, bool boundaryReflect = false); template void getLocalIntensityScale(PixelData &image, PixelData &temp, const APRParameters &par); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ba468743..2918f2c5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -15,6 +15,7 @@ buildTarget(testPullingScheme PullingSchemeTest.cpp) if(APR_USE_CUDA) buildTarget(testAPRCuda APRTestCuda.cpp) buildTarget(testComputeGradientCuda ComputeGradientCudaTest.cpp) + buildTarget(testLocalIntensityScaleCuda LocalIntensityScaleCudaTest.cpp) endif() if(APR_BUILD_EXAMPLES) diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp new file mode 100644 index 00000000..6e2b722b --- /dev/null +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -0,0 +1,370 @@ + +#include + +#include "algorithm/LocalIntensityScaleCuda.h" +#include "algorithm/LocalIntensityScale.hpp" +#include "TestTools.hpp" + + +namespace { + +#ifdef APR_USE_CUDA + + // ------------------------------------------------------------------------ + // TODO: REMOVE IT after dev. 
+ // ------------------------------------------------------------------------ + TEST(LocalIntensityScaleCudaTest, REMOVE_ME_AFTER_DEVELOPMENT) { + int y_num = 2; + int x_num = 3; + int z_num = 2; + PixelData m(y_num, x_num, z_num, 0); + PixelData m2(y_num, x_num, z_num, 0); + PixelData m3(y_num, x_num, z_num,0); + float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9 ,10, 11, 12}; + + initFromZYXarray(m, dataIn); + initFromZYXarray(m2, dataIn); + initFromZYXarray(m3, dataIn); + LocalIntensityScale lis; + int off = 0; + lis.calc_sat_mean_x(m, off); + m.printMesh(1); + calcMean(m3, off, MEAN_X_DIR); + m3.printMesh(1); +// lis.calc_sat_mean_y(m2, off); +// m2.printMesh(1); + + + compareMeshes(m3, m, 0.00000001); + } + // ------------------------------------------------------------------------ + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Y_DIR) { + APRTimer timer(true); + PixelData m = getRandInitializedMesh(22, 33, 22, 100, 3); + + LocalIntensityScale lis; + for (int offset = 0; offset < 6; ++offset) { + + std::cout << " ============================== " << offset << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean Y-DIR"); + lis.calc_sat_mean_y(mCpu, offset); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Y-DIR"); + calcMean(mGpu, offset, MEAN_Y_DIR); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_X_DIR) { + APRTimer timer(true); + PixelData m = getRandInitializedMesh(22, 33, 22, 255); + + LocalIntensityScale lis; + for (int offset = 0; offset < 6; ++offset) { + + std::cout << " ============================== " << offset << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean X-DIR"); + lis.calc_sat_mean_x(mCpu, offset); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.001), 0); + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Z_DIR) { + APRTimer timer(true); + using ImgType = float; + PixelData m = getRandInitializedMesh(22, 33, 22, 255); + + LocalIntensityScale lis; + for (int offset = 0; offset < 6; ++offset) { + + std::cout << " ============================== " << offset << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean Z-DIR"); + lis.calc_sat_mean_z(mCpu, offset); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Z-DIR"); + calcMean(mGpu, offset, MEAN_Z_DIR); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); + } + } + + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Y_DIR) { + APRTimer timer(true); + PixelData m(4, 4, 1, 0); + float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; + initFromZYXarray(m, dataIn); + + LocalIntensityScale lis; + + for (int boundary = 1; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + std::cout << "\n\n"; + for (int offset = 1; offset < 2; ++offset) { + // Run on CPU + PixelData mCpuPadded; + paddPixels(m, mCpuPadded, offset * boundary, offset * boundary, 0); + timer.start_timer("CPU mean Y-DIR"); + lis.calc_sat_mean_y(mCpuPadded, offset); + PixelData mCpu; + unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); + 
timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Y-DIR"); + calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0)); + + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01, 4), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_X_DIR) { + APRTimer timer(true); + //PixelData m(1, 13, 1, 0); + //float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; + //initFromZYXarray(m, dataIn); + PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset < 6; ++offset) { + // Run on CPU + PixelData mCpuPadded; + paddPixels(m, mCpuPadded, 0, offset * boundary, 0); + timer.start_timer("CPU mean X-DIR"); + lis.calc_sat_mean_x(mCpuPadded, offset); + PixelData mCpu; + unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0000001), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Z_DIR) { + APRTimer timer(true); + PixelData m(1, 1, 13, 0); + float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; + initFromZYXarray(m, dataIn); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset < 6; ++offset) { + // Run on CPU + PixelData mCpuPadded; + paddPixels(m, mCpuPadded, 0, 0, offset * boundary); + timer.start_timer("CPU mean Z-DIR"); + lis.calc_sat_mean_z(mCpuPadded, offset); + PixelData mCpu; + unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Z-DIR"); + calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0)); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); + } + } + } + + + // !!!!!!!!!!!!!!!!!!!!!!! NOT YET CHECKED !!!!!!!!!!!!!!!!!!!!!!!!!!!!! + // TODO: See what these tests are doing and fix/change/remove them! 
+ + TEST(LocalIntensityScaleCudaTest, 1D_Y_DIR) { + { // OFFSET=0 + + PixelData m(8, 1, 1, 0); + float dataIn[] = {3,6,9,12,15,18,21,24}; + float expect[] = {3,6,9,12,15,18,21,24}; + + initFromZYXarray(m, dataIn); + + calcMean(m, 0, MEAN_Y_DIR); + + ASSERT_TRUE(compare(m, expect, 0.05)); + } + { // OFFSET=1 + + PixelData m(8, 1, 1, 0); + float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8}; + float expect[] = {1.5, 2, 3, 4, 5, 6, 7, 7.5}; + + initFromZYXarray(m, dataIn); + + calcMean(m, 1, MEAN_Y_DIR); + + ASSERT_TRUE(compare(m, expect, 0.05)); + } + { // OFFSET=2 (+symmetricity check) + + PixelData m(8, 1, 1, 0); + float dataIn[] = {3,6,9,12,15,18,21,24}; + float expect[] = {6, 7.5, 9, 12, 15, 18, 19.5, 21}; + + initFromZYXarray(m, dataIn); + + calcMean(m, 2, MEAN_Y_DIR); + + ASSERT_TRUE(compare(m, expect, 0.05)); + + // check if data in opposite order gives same result + float dataIn2[] = {24,21,18,15,12,9,6,3}; + float expect2[] = {21, 19.5, 18, 15,12, 9, 7.5, 6}; + + initFromZYXarray(m, dataIn2); + + calcMean(m, 2, MEAN_Y_DIR); + + ASSERT_TRUE(compare(m, expect2, 0.05)); + } + } + + + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) { + APRTimer timer(true); + PixelData m = getRandInitializedMesh(33, 31, 13); + + LocalIntensityScale lis; + for (int offset = 0; offset < 6; ++offset) { + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean ALL-DIR"); + lis.calc_sat_mean_y(mCpu, offset); + lis.calc_sat_mean_x(mCpu, offset); + lis.calc_sat_mean_z(mCpu, offset); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean ALL-DIR"); + calcMean(mGpu, offset); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); + } + } + + //@KG: The CPU code doesn't work for uint16 --> overflow will likely result. + +// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS_UINT16) { +// APRTimer timer(true); +// PixelData m = getRandInitializedMesh(33, 31, 13); +// +// LocalIntensityScale lis; +// for (int offset = 0; offset < 6; ++offset) { +// // Run on CPU +// PixelData mCpu(m, true); +// timer.start_timer("CPU mean ALL-DIR"); +// lis.calc_sat_mean_y(mCpu, offset); +// lis.calc_sat_mean_x(mCpu, offset); +// lis.calc_sat_mean_z(mCpu, offset); +// timer.stop_timer(); +// +// // Run on GPU +// PixelData mGpu(m, true); +// timer.start_timer("GPU mean ALL-DIR"); +// calcMean(mGpu, offset); +// timer.stop_timer(); +// +// // Compare results +// EXPECT_EQ(compareMeshes(mCpu, mGpu, 1), 0); +// } +// } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { + APRTimer timer(true); + PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); + + APRParameters params; + params.sigma_th = 1; + params.sigma_th_max = 2; + params.reflect_bc_lis = false; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. + + // Run on CPU + PixelData mCpu(m, true); + PixelData mCpuTemp(m, false); + timer.start_timer("CPU LIS FULL"); + + LocalIntensityScale localIntensityScale; + + localIntensityScale.get_local_intensity_scale(mCpu, mCpuTemp, params); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + PixelData mGpuTemp(m, false); + timer.start_timer("GPU LIS ALL-DIR"); + getLocalIntensityScale(mGpu, mGpuTemp, params); + timer.stop_timer(); + + // Compare results + //EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0.01), 0); //this is not needed these values are not required. 
+ EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); + } + + +#endif // APR_USE_CUDA +} + + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/LocalIntensityScaleTest.cpp b/test/LocalIntensityScaleTest.cpp index a9f1558b..e8b194d3 100644 --- a/test/LocalIntensityScaleTest.cpp +++ b/test/LocalIntensityScaleTest.cpp @@ -5,9 +5,6 @@ #include #include "data_structures/Mesh/PixelData.hpp" #include "algorithm/LocalIntensityScale.hpp" -#include "algorithm/LocalIntensityScaleCuda.h" -#include "data_structures/APR/APR.hpp" -#include "algorithm/APRConverter.hpp" #include "TestTools.hpp" @@ -168,217 +165,6 @@ namespace { } } - -// ============================================================================ -// ==================== CUDA IMPL TESTS ============================= -// ============================================================================ - -#ifdef APR_USE_CUDA - - TEST(LocalIntensityScaleCudaTest, 1D_Y_DIR) { - { // OFFSET=0 - - PixelData m(8, 1, 1, 0); - float dataIn[] = {3,6,9,12,15,18,21,24}; - float expect[] = {3,6,9,12,15,18,21,24}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 0, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - } - { // OFFSET=1 - - PixelData m(8, 1, 1, 0); - float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8}; - float expect[] = {1.5, 2, 3, 4, 5, 6, 7, 7.5}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 1, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - } - { // OFFSET=2 (+symmetricity check) - - PixelData m(8, 1, 1, 0); - float dataIn[] = {3,6,9,12,15,18,21,24}; - float expect[] = {6, 7.5, 9, 12, 15, 18, 19.5, 21}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 2, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - - // check if data in opposite order gives same result - float dataIn2[] = {24,21,18,15,12,9,6,3}; - float expect2[] = {21, 19.5, 18, 15,12, 9, 7.5, 6}; - - initFromZYXarray(m, dataIn2); - - calcMean(m, 2, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect2, 0.05)); - } - } - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Y_DIR) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(33, 31, 13); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean Y-DIR"); - lis.calc_sat_mean_y(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean Y-DIR"); - calcMean(mGpu, offset, MEAN_Y_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } - - TEST(LocalIntensityScaleCudaTest, 1GPU_VS_CPU_X_DIR) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(33, 31, 13); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean X-DIR"); - lis.calc_sat_mean_x(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean X-DIR"); - calcMean(mGpu, offset, MEAN_X_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Z_DIR) { - APRTimer timer(true); - using ImgType = float; - PixelData m = getRandInitializedMesh(310, 330, 13, 255); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean Z-DIR"); - lis.calc_sat_mean_z(mCpu, offset); - 
timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean Z-DIR"); - calcMean(mGpu, offset, MEAN_Z_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(33, 31, 13); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean ALL-DIR"); - lis.calc_sat_mean_y(mCpu, offset); - lis.calc_sat_mean_x(mCpu, offset); - lis.calc_sat_mean_z(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean ALL-DIR"); - calcMean(mGpu, offset); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } - - //@KG: The CPU code doesn't work for uint16 --> overflow will likely result. - -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS_UINT16) { -// APRTimer timer(true); -// PixelData m = getRandInitializedMesh(33, 31, 13); -// -// LocalIntensityScale lis; -// for (int offset = 0; offset < 6; ++offset) { -// // Run on CPU -// PixelData mCpu(m, true); -// timer.start_timer("CPU mean ALL-DIR"); -// lis.calc_sat_mean_y(mCpu, offset); -// lis.calc_sat_mean_x(mCpu, offset); -// lis.calc_sat_mean_z(mCpu, offset); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean ALL-DIR"); -// calcMean(mGpu, offset); -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 1), 0); -// } -// } - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(310, 330, 13, 25); - - APRParameters params; - params.sigma_th = 1; - params.sigma_th_max = 2; - params.reflect_bc_lis = false; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. - - // Run on CPU - PixelData mCpu(m, true); - PixelData mCpuTemp(m, false); - timer.start_timer("CPU LIS FULL"); - - LocalIntensityScale localIntensityScale; - - localIntensityScale.get_local_intensity_scale(mCpu, mCpuTemp, params); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - PixelData mGpuTemp(m, false); - timer.start_timer("GPU LIS ALL-DIR"); - getLocalIntensityScale(mGpu, mGpuTemp, params); - timer.stop_timer(); - - // Compare results - //EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0.01), 0); //this is not needed these values are not required. - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - -#endif // APR_USE_CUDA - } int main(int argc, char **argv) { From 570ab20ecc4fc30cf4cab0611350da5d9fc379d2 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 31 Jan 2023 15:36:14 +0100 Subject: [PATCH 10/80] Local Intensity Scale (LIS) now works in X-dir as expected. GPU and CPU give same results.
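
For reference, a minimal CPU-side sketch of the windowed mean with reflective boundary that these changes aim for: a window of 2*offset+1 elements whose out-of-range indices are mirrored at the borders (index -1 maps back to 1, index n to n-2). This is an illustration only, assuming offset is smaller than the dimension length; the helper name and the exact reflection convention are assumptions for this sketch, not part of the library API.

    #include <vector>

    // Windowed mean over [i-offset, i+offset] with reflected boundary indices.
    // Assumes offset < in.size(), so a single reflection is always enough.
    std::vector<float> meanReflect1D(const std::vector<float> &in, int offset) {
        const int n = static_cast<int>(in.size());
        std::vector<float> out(n);
        for (int i = 0; i < n; ++i) {
            float sum = 0;
            for (int k = i - offset; k <= i + offset; ++k) {
                const int idx = k < 0 ? -k : (k >= n ? 2 * n - 2 - k : k);
                sum += in[idx];
            }
            // with reflection the window always contains 2*offset+1 samples
            out[i] = sum / (2 * offset + 1);
        }
        return out;
    }

For example, {1, 2, 3, 4, 5} with offset=1 gives {1.67, 2, 3, 4, 4.33} under this convention.
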
--- src/algorithm/LocalIntensityScale.cu | 16 +- src/algorithm/LocalIntensityScale.hpp | 130 +++++++--- src/data_structures/Mesh/PixelData.hpp | 15 +- test/LocalIntensityScaleCudaTest.cpp | 329 +++++++++++++++++++------ test/LocalIntensityScaleTest.cpp | 24 +- test/MeshDataTest.cpp | 10 + test/TestTools.hpp | 29 ++- 7 files changed, 418 insertions(+), 135 deletions(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index a0a05c0e..3673b406 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -175,8 +175,16 @@ __global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_ int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; while (saveElementOffset < currElementOffset) { - if (!boundaryReflect) count = count - 1; - sum -= data[beginPtr][workerIdx]; + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' and do not remove first element from moving filter + // since 'sum' of filter elements contains all elements from processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + // In such a case first 'o' element should not be removed when filter moves right. + if (x_num - (currElementOffset - saveElementOffset)/nextElementOffset > offset || boundaryReflect) { + if (!boundaryReflect) count = count - 1; + sum -= data[beginPtr][workerIdx]; + } if (boundaryReflect) { sum += data[boundaryPtr][workerIdx]; @@ -410,9 +418,9 @@ template void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags, bool boundaryReflect) { ScopedCudaMemHandler, H2D | D2H> cudaImage(image); APRTimer timer(true); - timer.start_timer("GpuDeviceTimeFull"); +// timer.start_timer("GpuDeviceTimeFull"); runMean(cudaImage.get(), image, offset, offset, offset, flags, 0, boundaryReflect); - timer.stop_timer(); +// timer.stop_timer(); } // explicit instantiation of handled types diff --git a/src/algorithm/LocalIntensityScale.hpp b/src/algorithm/LocalIntensityScale.hpp index 3d5942c2..bee7f303 100644 --- a/src/algorithm/LocalIntensityScale.hpp +++ b/src/algorithm/LocalIntensityScale.hpp @@ -156,7 +156,7 @@ void get_local_intensity_scale(PixelData &local_scale_temp, PixelData &input, const size_t offset); template - void calc_sat_mean_x(PixelData &input, const size_t offset); + void calc_sat_mean_x(PixelData &input, const size_t offset, bool boundaryReflect = false); template void calc_sat_mean_y(PixelData &input, const size_t offset); @@ -367,63 +367,119 @@ inline void LocalIntensityScale::calc_sat_mean_y(PixelData& input, const size } template -inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size_t offset) { +inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size_t offset, bool boundaryReflect) { + const size_t z_num = input.z_num; const size_t x_num = input.x_num; const size_t y_num = input.y_num; - std::vector temp_vec(y_num*(2*offset + 1),0); + const size_t divisor = offset + 1 + offset; + std::vector circularBuffer(y_num * divisor, 0); + std::vector sum(y_num, 0); - #ifdef HAVE_OPENMP - #pragma omp parallel for default(shared) firstprivate(temp_vec) - #endif + auto &mesh = input.mesh; + size_t dimLen = x_num; + + if (dimLen < offset) { + throw std::runtime_error("offset cannot be bigger than processed dimension length!"); + } + +#ifdef HAVE_OPENMP +#pragma omp parallel for default(shared) firstprivate(circularBuffer, sum) +#endif for(size_t j = 0; j < 
z_num; j++) { size_t jxnumynum = j * x_num * y_num; - for(size_t k = 0; k < y_num ; k++){ - temp_vec[k] = input.mesh[jxnumynum + k]; - } + size_t count = 0; // counts number of active elements in filter + size_t currElementOffset = 0; // offset of element in processed dimension + size_t nextElementOffset = 1; + size_t saveElementOffset = 0; // offset used to finish RHS boundary - for(size_t i = 1; i < 2 * offset + 1; i++) { - for(size_t k = 0; k < y_num; k++) { - temp_vec[i*y_num + k] = input.mesh[jxnumynum + i*y_num + k] + temp_vec[(i-1)*y_num + k]; + // saturate circular buffer with #offset elements since it will allow to calculate first element value on LHS + while(count <= offset) { + for (size_t k = 0; k < y_num; ++k) { + auto v = mesh[jxnumynum + currElementOffset * y_num + k]; + sum[k] += v; + circularBuffer[count * y_num + k] = v; + if (boundaryReflect && count > 0) { circularBuffer[(2 * offset - count + 1) * y_num + k] = v; sum[k] += v;} } + + currElementOffset += nextElementOffset; + ++count; } - // LHS boundary - for(size_t i = 0; i < offset + 1; i++){ - for(size_t k = 0; k < y_num; k++) { - input.mesh[jxnumynum + i * y_num + k] = (temp_vec[(i + offset) * y_num + k]) / (i + offset + 1); - } + currElementOffset -= nextElementOffset; + --count; + + if (boundaryReflect) { + count = divisor; } - // middle - size_t current_index = offset + 1; - size_t index_modulo = 0; - for(size_t i = offset + 1; i < x_num - offset; i++){ - // the current cumsum - index_modulo = (current_index + offset) % (2*offset + 1); // current_index - offset - 1 - size_t previous_modulo = (current_index + offset - 1) % (2*offset + 1); // the index of previous cumsum + // Pointer in circular buffer + int beginPtr = offset; - for(size_t k = 0; k < y_num; k++) { - float temp = input.mesh[jxnumynum + (i + offset)*y_num + k] + temp_vec[previous_modulo*y_num + k]; - input.mesh[jxnumynum + i*y_num + k] = (temp - temp_vec[index_modulo*y_num + k]) / - (2*offset + 1); - temp_vec[index_modulo*y_num + k] = temp; + // main loop going through all elements in range [0, x_num-offset) + for (size_t x = 0; x < dimLen - offset; ++x) { + for (size_t k = 0; k < y_num; ++k) { + // Read new element + T v = mesh[jxnumynum + currElementOffset * y_num + k]; + + // Update sum to cover [-offset, offset] of currently processed element + sum[k] += v; + if (count >= divisor || x == 0) sum[k] -= circularBuffer[beginPtr * y_num + k]; + + // Save new element + circularBuffer[beginPtr * y_num + k] = v; } - current_index = (current_index + 1) % (2*offset + 1); + // move pointer in circular buffer and number of active elements hold there + beginPtr = (beginPtr + 1) % divisor; + count = std::min(count + 1, divisor); + + for (size_t k = 0; k < y_num; ++k) { + // save currently processed element + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; + } + + // Move to next elements + currElementOffset += nextElementOffset; + saveElementOffset += nextElementOffset; } - // RHS boundary - current_index = (current_index + offset) % (2*offset + 1); - for(size_t i = x_num - offset; i < x_num; i++){ - for(size_t k = 0; k < y_num; k++){ - input.mesh[jxnumynum + i*y_num + k] = (temp_vec[index_modulo*y_num + k] - - temp_vec[current_index*y_num + k]) / (x_num - i + offset); + // boundaryPtr is used only in boundaryReflect mode, adding (2*offset+1) makes it always non-negative value + int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; + + // Handle last #offset elements on RHS + while(saveElementOffset < currElementOffset) { + // If 
filter length is too big in comparison to processed dimension + // do not decrease 'count' since 'sum' of filter elements contains all elements from + // processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset) > offset; + + if (removeElementFromFilter) { + if (!boundaryReflect) count = count - 1; } - current_index = (current_index + 1) % (2*offset + 1); + + for (size_t k = 0; k < y_num; ++k) { + if (removeElementFromFilter || boundaryReflect) { + sum[k] -= circularBuffer[beginPtr * y_num + k]; + } + + if (boundaryReflect) { + sum[k] += circularBuffer[boundaryPtr * y_num + k]; + } + + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; + } + + boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor; + beginPtr = (beginPtr + 1) % divisor; + saveElementOffset += nextElementOffset; } + + std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop } } diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index 13264ec4..68de3b00 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -34,7 +34,7 @@ struct PixelDataDim { size_t x; size_t z; - PixelDataDim(size_t y, size_t x, size_t z) : y(y), x(x), z(z) {} + constexpr PixelDataDim(size_t y, size_t x, size_t z) : y(y), x(x), z(z) {} size_t size() const { return y * x * z; } @@ -436,6 +436,19 @@ public : */ PixelData(int aSizeOfY, int aSizeOfX, int aSizeOfZ, T aInitVal) { initWithValue(aSizeOfY, aSizeOfX, aSizeOfZ, aInitVal); } + /** + * Constructor - initialize initial size of mesh to provided values + * @param aDims - PixelDataDim with length of each dimension + */ + PixelData(PixelDataDim aDims) { init(aDims.y, aDims.x, aDims.z); } + + /** + * Constructor - creates mesh with provided dimentions initialized to aInitVal + * @param aDims - PixelDataDim with length of each dimension + * @param aInitVal - initial value of all elements + */ + PixelData(PixelDataDim aDims, T aInitVal) { initWithValue(aDims.y, aDims.x, aDims.z, aInitVal); } + /** * Move constructor * @param aObj mesh to be moved diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index 6e2b722b..b0e084ca 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -10,33 +10,260 @@ namespace { #ifdef APR_USE_CUDA + TEST(LocalIntensityScaleCudaTest, CPU_AND_GPU_TEST_X_DIR_VS_MANUALLY_CALCULATED_VALUES) { + // Belows data is precomputed for x-len = 5 (and maximum offset = 4) so do not change these numbers! 
+ constexpr PixelDataDim const dim{1, 5, 1}; + float expectedData[2][5][dim.x] = { + { // with no boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, // offset = 0 + {1.50, 2.00, 3.00, 4.00, 4.50}, // offset = 1 + {2.00, 2.50, 3.00, 3.50, 4.00}, // offset = 2 + {2.50, 3.00, 3.00, 3.00, 3.5}, // offset = 3 + {3.00, 3.00, 3.00, 3.00, 3.00} // offset = 4 + }, + { // with boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, + {1.66, 2.00, 3.00, 4.00, 4.33}, + {2.20, 2.40, 3.00, 3.60, 3.80}, + {2.71, 2.85, 3.00, 3.14, 3.28}, + {3.22, 3.11, 3.00, 2.88, 2.77} + } + }; + + + APRTimer timer(false); // set to true to see timings + + PixelData m(dim); + float dataIn[] = {1, 2, 3, 4, 5}; + initFromZYXarray(m, dataIn); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 4; ++offset) { +// std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean X-DIR"); + lis.calc_sat_mean_x(mCpu, offset, (boundary > 0)); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + timer.stop_timer(); + + // Compare results + PixelData expected(dim); + initFromZYXarray(expected, expectedData[boundary][offset]); + EXPECT_EQ(compareMeshes(expected, mGpu, 0.01), 0); + EXPECT_EQ(compareMeshes(expected, mCpu, 0.01), 0); + + // Also GPU and CPU should give exactly same output + EXPECT_EQ(compareMeshes(mGpu, mCpu, 0), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_X_DIR_RANDOM_VALUES) { + APRTimer timer(false); + + constexpr PixelDataDim const dim{63, 65, 96}; + PixelData m = getRandInitializedMesh(dim, 50, 10); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 6; ++offset) { + //std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + + PixelData mCpu; + mCpu.init(m); + mCpu.copyFromMesh(m); + timer.start_timer("CPU mean X-DIR"); + lis.calc_sat_mean_x(mCpu, offset, (boundary > 0)); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + + // ------------------------------------------------------------------------ - // TODO: REMOVE IT after dev. + // Below tests are not yet fixed. // ------------------------------------------------------------------------ - TEST(LocalIntensityScaleCudaTest, REMOVE_ME_AFTER_DEVELOPMENT) { - int y_num = 2; - int x_num = 3; - int z_num = 2; - PixelData m(y_num, x_num, z_num, 0); - PixelData m2(y_num, x_num, z_num, 0); - PixelData m3(y_num, x_num, z_num,0); - float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9 ,10, 11, 12}; + TEST(LocalIntensityScaleCudaTest, CPU_AND_GPU_TEST_Z_DIR_VS_MANUALLY_CALCULATED_VALUES) { + // Belows data is precomputed for x-len = 5 (and maximum offset = 4) so do not change these numbers! 
+ constexpr PixelDataDim const dim{1, 5, 1}; + float expectedData[2][5][dim.x] = { + { // with no boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, // offset = 0 + {1.50, 2.00, 3.00, 4.00, 4.50}, // offset = 1 + {2.00, 2.50, 3.00, 3.50, 4.00}, // offset = 2 + {2.50, 3.00, 3.00, 3.00, 3.5}, // offset = 3 + {3.00, 3.00, 3.00, 3.00, 3.00} // offset = 4 + }, + { // with boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, + {1.66, 2.00, 3.00, 4.00, 4.33}, + {2.20, 2.40, 3.00, 3.60, 3.80}, + {2.71, 2.85, 3.00, 3.14, 3.28}, + {3.22, 3.11, 3.00, 2.88, 2.77} + } + }; + + + APRTimer timer(false); // set to true to see timings + + PixelData m(dim); + float dataIn[] = {1, 2, 3, 4, 5}; initFromZYXarray(m, dataIn); - initFromZYXarray(m2, dataIn); - initFromZYXarray(m3, dataIn); + LocalIntensityScale lis; - int off = 0; - lis.calc_sat_mean_x(m, off); - m.printMesh(1); - calcMean(m3, off, MEAN_X_DIR); - m3.printMesh(1); -// lis.calc_sat_mean_y(m2, off); -// m2.printMesh(1); + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 4; ++offset) { +// std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean X-DIR"); + lis.calc_sat_mean_x(mCpu, offset, (boundary > 0)); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + timer.stop_timer(); - compareMeshes(m3, m, 0.00000001); + // Compare results + PixelData expected(dim); + initFromZYXarray(expected, expectedData[boundary][offset]); + EXPECT_EQ(compareMeshes(expected, mGpu, 0.01), 0); + EXPECT_EQ(compareMeshes(expected, mCpu, 0.01), 0); + + // Also GPU and CPU should give exactly same output + EXPECT_EQ(compareMeshes(mGpu, mCpu, 0), 0); + } + } } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_Z_DIR_RANDOM_VALUES) { + APRTimer timer(false); + + constexpr PixelDataDim const dim{63, 65, 96}; + PixelData m = getRandInitializedMesh(dim, 50, 10); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 6; ++offset) { + //std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + + // Run on CPU +// PixelData mCpu2; +// PixelData mCpuPadded; +// timer.start_timer("CPU old mean X-DIR"); +// paddPixels(m, mCpuPadded, 0, offset * boundary, 0); +// lis.calc_sat_mean_x_orig(mCpuPadded, offset); +// unpaddPixels(mCpuPadded, mCpu2, dim.y, dim.x, dim.z); +// timer.stop_timer(); + + PixelData mCpu; + mCpu.init(m); + mCpu.copyFromMesh(m); + timer.start_timer("CPU mean X-DIR"); + lis.calc_sat_mean_x(mCpu, offset, (boundary > 0)); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + timer.stop_timer(); + + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + +// TEST(LocalIntensityScaleCudaTest, REMOVE_ME_AFTER_DEVELOPMENT) { +// int y_num = 1; +// int x_num = 5; +// int z_num = 1; +//#if 1 +// PixelData m(y_num, x_num, z_num, 0); +// PixelData m2(y_num, x_num, z_num, 0); +// PixelData m3(y_num, x_num, z_num, 0); +// PixelData m4(y_num, x_num, z_num, 0); +// float dataIn[] = {1, 2, 3, 4, 5}; +//// float dataIn[] = 
{75.4539260864, 42.5445404053, 0.00003, 4, 0.00005, 6, 0.00007, 8, 0.00009, 10, 0.000011, 12}; +// +// initFromZYXarray(m, dataIn); +// initFromZYXarray(m2, dataIn); +// initFromZYXarray(m3, dataIn); +// initFromZYXarray(m4, dataIn); +//#else +// PixelData m = getRandInitializedMesh(y_num, x_num, z_num, 200, 0); +// PixelData m2(m, true); +// PixelData m3(m, true); +// PixelData m4(m, true); +//#endif +// +// LocalIntensityScale lis; +// +// int off = 4; +// +// std::cout << "INP:"; m.printMesh(1); +// +// bool boundary = true; +// +// APRTimer timer(true); +// calcMean(m3, off, MEAN_X_DIR, boundary); +// timer.start_timer("new"); +// lis.calc_sat_mean_x(m2, off, boundary); +// timer.stop_timer(); +// +// timer.start_timer("old"); +// PixelData mCpuPadded; +// paddPixels(m, mCpuPadded, 0, off, 0); +// lis.calc_sat_mean_x_orig(mCpuPadded, off); +// unpaddPixels(mCpuPadded, m4, m.y_num, m.x_num, m.z_num); +// timer.stop_timer(); +// +// std::cout << "CPU: "; m2.printMesh(1); +// std::cout << "GPU: "; m3.printMesh(1); +// std::cout << "CPU old: "; m4.printMesh(1); +// +// std::cout << "GPU vs NEW\n"; +// compareMeshes(m3, m2, 0.00000001, 3); +// std::cout << "OLD vs GPU\n"; +// compareMeshes(m4, m3, 0.00000001, 3); +// std::cout << "OLD vs NEW\n"; +// EXPECT_EQ(compareMeshes(m4, m2, 0.00000001, 3), 0); +// } // ------------------------------------------------------------------------ TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Y_DIR) { @@ -65,31 +292,7 @@ namespace { } } - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_X_DIR) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(22, 33, 22, 255); - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - - std::cout << " ============================== " << offset << std::endl; - - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean X-DIR"); - lis.calc_sat_mean_x(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean X-DIR"); - calcMean(mGpu, offset, MEAN_X_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.001), 0); - } - } TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Z_DIR) { APRTimer timer(true); @@ -154,45 +357,15 @@ namespace { } } - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_X_DIR) { - APRTimer timer(true); - //PixelData m(1, 13, 1, 0); - //float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; - //initFromZYXarray(m, dataIn); - PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); - LocalIntensityScale lis; - - for (int boundary = 0; boundary <= 1; ++ boundary) { - // boundary = 0 there is no reflected boundary - // boudnary = 1 there is boundary reflect - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpuPadded; - paddPixels(m, mCpuPadded, 0, offset * boundary, 0); - timer.start_timer("CPU mean X-DIR"); - lis.calc_sat_mean_x(mCpuPadded, offset); - PixelData mCpu; - unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean X-DIR"); - calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0000001), 0); - } - } - } TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Z_DIR) { APRTimer timer(true); - PixelData m(1, 1, 13, 0); - float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; - initFromZYXarray(m, dataIn); +// PixelData m(1, 1, 13, 0); +// float 
dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; +// initFromZYXarray(m, dataIn); + PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); + LocalIntensityScale lis; diff --git a/test/LocalIntensityScaleTest.cpp b/test/LocalIntensityScaleTest.cpp index e8b194d3..09a6466b 100644 --- a/test/LocalIntensityScaleTest.cpp +++ b/test/LocalIntensityScaleTest.cpp @@ -21,7 +21,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_y(m, 0); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=1 @@ -34,7 +34,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_y(m, 1); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=2 (+symmetricity check) @@ -47,7 +47,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_y(m, 2); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); // check if data in opposite order gives same result float dataIn2[] = {24,21,18,15,12,9,6,3}; @@ -57,7 +57,7 @@ namespace { lis.calc_sat_mean_y(m, 2); - ASSERT_TRUE(compare(m, expect2, 0.05)); + ASSERT_TRUE(compare(m, expect2, 0.000001)); } } @@ -73,7 +73,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_x(m, 0); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=1 @@ -86,7 +86,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_x(m, 1); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=2 (+symmetricity check) @@ -99,7 +99,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_x(m, 2); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); // check if data in opposite order gives same result float dataIn2[] = {24,21,18,15,12,9,6,3}; @@ -109,7 +109,7 @@ namespace { lis.calc_sat_mean_x(m, 2); - ASSERT_TRUE(compare(m, expect2, 0.05)); + ASSERT_TRUE(compare(m, expect2, 0.000001)); } } @@ -125,7 +125,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_z(m, 0); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=1 @@ -138,7 +138,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_z(m, 1); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=2 (+symmetricity check) @@ -151,7 +151,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_z(m, 2); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); // check if data in opposite order gives same result float dataIn2[] = {24,21,18,15,12,9,6,3}; @@ -161,7 +161,7 @@ namespace { lis.calc_sat_mean_z(m, 2); - ASSERT_TRUE(compare(m, expect2, 0.05)); + ASSERT_TRUE(compare(m, expect2, 0.000001)); } } diff --git a/test/MeshDataTest.cpp b/test/MeshDataTest.cpp index f9c9bf4b..a3c4bec6 100644 --- a/test/MeshDataTest.cpp +++ b/test/MeshDataTest.cpp @@ -338,6 +338,16 @@ namespace { ASSERT_EQ(md.mesh.size(), 100*200*300); } + // size provided + { + PixelDataDim dim(100, 200, 300); + PixelData md(dim); + ASSERT_EQ(md.x_num, 200); + ASSERT_EQ(md.y_num, 100); + ASSERT_EQ(md.z_num, 300); + ASSERT_EQ(md.mesh.size(), 100*200*300); + } + // mesh provided { // generate some data diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 0896eea5..d0211f6f 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -68,16 +68,26 @@ inline bool initFromZYXarray(PixelData &mesh, const float *data) { */ template inline int compareMeshes(const PixelData &expected, const PixelData 
&tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { + if (expected.getDimension() != tested.getDimension()) { + std::stringstream errMsg; + errMsg << "Dimensions of expected and tested meshes differ! " << expected.getDimension() << " vs " << tested.getDimension(); + throw std::runtime_error(errMsg.str()); + } + int cnt = 0; + double maxErrorFound = 0; + for (size_t i = 0; i < expected.mesh.size(); ++i) { - if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError) { + auto diff = std::abs(expected.mesh[i] - tested.mesh[i]); + if (diff > maxError) { if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] << " error = " << (float)expected.mesh[i] - (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; } cnt++; } + if (diff > maxErrorFound) maxErrorFound = diff; } - if (cnt != 0) std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl; + if (cnt != 0) std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << " maxErrorFound = " << maxErrorFound << std::endl; return cnt; } @@ -112,7 +122,6 @@ inline int64_t compareParticles(const ParticleTypeA &expected, const ParticleTyp return cnt; } - /** * Generates mesh with provided dims with random values in range [0, 1] * multiplier + offset * @param y @@ -120,6 +129,7 @@ inline int64_t compareParticles(const ParticleTypeA &expected, const ParticleTyp * @param z * @param multiplier * @param offset + * @param useIdxNumbers - instead of random values put values from 0..sizeof(mesh)-1 * @return */ template @@ -139,6 +149,19 @@ inline PixelData getRandInitializedMesh(int y, int x, int z, float multiplier return m; } +/** + * Generates mesh with provided dims with random values in range [0, 1] * multiplier + offset + * @param dim - dimension of generated mesh + * @param multiplier + * @param offset + * @param useIdxNumbers - instead of random values put values from 0..sizeof(mesh)-1 + * @return + */ +template +inline PixelData getRandInitializedMesh(PixelDataDim dim, float multiplier = 2.0f, float offset=0.0, bool useIdxNumbers = false) { + return getRandInitializedMesh(dim.y, dim.x, dim.z, multiplier, offset, useIdxNumbers); +} + struct TestBenchStats{ double inf_norm=0; From 17e5d8edace7ce253cfae82db13c82bff31b9f74 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 1 Feb 2023 15:21:57 +0100 Subject: [PATCH 11/80] Local Intensity Scale (LIS) now works in Z-dir as expected. GPU and CPU gives same results. 
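
For reference, the stride arithmetic that lets the X and Z variants share the same
loop structure follows from the memory layout used by the loops in this patch
(y fastest, then x, then z). The helper below is illustrative only and is not part
of PixelData:

    #include <cstddef>

    // Linear index of element (y, x, z) in a PixelData-style buffer (y fastest, then x, then z).
    inline std::size_t linearIndex(std::size_t y, std::size_t x, std::size_t z,
                                   std::size_t y_num, std::size_t x_num) {
        return z * x_num * y_num + x * y_num + y;
    }

    // Stepping the window by one element along x moves by y_num values, while
    // stepping it along z moves by x_num * y_num values; this is why
    // calc_sat_mean_x uses nextElementOffset = 1 and calc_sat_mean_z uses
    // nextElementOffset = x_num, with the remaining factor y_num applied when
    // the mesh is addressed.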
--- src/algorithm/LocalIntensityScale.cu | 12 ++- src/algorithm/LocalIntensityScale.hpp | 137 ++++++++++++++++++-------- test/LocalIntensityScaleCudaTest.cpp | 50 ++++------ 3 files changed, 123 insertions(+), 76 deletions(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 3673b406..11e005fa 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -277,8 +277,16 @@ __global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_ int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; while (saveElementOffset < currElementOffset) { - if (!boundaryReflect) count = count - 1; - sum -= data[beginPtr][workerIdx]; + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' and do not remove first element from moving filter + // since 'sum' of filter elements contains all elements from processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + // In such a case first 'o' element should not be removed when filter moves right. + if (z_num - (currElementOffset - saveElementOffset)/nextElementOffset > offset || boundaryReflect) { + if (!boundaryReflect) count = count - 1; + sum -= data[beginPtr][workerIdx]; + } if (boundaryReflect) { sum += data[boundaryPtr][workerIdx]; diff --git a/src/algorithm/LocalIntensityScale.hpp b/src/algorithm/LocalIntensityScale.hpp index bee7f303..4e3213f0 100644 --- a/src/algorithm/LocalIntensityScale.hpp +++ b/src/algorithm/LocalIntensityScale.hpp @@ -153,7 +153,7 @@ void get_local_intensity_scale(PixelData &local_scale_temp, PixelData &input_image, PixelData &var); template - void calc_sat_mean_z(PixelData &input, const size_t offset); + void calc_sat_mean_z(PixelData &input, const size_t offset, bool boundaryReflect = false); template void calc_sat_mean_x(PixelData &input, const size_t offset, bool boundaryReflect = false); @@ -456,7 +456,7 @@ inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size // processed dimension: // dim elements: xxxxxx // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) - bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset) > offset; + bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset)/nextElementOffset > offset; if (removeElementFromFilter) { if (!boundaryReflect) count = count - 1; @@ -484,69 +484,120 @@ inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size } template -inline void LocalIntensityScale::calc_sat_mean_z(PixelData& input,const size_t offset) { +inline void LocalIntensityScale::calc_sat_mean_z(PixelData& input, const size_t offset, bool boundaryReflect) { + const size_t z_num = input.z_num; const size_t x_num = input.x_num; const size_t y_num = input.y_num; - std::vector temp_vec(y_num*(2*offset + 1),0); - size_t xnumynum = x_num * y_num; + const size_t divisor = offset + 1 + offset; + std::vector circularBuffer(y_num * divisor, 0); + std::vector sum(y_num, 0); - #ifdef HAVE_OPENMP - #pragma omp parallel for default(shared) firstprivate(temp_vec) - #endif - for(size_t i = 0; i < x_num; i++) { + auto &mesh = input.mesh; + size_t dimLen = z_num; - size_t iynum = i * y_num; + if (dimLen < offset) { + throw std::runtime_error("offset cannot be bigger than processed dimension length!"); + } - //prefetching - for(size_t k = 0; k < y_num ; k++){ - temp_vec[k] = input.mesh[iynum + k]; - } 
+#ifdef HAVE_OPENMP +#pragma omp parallel for default(shared) firstprivate(circularBuffer, sum) +#endif + for (size_t j = 0; j < x_num; j++) { + size_t jxnumynum = j * y_num; - for(size_t j = 1; j < 2 * offset + 1; j++) { - for(size_t k = 0; k < y_num; k++) { - temp_vec[j*y_num + k] = input.mesh[j * xnumynum + iynum + k] + temp_vec[(j-1)*y_num + k]; + size_t count = 0; // counts number of active elements in filter + size_t currElementOffset = 0; // offset of element in processed dimension + size_t nextElementOffset = x_num; + size_t saveElementOffset = 0; // offset used to finish RHS boundary + + // saturate circular buffer with #offset elements since it will allow to calculate first element value on LHS + while(count <= offset) { + for (size_t k = 0; k < y_num; ++k) { + auto v = mesh[jxnumynum + currElementOffset * y_num + k]; + sum[k] += v; + circularBuffer[count * y_num + k] = v; + if (boundaryReflect && count > 0) { circularBuffer[(2 * offset - count + 1) * y_num + k] = v; sum[k] += v;} } + + currElementOffset += nextElementOffset; + ++count; } - // LHS boundary - for(size_t j = 0; j < offset + 1; j++){ - for(size_t k = 0; k < y_num; k++) { - input.mesh[j * xnumynum + iynum + k] = (temp_vec[(j + offset)*y_num + k]) / (j + offset + 1); - } + currElementOffset -= nextElementOffset; + --count; + + if (boundaryReflect) { + count = divisor; } - // middle - size_t current_index = offset + 1; - size_t index_modulo = 0; - for(size_t j = offset + 1; j < z_num - offset; j++){ + // Pointer in circular buffer + int beginPtr = offset; - index_modulo = (current_index + offset) % (2*offset + 1); // current_index - offset - 1 - size_t previous_modulo = (current_index + offset - 1) % (2*offset + 1); // the index of previous cumsum + // main loop going through all elements in range [0, x_num-offset) + for (size_t z = 0; z < dimLen - offset; ++z) { + for (size_t k = 0; k < y_num; ++k) { + // Read new element + T v = mesh[jxnumynum + currElementOffset * y_num + k]; - for(size_t k = 0; k < y_num; k++) { - // the current cumsum - float temp = input.mesh[(j + offset) * xnumynum + iynum + k] + temp_vec[previous_modulo*y_num + k]; - input.mesh[j * xnumynum + iynum + k] = (temp - temp_vec[index_modulo*y_num + k]) / - (2*offset + 1); - temp_vec[index_modulo*y_num + k] = temp; + // Update sum to cover [-offset, offset] of currently processed element + sum[k] += v; + if (count >= divisor || z == 0) sum[k] -= circularBuffer[beginPtr * y_num + k]; + + // Save new element + circularBuffer[beginPtr * y_num + k] = v; } - current_index = (current_index + 1) % (2*offset + 1); + // move pointer in circular buffer and number of active elements hold there + beginPtr = (beginPtr + 1) % divisor; + count = std::min(count + 1, divisor); + + for (size_t k = 0; k < y_num; ++k) { + // save currently processed element + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; + } + + // Move to next elements + currElementOffset += nextElementOffset; + saveElementOffset += nextElementOffset; } - // RHS boundary - current_index = (current_index + offset) % (2*offset + 1); - for(size_t j = z_num - offset; j < z_num; j++){ - for(size_t k = 0; k < y_num; k++){ - input.mesh[j * xnumynum + iynum + k] = (temp_vec[index_modulo*y_num + k] - - temp_vec[current_index*y_num + k]) / (z_num - j + offset); + // boundaryPtr is used only in boundaryReflect mode, adding (2*offset+1) makes it always non-negative value + int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; + + // Handle last #offset elements on RHS + 
while(saveElementOffset < currElementOffset) { + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' since 'sum' of filter elements contains all elements from + // processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset)/nextElementOffset > offset; + + if (removeElementFromFilter) { + if (!boundaryReflect) count = count - 1; + } + + for (size_t k = 0; k < y_num; ++k) { + if (removeElementFromFilter || boundaryReflect) { + sum[k] -= circularBuffer[beginPtr * y_num + k]; + } + + if (boundaryReflect) { + sum[k] += circularBuffer[boundaryPtr * y_num + k]; + } + + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; } - current_index = (current_index + 1) % (2*offset + 1); + boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor; + beginPtr = (beginPtr + 1) % divisor; + saveElementOffset += nextElementOffset; } + + std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop } } -#endif //PARTPLAY_LOCAL_INTENSITY_SCALE_HPP +#endif diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index b0e084ca..a5c52b63 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -18,7 +18,7 @@ namespace { {1.00, 2.00, 3.00, 4.00, 5.00}, // offset = 0 {1.50, 2.00, 3.00, 4.00, 4.50}, // offset = 1 {2.00, 2.50, 3.00, 3.50, 4.00}, // offset = 2 - {2.50, 3.00, 3.00, 3.00, 3.5}, // offset = 3 + {2.50, 3.00, 3.00, 3.00, 3.50}, // offset = 3 {3.00, 3.00, 3.00, 3.00, 3.00} // offset = 4 }, { // with boundary values @@ -30,7 +30,6 @@ namespace { } }; - APRTimer timer(false); // set to true to see timings PixelData m(dim); @@ -72,7 +71,7 @@ namespace { TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_X_DIR_RANDOM_VALUES) { APRTimer timer(false); - constexpr PixelDataDim const dim{63, 65, 96}; + constexpr PixelDataDim const dim{49, 53, 51}; PixelData m = getRandInitializedMesh(dim, 50, 10); LocalIntensityScale lis; @@ -102,20 +101,15 @@ namespace { } } - - // ------------------------------------------------------------------------ - // Below tests are not yet fixed. - // ------------------------------------------------------------------------ - TEST(LocalIntensityScaleCudaTest, CPU_AND_GPU_TEST_Z_DIR_VS_MANUALLY_CALCULATED_VALUES) { // Belows data is precomputed for x-len = 5 (and maximum offset = 4) so do not change these numbers! 
- constexpr PixelDataDim const dim{1, 5, 1}; - float expectedData[2][5][dim.x] = { + constexpr PixelDataDim const dim{1, 1, 5}; + float expectedData[2][5][dim.z] = { { // with no boundary values {1.00, 2.00, 3.00, 4.00, 5.00}, // offset = 0 {1.50, 2.00, 3.00, 4.00, 4.50}, // offset = 1 {2.00, 2.50, 3.00, 3.50, 4.00}, // offset = 2 - {2.50, 3.00, 3.00, 3.00, 3.5}, // offset = 3 + {2.50, 3.00, 3.00, 3.00, 3.50}, // offset = 3 {3.00, 3.00, 3.00, 3.00, 3.00} // offset = 4 }, { // with boundary values @@ -127,7 +121,6 @@ namespace { } }; - APRTimer timer(false); // set to true to see timings PixelData m(dim); @@ -140,18 +133,18 @@ namespace { // boundary = 0 there is no reflected boundary // boudnary = 1 there is boundary reflect for (int offset = 0; offset <= 4; ++offset) { -// std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; +// std::cout << "------------------ OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; // Run on CPU PixelData mCpu(m, true); timer.start_timer("CPU mean X-DIR"); - lis.calc_sat_mean_x(mCpu, offset, (boundary > 0)); + lis.calc_sat_mean_z(mCpu, offset, (boundary > 0)); timer.stop_timer(); // Run on GPU PixelData mGpu(m, true); timer.start_timer("GPU mean X-DIR"); - calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0)); timer.stop_timer(); // Compare results @@ -169,7 +162,7 @@ namespace { TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_Z_DIR_RANDOM_VALUES) { APRTimer timer(false); - constexpr PixelDataDim const dim{63, 65, 96}; + constexpr PixelDataDim const dim{49,51,53}; PixelData m = getRandInitializedMesh(dim, 50, 10); LocalIntensityScale lis; @@ -178,37 +171,32 @@ namespace { // boundary = 0 there is no reflected boundary // boudnary = 1 there is boundary reflect for (int offset = 0; offset <= 6; ++offset) { - //std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; - - // Run on CPU -// PixelData mCpu2; -// PixelData mCpuPadded; -// timer.start_timer("CPU old mean X-DIR"); -// paddPixels(m, mCpuPadded, 0, offset * boundary, 0); -// lis.calc_sat_mean_x_orig(mCpuPadded, offset); -// unpaddPixels(mCpuPadded, mCpu2, dim.y, dim.x, dim.z); -// timer.stop_timer(); +// std::cout << "---------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; PixelData mCpu; mCpu.init(m); mCpu.copyFromMesh(m); - timer.start_timer("CPU mean X-DIR"); - lis.calc_sat_mean_x(mCpu, offset, (boundary > 0)); + timer.start_timer("CPU mean Z-DIR"); + lis.calc_sat_mean_z(mCpu, offset, (boundary > 0)); timer.stop_timer(); // Run on GPU PixelData mGpu(m, true); - timer.start_timer("GPU mean X-DIR"); - calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + timer.start_timer("GPU mean Z-DIR"); + calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0)); timer.stop_timer(); - // Compare results EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } } } + // ------------------------------------------------------------------------ + // Below tests are not yet fixed. 
+ // ------------------------------------------------------------------------ + + // TEST(LocalIntensityScaleCudaTest, REMOVE_ME_AFTER_DEVELOPMENT) { // int y_num = 1; // int x_num = 5; From 5ad9865239a3d924feb88d0b0cf7fafab88a15fa Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 17 Feb 2023 15:49:26 +0100 Subject: [PATCH 12/80] Updated compareMeshes to show maximum error found --- test/TestTools.hpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/test/TestTools.hpp b/test/TestTools.hpp index d0211f6f..4ec15afe 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -46,7 +46,7 @@ inline bool compare(PixelData &mesh, const float *data, const float epsilon) } template -inline bool initFromZYXarray(PixelData &mesh, const float *data) { +inline bool initFromZYXarray(PixelData &mesh, T *data) { size_t dataIdx = 0; for (int z = 0; z < mesh.z_num; ++z) { for (int y = 0; y < mesh.y_num; ++y) { @@ -76,18 +76,33 @@ inline int compareMeshes(const PixelData &expected, const PixelData &teste int cnt = 0; double maxErrorFound = 0; + T maxErrorExpectedValue = 0; + T maxErrorTestedValue = 0; + std::string maxErrorIdx = ""; for (size_t i = 0; i < expected.mesh.size(); ++i) { auto diff = std::abs(expected.mesh[i] - tested.mesh[i]); if (diff > maxError) { if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] << " error = " << (float)expected.mesh[i] - (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; + std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested mesh: " + << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] + << " error = " << (float)expected.mesh[i] - (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; } cnt++; } - if (diff > maxErrorFound) maxErrorFound = diff; + if (diff > maxErrorFound) { + maxErrorFound = diff; + maxErrorExpectedValue = expected.mesh[i]; + maxErrorTestedValue = tested.mesh[i]; + maxErrorIdx = tested.getStrIndex(i); + } + } + if (cnt != 0) { + std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() + << ", maxErrorFound = " << maxErrorFound << " at IDX: " << maxErrorIdx << " " + << maxErrorExpectedValue << " vs " << maxErrorTestedValue + << "(" << (100*(long double)maxErrorFound/(long double)maxErrorExpectedValue) << "%)"<& input, const size #ifdef HAVE_OPENMP #pragma omp parallel for default(shared) firstprivate(circularBuffer, sum) #endif - for(size_t j = 0; j < z_num; j++) { + for (size_t j = 0; j < z_num; j++) { size_t jxnumynum = j * x_num * y_num; size_t count = 0; // counts number of active elements in filter @@ -395,8 +395,12 @@ inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size size_t nextElementOffset = 1; size_t saveElementOffset = 0; // offset used to finish RHS boundary + // Clear buffers so they can be reused in next 'z_num' loop + std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop + std::fill(circularBuffer.begin(), circularBuffer.end(), 0); + // saturate circular buffer with #offset elements since it will allow to calculate first element value on LHS - while(count <= offset) { + while (count <= offset) { for (size_t k = 0; k < y_num; ++k) { auto v = mesh[jxnumynum + currElementOffset * y_num + k]; sum[k] += v; @@ -408,42 +412,45 @@ inline void LocalIntensityScale::calc_sat_mean_x(PixelData& 
input, const size ++count; } - currElementOffset -= nextElementOffset; - --count; - if (boundaryReflect) { - count = divisor; + count += offset; // elements in above loop in range [1, offset] were summed twice } // Pointer in circular buffer - int beginPtr = offset; + int beginPtr = (offset + 1) % divisor; + + // main loop going through all elements in range [0, x_num - 1 - offset], so till last element that + // does not need handling RHS for offset '^' + // x x x x ... x x x x x x x + // o o ^ o o + // + const size_t lastElement = x_num - 1 - offset; + for (size_t x = 0; x <= lastElement; ++x) { + // Calculate and save currently processed element and move to the new one + for (size_t k = 0; k < y_num; ++k) { + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; + } + saveElementOffset += nextElementOffset; + + // There is no more elements to process in that loop, all stuff left to be processed is already in 'circularBuffer' buffer + if (x == lastElement) break; - // main loop going through all elements in range [0, x_num-offset) - for (size_t x = 0; x < dimLen - offset; ++x) { for (size_t k = 0; k < y_num; ++k) { // Read new element T v = mesh[jxnumynum + currElementOffset * y_num + k]; // Update sum to cover [-offset, offset] of currently processed element + sum[k] -= circularBuffer[beginPtr * y_num + k]; sum[k] += v; - if (count >= divisor || x == 0) sum[k] -= circularBuffer[beginPtr * y_num + k]; - // Save new element + // Store new element in circularBuffer circularBuffer[beginPtr * y_num + k] = v; } - // move pointer in circular buffer and number of active elements hold there - beginPtr = (beginPtr + 1) % divisor; + // Move to next elements to read and in circular buffer count = std::min(count + 1, divisor); - - for (size_t k = 0; k < y_num; ++k) { - // save currently processed element - mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; - } - - // Move to next elements + beginPtr = (beginPtr + 1) % divisor; currElementOffset += nextElementOffset; - saveElementOffset += nextElementOffset; } // boundaryPtr is used only in boundaryReflect mode, adding (2*offset+1) makes it always non-negative value @@ -478,8 +485,6 @@ inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size beginPtr = (beginPtr + 1) % divisor; saveElementOffset += nextElementOffset; } - - std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop } } diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index a5c52b63..d66810c9 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -80,7 +80,7 @@ namespace { // boundary = 0 there is no reflected boundary // boudnary = 1 there is boundary reflect for (int offset = 0; offset <= 6; ++offset) { - //std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; +// std::cout << "------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; PixelData mCpu; mCpu.init(m); @@ -96,7 +96,94 @@ namespace { timer.stop_timer(); // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); // Expect exactly same results + } + } + } + + /** + * Generate input and expected output using easy brute force approach. + * When comparing vs CPU or GPU outputs there is small error expected since little difference in order of float + * operations. 
+ * @tparam T - type of generated data + * @param len - length + * @param offset - offset for which expected output should be calculated + * @param boundary - use boundary? + * @param useRandomNumbers - use random numbers or if false then index numbers in buffers [1..len] + * @return tuple of [input, expectedOutput] + */ + template + auto generateInputAndExpected(int len, int offset, bool boundary, bool useRandomNumbers) { + std::vector input(len); + std::vector expected(len); + + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_real_distribution dist(0.0, 10.0); + + // Feel input and calculate expected data + for (int i = 0; i < len; ++i) input[i] = useRandomNumbers ? dist(mt) : i + 1; + + for (int i = 0; i < len; ++i) { + int count = 0; + T sum = 0; + for (int x = i - offset; x <= i + offset; ++x) { + int currIdx = x; + if (boundary) { + currIdx = abs(x); + if (currIdx > len - 1) currIdx = (len - 1) - (currIdx - (len - 1)); + } + + if (currIdx < 0 || currIdx >= len) continue; + + sum += input[currIdx]; + count++; + } + expected[i] = sum / count; + } + return std::make_tuple(input, expected); + } + + TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_RANDOM_VALUES_X_DIR) { + // Input params + using T = uint16_t; + + for (int b = 0; b <= 1; b++) { + for (int len = 5; len <= 45; len += 20) { + for (int offset = 0; offset <= 6 && offset < len; offset++) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = b > 0; + bool useRandomNumbers = r > 0; +// std::cout << "========================> len=" << len << " offset=" << offset << " hasBoundary=" << hasBoundary << " useRandomNumbers=" << useRandomNumbers << std::endl; + + auto t = generateInputAndExpected(len, offset, hasBoundary, useRandomNumbers); + auto input = std::get<0>(t); + auto expected = std::get<1>(t); + PixelData m(1, len, 1, 0); + initFromZYXarray(m, input.data()); + PixelData expectedMesh(1, len, 1, 0); + initFromZYXarray(expectedMesh, expected.data()); + + APRTimer timer(false); + LocalIntensityScale lis; + + // Run on CPU old-impl + timer.start_timer("CPU X-DIR"); + PixelData mCpu(m, true); + lis.calc_sat_mean_x(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, (hasBoundary > 0)); + timer.stop_timer(); + + EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match"; + EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match"; + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; + } + } } } } @@ -254,213 +341,213 @@ namespace { // } // ------------------------------------------------------------------------ - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Y_DIR) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(22, 33, 22, 100, 3); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - - std::cout << " ============================== " << offset << std::endl; - - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean Y-DIR"); - lis.calc_sat_mean_y(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean Y-DIR"); - calcMean(mGpu, offset, MEAN_Y_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } - - - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Z_DIR) { - APRTimer timer(true); - using ImgType = float; - PixelData m = 
getRandInitializedMesh(22, 33, 22, 255); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - - std::cout << " ============================== " << offset << std::endl; - - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean Z-DIR"); - lis.calc_sat_mean_z(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean Z-DIR"); - calcMean(mGpu, offset, MEAN_Z_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); - } - } - - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Y_DIR) { - APRTimer timer(true); - PixelData m(4, 4, 1, 0); - float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; - initFromZYXarray(m, dataIn); - - LocalIntensityScale lis; - - for (int boundary = 1; boundary <= 1; ++ boundary) { - // boundary = 0 there is no reflected boundary - // boudnary = 1 there is boundary reflect - std::cout << "\n\n"; - for (int offset = 1; offset < 2; ++offset) { - // Run on CPU - PixelData mCpuPadded; - paddPixels(m, mCpuPadded, offset * boundary, offset * boundary, 0); - timer.start_timer("CPU mean Y-DIR"); - lis.calc_sat_mean_y(mCpuPadded, offset); - PixelData mCpu; - unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean Y-DIR"); - calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0)); +// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Y_DIR) { +// APRTimer timer(true); +// PixelData m = getRandInitializedMesh(22, 33, 22, 100, 3); +// +// LocalIntensityScale lis; +// for (int offset = 0; offset < 6; ++offset) { +// +// std::cout << " ============================== " << offset << std::endl; +// +// // Run on CPU +// PixelData mCpu(m, true); +// timer.start_timer("CPU mean Y-DIR"); +// lis.calc_sat_mean_y(mCpu, offset); +// timer.stop_timer(); +// +// // Run on GPU +// PixelData mGpu(m, true); +// timer.start_timer("GPU mean Y-DIR"); +// calcMean(mGpu, offset, MEAN_Y_DIR); +// timer.stop_timer(); +// +// // Compare results +// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); +// } +// } - timer.stop_timer(); - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01, 4), 0); - } - } - } +// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Z_DIR) { +// APRTimer timer(true); +// using ImgType = float; +// PixelData m = getRandInitializedMesh(22, 33, 22, 255); +// +// LocalIntensityScale lis; +// for (int offset = 0; offset < 6; ++offset) { +// +// std::cout << " ============================== " << offset << std::endl; +// +// // Run on CPU +// PixelData mCpu(m, true); +// timer.start_timer("CPU mean Z-DIR"); +// lis.calc_sat_mean_z(mCpu, offset); +// timer.stop_timer(); +// +// // Run on GPU +// PixelData mGpu(m, true); +// timer.start_timer("GPU mean Z-DIR"); +// calcMean(mGpu, offset, MEAN_Z_DIR); +// timer.stop_timer(); +// +// // Compare results +// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); +// } +// } - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Z_DIR) { - APRTimer timer(true); -// PixelData m(1, 1, 13, 0); -// float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; +// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Y_DIR) { +// APRTimer timer(true); +// PixelData m(4, 4, 1, 0); +// float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; // initFromZYXarray(m, dataIn); - PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); - +// +// LocalIntensityScale lis; +// +// 
for (int boundary = 1; boundary <= 1; ++ boundary) { +// // boundary = 0 there is no reflected boundary +// // boudnary = 1 there is boundary reflect +// std::cout << "\n\n"; +// for (int offset = 1; offset < 2; ++offset) { +// // Run on CPU +// PixelData mCpuPadded; +// paddPixels(m, mCpuPadded, offset * boundary, offset * boundary, 0); +// timer.start_timer("CPU mean Y-DIR"); +// lis.calc_sat_mean_y(mCpuPadded, offset); +// PixelData mCpu; +// unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); +// timer.stop_timer(); +// +// // Run on GPU +// PixelData mGpu(m, true); +// timer.start_timer("GPU mean Y-DIR"); +// calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0)); +// +// timer.stop_timer(); +// +// // Compare results +// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01, 4), 0); +// } +// } +// } - LocalIntensityScale lis; - for (int boundary = 0; boundary <= 1; ++ boundary) { - // boundary = 0 there is no reflected boundary - // boudnary = 1 there is boundary reflect - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpuPadded; - paddPixels(m, mCpuPadded, 0, 0, offset * boundary); - timer.start_timer("CPU mean Z-DIR"); - lis.calc_sat_mean_z(mCpuPadded, offset); - PixelData mCpu; - unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); - timer.stop_timer(); - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean Z-DIR"); - calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0)); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); - } - } - } +// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Z_DIR) { +// APRTimer timer(true); +//// PixelData m(1, 1, 13, 0); +//// float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; +//// initFromZYXarray(m, dataIn); +// PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); +// +// +// LocalIntensityScale lis; +// +// for (int boundary = 0; boundary <= 1; ++ boundary) { +// // boundary = 0 there is no reflected boundary +// // boudnary = 1 there is boundary reflect +// for (int offset = 0; offset < 6; ++offset) { +// // Run on CPU +// PixelData mCpuPadded; +// paddPixels(m, mCpuPadded, 0, 0, offset * boundary); +// timer.start_timer("CPU mean Z-DIR"); +// lis.calc_sat_mean_z(mCpuPadded, offset); +// PixelData mCpu; +// unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); +// timer.stop_timer(); +// +// // Run on GPU +// PixelData mGpu(m, true); +// timer.start_timer("GPU mean Z-DIR"); +// calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0)); +// timer.stop_timer(); +// +// // Compare results +// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); +// } +// } +// } // !!!!!!!!!!!!!!!!!!!!!!! NOT YET CHECKED !!!!!!!!!!!!!!!!!!!!!!!!!!!!! // TODO: See what these tests are doing and fix/change/remove them! 
- TEST(LocalIntensityScaleCudaTest, 1D_Y_DIR) { - { // OFFSET=0 - - PixelData m(8, 1, 1, 0); - float dataIn[] = {3,6,9,12,15,18,21,24}; - float expect[] = {3,6,9,12,15,18,21,24}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 0, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - } - { // OFFSET=1 - - PixelData m(8, 1, 1, 0); - float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8}; - float expect[] = {1.5, 2, 3, 4, 5, 6, 7, 7.5}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 1, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - } - { // OFFSET=2 (+symmetricity check) - - PixelData m(8, 1, 1, 0); - float dataIn[] = {3,6,9,12,15,18,21,24}; - float expect[] = {6, 7.5, 9, 12, 15, 18, 19.5, 21}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 2, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - - // check if data in opposite order gives same result - float dataIn2[] = {24,21,18,15,12,9,6,3}; - float expect2[] = {21, 19.5, 18, 15,12, 9, 7.5, 6}; - - initFromZYXarray(m, dataIn2); - - calcMean(m, 2, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect2, 0.05)); - } - } - +// TEST(LocalIntensityScaleCudaTest, 1D_Y_DIR) { +// { // OFFSET=0 +// +// PixelData m(8, 1, 1, 0); +// float dataIn[] = {3,6,9,12,15,18,21,24}; +// float expect[] = {3,6,9,12,15,18,21,24}; +// +// initFromZYXarray(m, dataIn); +// +// calcMean(m, 0, MEAN_Y_DIR); +// +// ASSERT_TRUE(compare(m, expect, 0.05)); +// } +// { // OFFSET=1 +// +// PixelData m(8, 1, 1, 0); +// float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8}; +// float expect[] = {1.5, 2, 3, 4, 5, 6, 7, 7.5}; +// +// initFromZYXarray(m, dataIn); +// +// calcMean(m, 1, MEAN_Y_DIR); +// +// ASSERT_TRUE(compare(m, expect, 0.05)); +// } +// { // OFFSET=2 (+symmetricity check) +// +// PixelData m(8, 1, 1, 0); +// float dataIn[] = {3,6,9,12,15,18,21,24}; +// float expect[] = {6, 7.5, 9, 12, 15, 18, 19.5, 21}; +// +// initFromZYXarray(m, dataIn); +// +// calcMean(m, 2, MEAN_Y_DIR); +// +// ASSERT_TRUE(compare(m, expect, 0.05)); +// +// // check if data in opposite order gives same result +// float dataIn2[] = {24,21,18,15,12,9,6,3}; +// float expect2[] = {21, 19.5, 18, 15,12, 9, 7.5, 6}; +// +// initFromZYXarray(m, dataIn2); +// +// calcMean(m, 2, MEAN_Y_DIR); +// +// ASSERT_TRUE(compare(m, expect2, 0.05)); +// } +// } - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(33, 31, 13); - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean ALL-DIR"); - lis.calc_sat_mean_y(mCpu, offset); - lis.calc_sat_mean_x(mCpu, offset); - lis.calc_sat_mean_z(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean ALL-DIR"); - calcMean(mGpu, offset); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } +// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) { +// APRTimer timer(true); +// PixelData m = getRandInitializedMesh(33, 31, 13); +// +// LocalIntensityScale lis; +// for (int offset = 0; offset < 6; ++offset) { +// // Run on CPU +// PixelData mCpu(m, true); +// timer.start_timer("CPU mean ALL-DIR"); +// lis.calc_sat_mean_y(mCpu, offset); +// lis.calc_sat_mean_x(mCpu, offset); +// lis.calc_sat_mean_z(mCpu, offset); +// timer.stop_timer(); +// +// // Run on GPU +// PixelData mGpu(m, true); +// timer.start_timer("GPU mean ALL-DIR"); +// calcMean(mGpu, offset); +// timer.stop_timer(); +// +// // Compare results +// 
EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); +// } +// } //@KG: The CPU code doesn't work for uint16 --> overflow will likely result. @@ -489,36 +576,36 @@ namespace { // } // } - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); - - APRParameters params; - params.sigma_th = 1; - params.sigma_th_max = 2; - params.reflect_bc_lis = false; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. - - // Run on CPU - PixelData mCpu(m, true); - PixelData mCpuTemp(m, false); - timer.start_timer("CPU LIS FULL"); - - LocalIntensityScale localIntensityScale; - - localIntensityScale.get_local_intensity_scale(mCpu, mCpuTemp, params); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - PixelData mGpuTemp(m, false); - timer.start_timer("GPU LIS ALL-DIR"); - getLocalIntensityScale(mGpu, mGpuTemp, params); - timer.stop_timer(); - - // Compare results - //EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0.01), 0); //this is not needed these values are not required. - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); - } +// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { +// APRTimer timer(true); +// PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); +// +// APRParameters params; +// params.sigma_th = 1; +// params.sigma_th_max = 2; +// params.reflect_bc_lis = false; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. +// +// // Run on CPU +// PixelData mCpu(m, true); +// PixelData mCpuTemp(m, false); +// timer.start_timer("CPU LIS FULL"); +// +// LocalIntensityScale localIntensityScale; +// +// localIntensityScale.get_local_intensity_scale(mCpu, mCpuTemp, params); +// timer.stop_timer(); +// +// // Run on GPU +// PixelData mGpu(m, true); +// PixelData mGpuTemp(m, false); +// timer.start_timer("GPU LIS ALL-DIR"); +// getLocalIntensityScale(mGpu, mGpuTemp, params); +// timer.stop_timer(); +// +// // Compare results +// //EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0.01), 0); //this is not needed these values are not required. +// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); +// } #endif // APR_USE_CUDA From 521d8264634085b5f87d70da7bba86cae5693e18 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 24 Feb 2023 13:42:28 +0100 Subject: [PATCH 14/80] LIS in Z-dir redesigned so code is clearer and faster. Also new test added. --- src/algorithm/LocalIntensityScale.cu | 35 ++++++++++------- src/algorithm/LocalIntensityScale.hpp | 45 +++++++++++---------- test/LocalIntensityScaleCudaTest.cpp | 56 +++++++++++++++++++++++++-- 3 files changed, 99 insertions(+), 37 deletions(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 11de9de3..06e2996a 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -249,35 +249,42 @@ __global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_ currElementOffset += nextElementOffset; ++count; } - currElementOffset -= nextElementOffset; - --count; + if (boundaryReflect) { - count = divisor; + count += offset; // elements in above loop in range [1, offset] were summed twice } // Pointer in circular buffer - int beginPtr = offset; + int beginPtr = (offset + 1) % divisor; + + // main loop going through all elements in range [0, z_num - 1 - offset], so till last element that + // does not need handling RHS for offset '^' + // x x x x ... 
x x x x x x x + // o o ^ o o + // + const int lastElement = z_num - 1 - offset; + for (int z = 0; z <= lastElement; ++z) { + // Calculate and save currently processed element and move to the new one + image[workerOffset + saveElementOffset] = sum / count; + saveElementOffset += nextElementOffset; + + // There is no more elements to process in that loop, all stuff left to be processed is already in 'data' buffer + if (z == lastElement) break; - // main loop going through all elements in range [0, z_num-offset) - for (int z = 0; z < z_num - offset; ++z) { // Read new element T v = image[workerOffset + currElementOffset]; // Update sum to cover [-offset, offset] of currently processed element - sum += v; sum -= data[beginPtr][workerIdx]; + sum += v; - // Save and move pointer + // Store new element in circularBuffer data[beginPtr][workerIdx] = v; - beginPtr = (beginPtr + 1) % divisor; - // Update count and save currently processed element + // Move to next elements to read and in circular buffer count = min(count + 1, divisor); - image[workerOffset + saveElementOffset] = sum / count; - - // Move to next elements + beginPtr = (beginPtr + 1) % divisor; currElementOffset += nextElementOffset; - saveElementOffset += nextElementOffset; } // Handle last #offset elements on RHS diff --git a/src/algorithm/LocalIntensityScale.hpp b/src/algorithm/LocalIntensityScale.hpp index 28376e84..30cc4be6 100644 --- a/src/algorithm/LocalIntensityScale.hpp +++ b/src/algorithm/LocalIntensityScale.hpp @@ -517,6 +517,10 @@ inline void LocalIntensityScale::calc_sat_mean_z(PixelData& input, const size size_t nextElementOffset = x_num; size_t saveElementOffset = 0; // offset used to finish RHS boundary + // Clear buffers so they can be reused in next 'x_num' loop + std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop + std::fill(circularBuffer.begin(), circularBuffer.end(), 0); + // saturate circular buffer with #offset elements since it will allow to calculate first element value on LHS while(count <= offset) { for (size_t k = 0; k < y_num; ++k) { @@ -530,42 +534,45 @@ inline void LocalIntensityScale::calc_sat_mean_z(PixelData& input, const size ++count; } - currElementOffset -= nextElementOffset; - --count; - if (boundaryReflect) { - count = divisor; + count += offset; // elements in above loop in range [1, offset] were summed twice } // Pointer in circular buffer - int beginPtr = offset; + int beginPtr = (offset + 1) % divisor; + + // main loop going through all elements in range [0, z_num - 1 - offset], so till last element that + // does not need handling RHS for offset '^' + // x x x x ... 
x x x x x x x + // o o ^ o o + // + const size_t lastElement = z_num - 1 - offset; + for (size_t z = 0; z <= lastElement; ++z) { + // Calculate and save currently processed element and move to the new one + for (size_t k = 0; k < y_num; ++k) { + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; + } + saveElementOffset += nextElementOffset; + + // There is no more elements to process in that loop, all stuff left to be processed is already in 'circularBuffer' buffer + if (z == lastElement) break; - // main loop going through all elements in range [0, x_num-offset) - for (size_t z = 0; z < dimLen - offset; ++z) { for (size_t k = 0; k < y_num; ++k) { // Read new element T v = mesh[jxnumynum + currElementOffset * y_num + k]; // Update sum to cover [-offset, offset] of currently processed element + sum[k] -= circularBuffer[beginPtr * y_num + k]; sum[k] += v; - if (count >= divisor || z == 0) sum[k] -= circularBuffer[beginPtr * y_num + k]; // Save new element circularBuffer[beginPtr * y_num + k] = v; } - // move pointer in circular buffer and number of active elements hold there - beginPtr = (beginPtr + 1) % divisor; + // Move to next elements to read and in circular buffer count = std::min(count + 1, divisor); - - for (size_t k = 0; k < y_num; ++k) { - // save currently processed element - mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; - } - - // Move to next elements + beginPtr = (beginPtr + 1) % divisor; currElementOffset += nextElementOffset; - saveElementOffset += nextElementOffset; } // boundaryPtr is used only in boundaryReflect mode, adding (2*offset+1) makes it always non-negative value @@ -600,8 +607,6 @@ inline void LocalIntensityScale::calc_sat_mean_z(PixelData& input, const size beginPtr = (beginPtr + 1) % divisor; saveElementOffset += nextElementOffset; } - - std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop } } diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index d66810c9..d15f6561 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -144,9 +144,9 @@ namespace { return std::make_tuple(input, expected); } - TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_RANDOM_VALUES_X_DIR) { + TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_PRECOMPUTED_VALUES_X_DIR) { // Input params - using T = uint16_t; + using T = float; for (int b = 0; b <= 1; b++) { for (int len = 5; len <= 45; len += 20) { @@ -179,6 +179,8 @@ namespace { calcMean(mGpu, offset, MEAN_X_DIR, (hasBoundary > 0)); timer.stop_timer(); + // expectedMesh because of different order of calculation will have small floating-point differences + // comparing to CPU or GPU fast implementation, anyway GPU and CPU should have exactly same values! 
EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match"; EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match"; EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; @@ -249,7 +251,7 @@ namespace { TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_Z_DIR_RANDOM_VALUES) { APRTimer timer(false); - constexpr PixelDataDim const dim{49,51,53}; + constexpr PixelDataDim const dim{49, 51, 53}; PixelData m = getRandInitializedMesh(dim, 50, 10); LocalIntensityScale lis; @@ -279,6 +281,54 @@ namespace { } } + TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_PRECOMPUTED_VALUES_Z_DIR) { + // Input params + using T = float; + + for (int b = 0; b <= 1; b++) { + for (int len = 5; len <= 45; len += 20) { + for (int offset = 0; offset <= 6 && offset < len; offset++) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = b > 0; + bool useRandomNumbers = r > 0; +// std::cout << "========================> len=" << len << " offset=" << offset << " hasBoundary=" << hasBoundary << " useRandomNumbers=" << useRandomNumbers << std::endl; + + auto t = generateInputAndExpected(len, offset, hasBoundary, useRandomNumbers); + auto input = std::get<0>(t); + auto expected = std::get<1>(t); + PixelData m(1, 1, len, 0); + initFromZYXarray(m, input.data()); + PixelData expectedMesh(1, 1, len, 0); + initFromZYXarray(expectedMesh, expected.data()); + + APRTimer timer(false); + LocalIntensityScale lis; + + // Run on CPU old-impl + timer.start_timer("CPU Z-DIR"); + PixelData mCpu(m, true); + lis.calc_sat_mean_z(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU Z-DIR"); + calcMean(mGpu, offset, MEAN_Z_DIR, (hasBoundary > 0)); + timer.stop_timer(); + + // expectedMesh because of different order of calculation will have small floating-point differences + // comparing to CPU or GPU fast implementation, anyway GPU and CPU should have exactly same values! + EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) + << "---!!!!!!--- GPU values does not match"; + EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) + << "---!!!!!!--- CPU values does not match"; + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; + } + } + } + } + } + // ------------------------------------------------------------------------ // Below tests are not yet fixed. // ------------------------------------------------------------------------ From b297adf6b982aedbba44cc54859154e78b1f970c Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 13 Mar 2023 15:30:34 +0100 Subject: [PATCH 15/80] Local Intensity Scale (LIS) now works in Y-dir as expected. GPU and CPU gives same results. 
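
The Y-direction kernel now uses the same scheme as the X/Z directions: the whole
y-line is staged in shared memory and a running sum is kept over a circular
buffer of (2*offset+1) elements, mirroring the first #offset samples when
boundaryReflect is set. Element-wise the result should match a plain 1-D sliding
mean such as the reference sketch below (illustrative only, not part of the
library; the function name and the offset < length assumption are mine):

#include <vector>

// Reference sketch: mean over the window [i-offset, i+offset]. Indices are
// mirrored around the edge pixels when boundaryReflect is true, otherwise the
// window is truncated and the divisor shrinks near the edges.
// Assumes offset < in.size(), so a single reflection is always enough.
std::vector<float> slidingMean1D(const std::vector<float> &in, int offset, bool boundaryReflect) {
    const int n = static_cast<int>(in.size());
    std::vector<float> out(n, 0.0f);
    for (int i = 0; i < n; ++i) {
        float sum = 0.0f;
        int count = 0;
        for (int k = i - offset; k <= i + offset; ++k) {
            int idx = k;
            if (boundaryReflect) {
                if (idx < 0) idx = -idx;               // mirror around the first element
                if (idx >= n) idx = 2 * (n - 1) - idx; // mirror around the last element
            }
            if (idx < 0 || idx >= n) continue;         // truncated window (no reflection)
            sum += in[idx];
            ++count;
        }
        out[i] = sum / count;
    }
    return out;
}

For input {1,2,3,4,5} and offset=1 this gives {1.5, 2, 3, 4, 4.5} without
reflection and {1.66, 2, 3, 4, 4.33} with it, which is what the precomputed
tables in the updated tests expect.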
--- src/algorithm/LocalIntensityScale.cu | 158 ++++++--- src/algorithm/LocalIntensityScale.hpp | 121 ++++--- test/LocalIntensityScaleCudaTest.cpp | 486 +++++++++++--------------- 3 files changed, 375 insertions(+), 390 deletions(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 06e2996a..057e4de2 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -14,21 +14,14 @@ /** + * Calculates mean in Y direction * - * How it works along y-dir (let's suppose offset = 2 and number of workers = 8 for simplicity): - * - * image idx: 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 - * - * loop #1 - * workersIdx 0 1 2 3 4 5 6 7 - * loop #2 - * workersIdx 6 7 0 1 2 3 4 5 - * loop #3 - * workersIdx 4 5 6 7 0 1 2 3 - * .............. - * - * so #offset workers must wait in each loop to have next elements to sum - * + * NOTE: This is not optimal implementation but.. correct and more or less fast as previous one. + * The reason for change was to have results exactly same as in CPU side. + * Currently after reading whole y-dir line of data mean calculation is done only by one from all threads in block + * so here is some room for improvements. + * If needed may be optimized in future. The main limitation is size of shared memory needed which + * limits number of CUDA blocks that can run in parallel. * @tparam T * @param image * @param offset @@ -41,59 +34,113 @@ __global__ void meanYdir(T *image, int offset, size_t x_num, size_t y_num, size_ // NOTE: Block size in x/z direction must be 1 const size_t workersOffset = (blockIdx.z * x_num + blockIdx.x) * y_num; const int numOfWorkers = blockDim.y; - const unsigned int active = __activemask(); const int workerIdx = threadIdx.y; + + extern __shared__ char sharedMemChar[]; + T *buffer = (T*) sharedMemChar; + T *data = (T*) &buffer[y_num]; + + // Read whole line of data from y-direction int workerOffset = workerIdx; + while (workerOffset < y_num) { + buffer[workerOffset] = image[workersOffset + workerOffset]; + workerOffset += numOfWorkers; + } + + const int divisor = 2 * offset + 1; + size_t currElementOffset = 0; + size_t saveElementOffset = 0; + size_t nextElementOffset = 1; + + if (workerIdx == 0) { + // clear shared mem + for (int i = offset; i < divisor; ++i) data[i] = 0; + + // saturate cache with #offset elements since it will allow to calculate first element value on LHS + float sum = 0; + int count = 0; + while (count <= offset) { + T v = buffer[currElementOffset]; + sum += v; + data[count] = v; + if (boundaryReflect && count > 0) { + data[2 * offset - count + 1] = v; + sum += v; + } + currElementOffset += nextElementOffset; + ++count; + } - int offsetInTheLoop = 0; - T sum = 0; - T v = 0; - bool waitForNextLoop = false; - int countNumOfSumElements = 1; - while(workerOffset < y_num) { - if (!waitForNextLoop) v = image[workersOffset + workerOffset]; - bool waitForNextValues = (workerIdx + offsetInTheLoop) % numOfWorkers >= (numOfWorkers - offset); - - // Check if current value is one of the mirrored elements (boundary condition) - int numberOfMirrorLeft = offset - workerOffset; - int numberOfMirrorRight = workerOffset + offset - (y_num - 1); if (boundaryReflect) { - if (numberOfMirrorLeft > 0 && workerOffset >= 1 && workerOffset <= numberOfMirrorLeft) {sum += v; ++countNumOfSumElements;} - if (numberOfMirrorRight > 0 && workerOffset < (y_num - 1) && workerOffset >= (y_num - 1 - numberOfMirrorRight)) {sum += v; ++countNumOfSumElements;} + count += offset; // elements in 
above loop in range [1, offset] were summed twice } - for (int off = 1; off <= offset; ++off) { - T prevElement = __shfl_sync(active, v, workerIdx + blockDim.y - off, blockDim.y); - T nextElement = __shfl_sync(active, v, workerIdx + off, blockDim.y); - // LHS boundary check + don't add previous values if they were added in a previous loop execution - if (workerOffset >= off && !waitForNextLoop) {sum += prevElement; ++countNumOfSumElements;} - // RHS boundary check + don't read next values since they are not read yet - if (!waitForNextValues && (workerOffset + off) < y_num) {sum += nextElement; ++countNumOfSumElements;} + // Pointer in circular buffer + int beginPtr = (offset + 1) % divisor; + + // main loop going through all elements in range [0, y_num - 1 - offset], so till last element that + // does not need handling RHS for offset '^' + // x x x x ... x x x x x x x + // o o ^ o o + // + const int lastElement = y_num - 1 - offset; + for (int y = 0; y <= lastElement; ++y) { + // Calculate and save currently processed element and move to the new one + buffer[saveElementOffset] = sum / count; + saveElementOffset += nextElementOffset; + + // There is no more elements to process in that loop, all stuff left to be processed is already in 'data' buffer + if (y == lastElement) break; + + // Read new element + T v = buffer[currElementOffset]; + + // Update sum to cover [-offset, offset] of currently processed element + sum -= data[beginPtr]; + sum += v; + + // Store new element in circularBuffer + data[beginPtr] = v; + + // Move to next elements to read and in circular buffer + count = min(count + 1, divisor); + beginPtr = (beginPtr + 1) % divisor; + currElementOffset += nextElementOffset; + } + + // Handle last #offset elements on RHS + int boundaryPtr = (beginPtr - 1 - 1 + (2 * offset + 1)) % divisor; + + while (saveElementOffset < currElementOffset) { + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' and do not remove first element from moving filter + // since 'sum' of filter elements contains all elements from processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + // In such a case first 'o' element should not be removed when filter moves right. 
+ if (y_num - (currElementOffset - saveElementOffset) / nextElementOffset > offset || boundaryReflect) { + if (!boundaryReflect) count = count - 1; + sum -= data[beginPtr]; + } - // boundary condition (mirroring) if (boundaryReflect) { - int element = workerOffset + off; - if (numberOfMirrorLeft > 0 && element >= 1 && element <= numberOfMirrorLeft) {sum += nextElement; ++countNumOfSumElements;} - if (numberOfMirrorRight > 0 && element < (y_num - 1) && element >= (y_num - 1 - numberOfMirrorRight)) {sum += nextElement; ++countNumOfSumElements;} - element = workerOffset - off; - if (numberOfMirrorLeft > 0 && element >= 1 && element <= numberOfMirrorLeft) {sum += prevElement; ++countNumOfSumElements;} - if (numberOfMirrorRight > 0 && element < (y_num - 1) && element >= (y_num - 1 - numberOfMirrorRight)) {sum += prevElement; ++countNumOfSumElements;} + sum += data[boundaryPtr]; + boundaryPtr = (boundaryPtr - 1 + (2 * offset + 1)) % divisor; } - } - waitForNextLoop = waitForNextValues; - if (!waitForNextLoop) { - sum += v; - image[workersOffset + workerOffset] = sum / countNumOfSumElements; - // worker is done with current element - move to next one - sum = 0; - countNumOfSumElements = 1; - workerOffset += numOfWorkers; + buffer[saveElementOffset] = sum / count; + beginPtr = (beginPtr + 1) % divisor; + saveElementOffset += nextElementOffset; } - offsetInTheLoop += offset; } -} + // Save whole line of data + workerOffset = workerIdx; + while (workerOffset < y_num) { + image[workersOffset + workerOffset] = buffer[workerOffset]; + workerOffset += numOfWorkers; + } +} constexpr int NumberOfWorkers = 32; // Cannot be greater than 32 since there is no inter-warp communication implemented. /** @@ -320,7 +367,8 @@ void runMeanYdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_ dim3 numBlocks((x_num + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (z_num + threadsPerBlock.z - 1)/threadsPerBlock.z); - meanYdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); + const int sharedMemorySize = sizeof(T) * y_num + (offset * 2 + 1) * sizeof(float); + meanYdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); } template diff --git a/src/algorithm/LocalIntensityScale.hpp b/src/algorithm/LocalIntensityScale.hpp index 30cc4be6..3f7fffef 100644 --- a/src/algorithm/LocalIntensityScale.hpp +++ b/src/algorithm/LocalIntensityScale.hpp @@ -159,7 +159,7 @@ void get_local_intensity_scale(PixelData &local_scale_temp, PixelData &input, const size_t offset, bool boundaryReflect = false); template - void calc_sat_mean_y(PixelData &input, const size_t offset); + void calc_sat_mean_y(PixelData &input, const size_t offset, bool boundaryReflect = false); void get_window(float &var_rescale, std::vector &var_win, const APRParameters &par); @@ -302,66 +302,91 @@ inline void LocalIntensityScale::get_window_alt(float& var_rescale, std::vector< } } -/** - * Calculates a O(1) recursive mean using SAT. 
- * @tparam T - * @param input - * @param offset - */ template -inline void LocalIntensityScale::calc_sat_mean_y(PixelData& input, const size_t offset){ +inline void LocalIntensityScale::calc_sat_mean_y(PixelData& input, const size_t offset, bool boundaryReflect) { const size_t z_num = input.z_num; const size_t x_num = input.x_num; const size_t y_num = input.y_num; - std::vector temp_vec(y_num); - float divisor = 2 * offset + 1; + const size_t divisor = offset + 1 + offset; + + auto &mesh = input.mesh; + const size_t dimLen = y_num; #ifdef HAVE_OPENMP - #pragma omp parallel for default(shared) firstprivate(temp_vec) + #pragma omp parallel for default(shared) #endif - for(size_t j = 0; j < z_num; ++j) { - for(size_t i = 0; i < x_num; ++i){ - size_t index = j * x_num*y_num + i * y_num; - - //first pass over and calculate cumsum - float temp = 0; - for (size_t k = 0; k < y_num; ++k) { - temp += input.mesh[index + k]; - temp_vec[k] = temp; + for (size_t j = 0; j < z_num; ++j) { + for (size_t i = 0; i < x_num; ++i) { + size_t index = j * x_num * y_num + i * y_num; + + size_t count = 0; + size_t currElementOffset = 0; + size_t nextElementOffset = 1; + size_t saveElementOffset = 0; + + std::vector circularBuffer(divisor, 0); + T sum = 0; + + while (count <= offset) { + auto v = mesh[index + currElementOffset]; + sum += v; + circularBuffer[count] = v; + if (boundaryReflect && count > 0) { circularBuffer[2 * offset - count + 1] = v; sum += v;} + + currElementOffset += nextElementOffset; + count++; } - //handling boundary conditions (LHS) - for (size_t k = 0; k <= offset; ++k) { - input.mesh[index + k] = 0; - } + if (boundaryReflect) count += offset; - //second pass calculate mean - for (size_t k = offset + 1; k < y_num; ++k) { - input.mesh[index + k] = -temp_vec[k - offset - 1]/divisor; - } + int beginPtr = (offset + 1) % divisor; - //second pass calculate mean - for (size_t k = 0; k < (y_num-offset); ++k) { - input.mesh[index + k] += temp_vec[k + offset]/divisor; - } + const int lastElement = dimLen - 1 - offset; + for (int i = 0; i <= lastElement; ++i) { + mesh[index + saveElementOffset] = sum / count; + saveElementOffset += nextElementOffset; + + if (i == lastElement) break; - float counter = 0; - //handling boundary conditions (RHS) - for (size_t k = (y_num - offset); k < (y_num); ++k) { - counter++; - input.mesh[index + k]*= divisor; - input.mesh[index + k]+= temp_vec[y_num-1]; - input.mesh[index + k]*= 1.0/(divisor - counter); + auto v = mesh[index + currElementOffset]; + + sum -= circularBuffer[beginPtr]; + sum += v; + + circularBuffer[beginPtr] = v; + + count = std::min(count + 1, divisor); + beginPtr = (beginPtr + 1) % divisor; + currElementOffset += nextElementOffset; } - //handling boundary conditions (LHS), need to rehandle the boundary - for (size_t k = 1; k <= offset; ++k) { - input.mesh[index + k] *= divisor/(k + offset + 1.0); + int boundaryPtr = (beginPtr - 1 - 1 + divisor) % divisor; + while(saveElementOffset < currElementOffset) { + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' since 'sum' of filter elements contains all elements from + // processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter + bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset) / nextElementOffset > offset; + + if (removeElementFromFilter) { + if (!boundaryReflect) count = count - 1; + } + if (removeElementFromFilter || boundaryReflect) { + sum -= 
circularBuffer[beginPtr]; + } + if (boundaryReflect) { + sum += circularBuffer[boundaryPtr]; + } + + mesh[index + saveElementOffset] = sum / count; + + boundaryPtr = (boundaryPtr - 1 + divisor) % divisor; + beginPtr = (beginPtr + 1) % divisor; + saveElementOffset += nextElementOffset; } - //end point boundary condition - input.mesh[index] *= divisor/(offset + 1.0); } } } @@ -453,8 +478,8 @@ inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size currElementOffset += nextElementOffset; } - // boundaryPtr is used only in boundaryReflect mode, adding (2*offset+1) makes it always non-negative value - int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; + // boundaryPtr is used only in boundaryReflect mode, adding divisor makes it always non-negative value + int boundaryPtr = (beginPtr - 1 - 1 + divisor) % divisor; // Handle last #offset elements on RHS while(saveElementOffset < currElementOffset) { @@ -575,8 +600,8 @@ inline void LocalIntensityScale::calc_sat_mean_z(PixelData& input, const size currElementOffset += nextElementOffset; } - // boundaryPtr is used only in boundaryReflect mode, adding (2*offset+1) makes it always non-negative value - int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; + // boundaryPtr is used only in boundaryReflect mode, adding divisor makes it always non-negative value + int boundaryPtr = (beginPtr - 1 - 1 + divisor) % divisor; // Handle last #offset elements on RHS while(saveElementOffset < currElementOffset) { diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index d15f6561..d2ca284b 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -176,14 +176,14 @@ namespace { // Run on GPU PixelData mGpu(m, true); timer.start_timer("GPU X-DIR"); - calcMean(mGpu, offset, MEAN_X_DIR, (hasBoundary > 0)); + calcMean(mGpu, offset, MEAN_X_DIR, hasBoundary); timer.stop_timer(); // expectedMesh because of different order of calculation will have small floating-point differences // comparing to CPU or GPU fast implementation, anyway GPU and CPU should have exactly same values! EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match"; EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match"; - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; } } } @@ -313,15 +313,13 @@ namespace { // Run on GPU PixelData mGpu(m, true); timer.start_timer("GPU Z-DIR"); - calcMean(mGpu, offset, MEAN_Z_DIR, (hasBoundary > 0)); + calcMean(mGpu, offset, MEAN_Z_DIR, hasBoundary); timer.stop_timer(); // expectedMesh because of different order of calculation will have small floating-point differences // comparing to CPU or GPU fast implementation, anyway GPU and CPU should have exactly same values! 
- EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) - << "---!!!!!!--- GPU values does not match"; - EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) - << "---!!!!!!--- CPU values does not match"; + EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match"; + EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match"; EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; } } @@ -329,311 +327,215 @@ namespace { } } - // ------------------------------------------------------------------------ - // Below tests are not yet fixed. - // ------------------------------------------------------------------------ + TEST(LocalIntensityScaleCudaTest, CPU_AND_GPU_TEST_Y_DIR_VS_MANUALLY_CALCULATED_VALUES) { + // Belows data is precomputed for y_len = 5 (and maximum offset = 4) so do not change these numbers! + constexpr PixelDataDim const dim{5, 1, 1}; + float expectedData[2][5][dim.y] = { + { // with no boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, // offset = 0 + {1.50, 2.00, 3.00, 4.00, 4.50}, // offset = 1 + {2.00, 2.50, 3.00, 3.50, 4.00}, // offset = 2 + {2.50, 3.00, 3.00, 3.00, 3.50}, // offset = 3 + {3.00, 3.00, 3.00, 3.00, 3.00} // offset = 4 + }, + { // with boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, + {1.66, 2.00, 3.00, 4.00, 4.33}, + {2.20, 2.40, 3.00, 3.60, 3.80}, + {2.71, 2.85, 3.00, 3.14, 3.28}, + {3.22, 3.11, 3.00, 2.88, 2.77} + } + }; + APRTimer timer(false); // set to true to see timings -// TEST(LocalIntensityScaleCudaTest, REMOVE_ME_AFTER_DEVELOPMENT) { -// int y_num = 1; -// int x_num = 5; -// int z_num = 1; -//#if 1 -// PixelData m(y_num, x_num, z_num, 0); -// PixelData m2(y_num, x_num, z_num, 0); -// PixelData m3(y_num, x_num, z_num, 0); -// PixelData m4(y_num, x_num, z_num, 0); -// float dataIn[] = {1, 2, 3, 4, 5}; -//// float dataIn[] = {75.4539260864, 42.5445404053, 0.00003, 4, 0.00005, 6, 0.00007, 8, 0.00009, 10, 0.000011, 12}; -// -// initFromZYXarray(m, dataIn); -// initFromZYXarray(m2, dataIn); -// initFromZYXarray(m3, dataIn); -// initFromZYXarray(m4, dataIn); -//#else -// PixelData m = getRandInitializedMesh(y_num, x_num, z_num, 200, 0); -// PixelData m2(m, true); -// PixelData m3(m, true); -// PixelData m4(m, true); -//#endif -// -// LocalIntensityScale lis; -// -// int off = 4; -// -// std::cout << "INP:"; m.printMesh(1); -// -// bool boundary = true; -// -// APRTimer timer(true); -// calcMean(m3, off, MEAN_X_DIR, boundary); -// timer.start_timer("new"); -// lis.calc_sat_mean_x(m2, off, boundary); -// timer.stop_timer(); -// -// timer.start_timer("old"); -// PixelData mCpuPadded; -// paddPixels(m, mCpuPadded, 0, off, 0); -// lis.calc_sat_mean_x_orig(mCpuPadded, off); -// unpaddPixels(mCpuPadded, m4, m.y_num, m.x_num, m.z_num); -// timer.stop_timer(); -// -// std::cout << "CPU: "; m2.printMesh(1); -// std::cout << "GPU: "; m3.printMesh(1); -// std::cout << "CPU old: "; m4.printMesh(1); -// -// std::cout << "GPU vs NEW\n"; -// compareMeshes(m3, m2, 0.00000001, 3); -// std::cout << "OLD vs GPU\n"; -// compareMeshes(m4, m3, 0.00000001, 3); -// std::cout << "OLD vs NEW\n"; -// EXPECT_EQ(compareMeshes(m4, m2, 0.00000001, 3), 0); -// } - // ------------------------------------------------------------------------ + PixelData m(dim); + float dataIn[] = {1, 2, 3, 4, 5}; + initFromZYXarray(m, dataIn); -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Y_DIR) { -// APRTimer timer(true); -// PixelData m = getRandInitializedMesh(22, 33, 
22, 100, 3); -// -// LocalIntensityScale lis; -// for (int offset = 0; offset < 6; ++offset) { -// -// std::cout << " ============================== " << offset << std::endl; -// -// // Run on CPU -// PixelData mCpu(m, true); -// timer.start_timer("CPU mean Y-DIR"); -// lis.calc_sat_mean_y(mCpu, offset); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean Y-DIR"); -// calcMean(mGpu, offset, MEAN_Y_DIR); -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); -// } -// } + LocalIntensityScale lis; + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 4; ++offset) { + // std::cout << "------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean Y-DIR"); + lis.calc_sat_mean_y(mCpu, offset, (boundary > 0)); + timer.stop_timer(); -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Z_DIR) { -// APRTimer timer(true); -// using ImgType = float; -// PixelData m = getRandInitializedMesh(22, 33, 22, 255); -// -// LocalIntensityScale lis; -// for (int offset = 0; offset < 6; ++offset) { -// -// std::cout << " ============================== " << offset << std::endl; -// -// // Run on CPU -// PixelData mCpu(m, true); -// timer.start_timer("CPU mean Z-DIR"); -// lis.calc_sat_mean_z(mCpu, offset); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean Z-DIR"); -// calcMean(mGpu, offset, MEAN_Z_DIR); -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); -// } -// } + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Y-DIR"); + calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0)); + timer.stop_timer(); + // Compare results + PixelData expected(dim); + initFromZYXarray(expected, expectedData[boundary][offset]); + EXPECT_EQ(compareMeshes(expected, mGpu, 0.01), 0); + EXPECT_EQ(compareMeshes(expected, mCpu, 0.01), 0); -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Y_DIR) { -// APRTimer timer(true); -// PixelData m(4, 4, 1, 0); -// float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; -// initFromZYXarray(m, dataIn); -// -// LocalIntensityScale lis; -// -// for (int boundary = 1; boundary <= 1; ++ boundary) { -// // boundary = 0 there is no reflected boundary -// // boudnary = 1 there is boundary reflect -// std::cout << "\n\n"; -// for (int offset = 1; offset < 2; ++offset) { -// // Run on CPU -// PixelData mCpuPadded; -// paddPixels(m, mCpuPadded, offset * boundary, offset * boundary, 0); -// timer.start_timer("CPU mean Y-DIR"); -// lis.calc_sat_mean_y(mCpuPadded, offset); -// PixelData mCpu; -// unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean Y-DIR"); -// calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0)); -// -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01, 4), 0); -// } -// } -// } + // Also GPU and CPU should give exactly same output + EXPECT_EQ(compareMeshes(mGpu, mCpu, 0), 0); + } + } + } + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_Y_DIR_RANDOM_VALUES) { + APRTimer timer(false); + constexpr PixelDataDim const dim{49, 51, 53}; + PixelData m = 
getRandInitializedMesh(dim, 2, 0,false); -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Z_DIR) { -// APRTimer timer(true); -//// PixelData m(1, 1, 13, 0); -//// float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; -//// initFromZYXarray(m, dataIn); -// PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); -// -// -// LocalIntensityScale lis; -// -// for (int boundary = 0; boundary <= 1; ++ boundary) { -// // boundary = 0 there is no reflected boundary -// // boudnary = 1 there is boundary reflect -// for (int offset = 0; offset < 6; ++offset) { -// // Run on CPU -// PixelData mCpuPadded; -// paddPixels(m, mCpuPadded, 0, 0, offset * boundary); -// timer.start_timer("CPU mean Z-DIR"); -// lis.calc_sat_mean_z(mCpuPadded, offset); -// PixelData mCpu; -// unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean Z-DIR"); -// calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0)); -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); -// } -// } -// } + LocalIntensityScale lis; + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 6; ++offset) { +// std::cout << "---------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; - // !!!!!!!!!!!!!!!!!!!!!!! NOT YET CHECKED !!!!!!!!!!!!!!!!!!!!!!!!!!!!! - // TODO: See what these tests are doing and fix/change/remove them! + PixelData mCpu(m, true); + timer.start_timer("CPU mean Y-DIR"); + lis.calc_sat_mean_y(mCpu, offset, (boundary > 0)); + timer.stop_timer(); -// TEST(LocalIntensityScaleCudaTest, 1D_Y_DIR) { -// { // OFFSET=0 -// -// PixelData m(8, 1, 1, 0); -// float dataIn[] = {3,6,9,12,15,18,21,24}; -// float expect[] = {3,6,9,12,15,18,21,24}; -// -// initFromZYXarray(m, dataIn); -// -// calcMean(m, 0, MEAN_Y_DIR); -// -// ASSERT_TRUE(compare(m, expect, 0.05)); -// } -// { // OFFSET=1 -// -// PixelData m(8, 1, 1, 0); -// float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8}; -// float expect[] = {1.5, 2, 3, 4, 5, 6, 7, 7.5}; -// -// initFromZYXarray(m, dataIn); -// -// calcMean(m, 1, MEAN_Y_DIR); -// -// ASSERT_TRUE(compare(m, expect, 0.05)); -// } -// { // OFFSET=2 (+symmetricity check) -// -// PixelData m(8, 1, 1, 0); -// float dataIn[] = {3,6,9,12,15,18,21,24}; -// float expect[] = {6, 7.5, 9, 12, 15, 18, 19.5, 21}; -// -// initFromZYXarray(m, dataIn); -// -// calcMean(m, 2, MEAN_Y_DIR); -// -// ASSERT_TRUE(compare(m, expect, 0.05)); -// -// // check if data in opposite order gives same result -// float dataIn2[] = {24,21,18,15,12,9,6,3}; -// float expect2[] = {21, 19.5, 18, 15,12, 9, 7.5, 6}; -// -// initFromZYXarray(m, dataIn2); -// -// calcMean(m, 2, MEAN_Y_DIR); -// -// ASSERT_TRUE(compare(m, expect2, 0.05)); -// } -// } + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Y-DIR"); + calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0)); + timer.stop_timer(); + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_PRECOMPUTED_VALUES_Y_DIR) { + // Input params + using T = float; -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) { -// APRTimer timer(true); -// PixelData m = getRandInitializedMesh(33, 31, 13); -// -// LocalIntensityScale lis; -// for (int offset = 0; offset < 6; ++offset) { -// // Run on CPU -// PixelData mCpu(m, 
true); -// timer.start_timer("CPU mean ALL-DIR"); -// lis.calc_sat_mean_y(mCpu, offset); -// lis.calc_sat_mean_x(mCpu, offset); -// lis.calc_sat_mean_z(mCpu, offset); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean ALL-DIR"); -// calcMean(mGpu, offset); -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); -// } -// } + for (int b = 0; b <= 1; b++) { + for (int len = 5; len <= 45; len += 20) { + for (int offset = 0; offset <= 6 && offset < len; offset++) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = b > 0; + bool useRandomNumbers = r > 0; +// std::cout << "========================> len=" << len << " offset=" << offset << " hasBoundary=" << hasBoundary << " useRandomNumbers=" << useRandomNumbers << std::endl; - //@KG: The CPU code doesn't work for uint16 --> overflow will likely result. + auto t = generateInputAndExpected(len, offset, hasBoundary, useRandomNumbers); + auto input = std::get<0>(t); + auto expected = std::get<1>(t); + PixelData m(len, 1, 1, 0); + initFromZYXarray(m, input.data()); + PixelData expectedMesh(len, 1, 1, 0); + initFromZYXarray(expectedMesh, expected.data()); + + APRTimer timer(false); + LocalIntensityScale lis; + + // Run on CPU old-impl + timer.start_timer("CPU Y-DIR"); + PixelData mCpu(m, true); + lis.calc_sat_mean_y(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU Y-DIR"); + calcMean(mGpu, offset, MEAN_Y_DIR, hasBoundary); + timer.stop_timer(); + + // expectedMesh because of different order of calculation will have small floating-point differences + // comparing to CPU or GPU fast implementation, anyway GPU and CPU should have exactly same values! 
+ EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match"; + EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match"; + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; + } + } + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) { + APRTimer timer(false); + PixelData m = getRandInitializedMesh(33, 32, 31); + + LocalIntensityScale lis; + for (int boundary = 0; boundary <= 1; boundary++) { + for (int offset = 0; offset <= 6; ++offset) { + bool hasBoundary = (boundary > 0); +// std::cout << "========================> " << " offset=" << offset << " hasBoundary=" << hasBoundary << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean ALL-DIR"); + lis.calc_sat_mean_y(mCpu, offset, hasBoundary); + lis.calc_sat_mean_x(mCpu, offset, hasBoundary); + lis.calc_sat_mean_z(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean ALL-DIR"); + calcMean(mGpu, offset, MEAN_ALL_DIR, hasBoundary); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS_UINT16) { + APRTimer timer(false); + PixelData m = getRandInitializedMesh(33, 31, 13); + + LocalIntensityScale lis; + for (int boundary = 0; boundary <= 1; boundary++) { + for (int offset = 0; offset <= 6; ++offset) { + bool hasBoundary = (boundary > 0); +// std::cout << "========================> " << " offset=" << offset << " hasBoundary=" << hasBoundary << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean ALL-DIR"); + lis.calc_sat_mean_y(mCpu, offset, hasBoundary); + lis.calc_sat_mean_x(mCpu, offset, hasBoundary); + lis.calc_sat_mean_z(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean ALL-DIR"); + calcMean(mGpu, offset, MEAN_ALL_DIR, hasBoundary); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + + + // ------------------------------------------------------------------------ + // Below tests are not yet fixed. + // ------------------------------------------------------------------------ -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS_UINT16) { -// APRTimer timer(true); -// PixelData m = getRandInitializedMesh(33, 31, 13); -// -// LocalIntensityScale lis; -// for (int offset = 0; offset < 6; ++offset) { -// // Run on CPU -// PixelData mCpu(m, true); -// timer.start_timer("CPU mean ALL-DIR"); -// lis.calc_sat_mean_y(mCpu, offset); -// lis.calc_sat_mean_x(mCpu, offset); -// lis.calc_sat_mean_z(mCpu, offset); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean ALL-DIR"); -// calcMean(mGpu, offset); -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 1), 0); -// } -// } // TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { // APRTimer timer(true); -// PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); +// PixelData m = getRandInitializedMesh(5, 5, 1, 25, 10, true); // // APRParameters params; // params.sigma_th = 1; // params.sigma_th_max = 2; -// params.reflect_bc_lis = false; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. 
+// params.reflect_bc_lis = true; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. // // // Run on CPU // PixelData mCpu(m, true); @@ -652,9 +554,19 @@ namespace { // getLocalIntensityScale(mGpu, mGpuTemp, params); // timer.stop_timer(); // +// m.printMeshT(1); +// mCpu.printMeshT(1); +// mGpu.printMeshT(1); +// // // Compare results // //EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0.01), 0); //this is not needed these values are not required. -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); +// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0000), 0); +// +// +// PixelData padd; +// paddPixels(m, padd, 2, 2, 0); +// m.printMeshT(1); +// padd.printMeshT(1); // } From 2cdf3fe6e9dcf39151feeadddfc68c3bb92c7287 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 16 Mar 2023 12:52:23 +0100 Subject: [PATCH 16/80] Whole LIS pipeline is matching exactly CPU implementation + tests updated --- src/algorithm/LocalIntensityScale.cu | 71 +++++++++++++++---- src/data_structures/Mesh/PixelData.cu | 21 +++++- src/data_structures/Mesh/PixelDataCuda.h | 28 ++++++-- src/data_structures/Mesh/paddPixelData.cuh | 81 ++++++++++++++++++++++ src/misc/CudaTools.cuh | 10 ++- test/LocalIntensityScaleCudaTest.cpp | 81 ++++++++++------------ test/TestTools.hpp | 2 +- 7 files changed, 224 insertions(+), 70 deletions(-) create mode 100644 src/data_structures/Mesh/paddPixelData.cuh diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 057e4de2..ee563e33 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -11,7 +11,7 @@ //#include #include "misc/CudaTools.cuh" - +#include "data_structures/Mesh/paddPixelData.cuh" /** * Calculates mean in Y direction @@ -393,18 +393,18 @@ void runMeanZdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_ meanZdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); } -template -void runMean(T *cudaImage, const PixelData &image, int offsetX, int offsetY, int offsetZ, TypeOfMeanFlags flags, cudaStream_t aStream, bool boundaryReflect = false) { +template +void runMean(T *cudaImage, const PixelDataDim dim, int offsetX, int offsetY, int offsetZ, TypeOfMeanFlags flags, cudaStream_t aStream, bool boundaryReflect = false) { if (flags & MEAN_Y_DIR) { - runMeanYdir(cudaImage, offsetY, image.x_num, image.y_num, image.z_num, aStream, boundaryReflect); + runMeanYdir(cudaImage, offsetY, dim.x, dim.y, dim.z, aStream, boundaryReflect); } if (flags & MEAN_X_DIR) { - runMeanXdir(cudaImage, offsetX, image.x_num, image.y_num, image.z_num, aStream, boundaryReflect); + runMeanXdir(cudaImage, offsetX, dim.x, dim.y, dim.z, aStream, boundaryReflect); } if (flags & MEAN_Z_DIR) { - runMeanZdir(cudaImage, offsetZ, image.x_num, image.y_num, image.z_num, aStream, boundaryReflect); + runMeanZdir(cudaImage, offsetZ, dim.x, dim.y, dim.z, aStream, boundaryReflect); } } @@ -444,9 +444,9 @@ __global__ void rescaleAndThreshold(T *data, size_t len, float varRescale, float size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { float rescaled = varRescale * data[idx]; - if (rescaled < sigmaThreshold) { - rescaled = (rescaled < sigmaThresholdMax) ? max_th : sigmaThreshold; - } +// if (rescaled < sigmaThreshold) { +// rescaled = (rescaled < sigmaThresholdMax) ? 
max_th : sigmaThreshold; +// } data[idx] = rescaled; } } @@ -470,12 +470,53 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete size_t win_x2 = var_win[4]; size_t win_z2 = var_win[5]; + + // TODO: !!!!!!!!!! handle constant_intensity_scale parameter - it is another thing that changed since last GPU pipeline impl. + // rescaleAndThreshold - currently there is no thresholding as in new CPU code (should it be permanent?) + // --------- CUDA ---------------- - runCopy1D(cudaImage, cudaTemp, image.mesh.size(), aStream); - runMean(cudaImage, image, win_x, win_y, win_z, MEAN_ALL_DIR, aStream, par.reflect_bc_lis); - runAbsDiff1D(cudaImage, cudaTemp, image.mesh.size(), aStream); - runMean(cudaImage, image, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream, par.reflect_bc_lis); - runRescaleAndThreshold(cudaImage, image.mesh.size(), var_rescale, par.sigma_th, par.sigma_th_max, aStream); + + // padd + CudaMemoryUniquePtr paddedImage; + CudaMemoryUniquePtr paddedTemp; + PixelDataDim paddSize(std::max(win_y, win_y2), std::max(win_x, win_x2), std::max(win_z, win_z2)); + PixelDataDim imageSize = image.getDimension(); + PixelDataDim paddedImageSize = imageSize + paddSize + paddSize; // padding on both ends of each dimension + + S *ci = cudaImage; + S *ct = cudaTemp; + PixelDataDim dim = image.getDimension(); + + if (par.reflect_bc_lis) { + // padd + S *mem = nullptr; + checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); + paddedImage.reset(mem); + mem = nullptr; + checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); + paddedTemp.reset(mem); + + runPaddPixels(cudaImage, paddedImage.get(), imageSize, paddedImageSize, paddSize, aStream); + runPaddPixels(cudaTemp, paddedTemp.get(), imageSize, paddedImageSize, paddSize, aStream); + + ci = paddedImage.get(); + ct = paddedTemp.get(); + dim = paddedImageSize; + } + + + runCopy1D(ci, ct, dim.size(), aStream); + runMean(ci, dim, win_x, win_y, win_z, MEAN_ALL_DIR, aStream, false); + runAbsDiff1D(ci, ct, dim.size(), aStream); + runMean(ci, dim, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream, false); + runRescaleAndThreshold(ci, dim.size(), var_rescale, par.sigma_th, par.sigma_th_max, aStream); + + if (par.reflect_bc_lis) { + // unpadd + runUnpaddPixels(ci, cudaImage, paddedImageSize, imageSize, paddSize, aStream); + runUnpaddPixels(ct, cudaTemp, paddedImageSize, imageSize, paddSize, aStream); + } + } template void runLocalIntensityScalePipeline(const PixelData&, const APRParameters&, float*, float*, cudaStream_t); @@ -489,7 +530,7 @@ void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags, bool bound ScopedCudaMemHandler, H2D | D2H> cudaImage(image); APRTimer timer(true); // timer.start_timer("GpuDeviceTimeFull"); - runMean(cudaImage.get(), image, offset, offset, offset, flags, 0, boundaryReflect); + runMean(cudaImage.get(), image.getDimension(), offset, offset, offset, flags, 0, boundaryReflect); // timer.stop_timer(); } diff --git a/src/data_structures/Mesh/PixelData.cu b/src/data_structures/Mesh/PixelData.cu index fd27f4d5..35924482 100644 --- a/src/data_structures/Mesh/PixelData.cu +++ b/src/data_structures/Mesh/PixelData.cu @@ -10,11 +10,14 @@ #include "misc/CudaTools.cuh" #include "downsample.cuh" -#include +#include "paddPixelData.cuh" + // explicit instantiation of handled types template void downsampleMeanCuda(const PixelData&, PixelData&); template void downsampleMaxCuda(const PixelData&, PixelData&); +template void paddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim 
&padSize); +template void unpaddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize); template void downsampleMeanCuda(const PixelData &input, PixelData &output) { @@ -31,3 +34,19 @@ void downsampleMaxCuda(const PixelData &input, PixelData &output) { runDownsampleMax(in.get(), out.get(), input.x_num, input.y_num, input.z_num, 0); }; + +template +void paddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize) { + ScopedCudaMemHandler, H2D> inputData(input); + ScopedCudaMemHandler, D2H> outputData(output); + + runPaddPixels(inputData.get(), outputData.get(), input.getDimension(), output.getDimension(), padSize, 0); +}; + +template +void unpaddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize) { + ScopedCudaMemHandler, H2D> inputData(input); + ScopedCudaMemHandler, D2H> outputData(output); + + runUnpaddPixels(inputData.get(), outputData.get(), input.getDimension(), output.getDimension(), padSize, 0); +}; diff --git a/src/data_structures/Mesh/PixelDataCuda.h b/src/data_structures/Mesh/PixelDataCuda.h index 34f7a56c..97f2144e 100644 --- a/src/data_structures/Mesh/PixelDataCuda.h +++ b/src/data_structures/Mesh/PixelDataCuda.h @@ -1,17 +1,35 @@ -// -// Created by Krzysztof Gonciarz on 4/9/18. -// - #ifndef LIBAPR_PIXELDATACUDA_H #define LIBAPR_PIXELDATACUDA_H #include "PixelData.hpp" + template void downsampleMeanCuda(const PixelData &aInput, PixelData &aOutput); template void downsampleMaxCuda(const PixelData &input, PixelData &output); -#endif //LIBAPR_PIXELDATACUDA_H +/** + * Copies data from input to output (which is bigger by pad size) reflecting around the edge pixels. + * @tparam T + * @param input + * @param output + * @param padSize + */ +template +void paddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize); + +/** + * Copies data from input to output (which is smaller by pad size). 
+ * @tparam T + * @param input + * @param output + * @param padSize + */ +template +void unpaddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize); + +#endif + diff --git a/src/data_structures/Mesh/paddPixelData.cuh b/src/data_structures/Mesh/paddPixelData.cuh new file mode 100644 index 00000000..dae96d79 --- /dev/null +++ b/src/data_structures/Mesh/paddPixelData.cuh @@ -0,0 +1,81 @@ +#ifndef LIBAPR_PADDPIXELDATA_CUH +#define LIBAPR_PADDPIXELDATA_CUH + + +#include "data_structures/Mesh/PixelData.hpp" + + +template +__global__ void paddPixels(const T* input, T *output, const PixelDataDim inputSize, const PixelDataDim outputSize, const PixelDataDim padSize) { + size_t yIdx = blockIdx.y * blockDim.y + threadIdx.y; + size_t xIdx = blockIdx.x * blockDim.x + threadIdx.x; + size_t zIdx = blockIdx.z * blockDim.z + threadIdx.z; + + // copy data to output (padded) cube + if (yIdx < outputSize.y && xIdx < outputSize.x && zIdx < outputSize.z) { + + // output cube index + size_t outputIdx = (zIdx * outputSize.x + xIdx) * outputSize.y + yIdx; + + // input cube index + int yIn = yIdx - padSize.y; + if (yIn < 0) yIn = -yIn; // reflected boundary on LHS + if (yIn >= inputSize.y) yIn -= 2 * (yIn - (inputSize.y - 1)); // reflected boundary on RHS + + int xIn = xIdx - padSize.x; + if (xIn < 0) xIn = -xIn; // reflected boundary on LHS + if (xIn >= inputSize.x) xIn -= 2 * (xIn - (inputSize.x - 1)); // reflected boundary on RHS + + int zIn = zIdx - padSize.z; + if (zIn < 0) zIn = -zIn; // reflected boundary on LHS + if (zIn >= inputSize.z) zIn -= 2 * (zIn - (inputSize.z - 1)); // reflected boundary on RHS + + size_t inputIdx = (zIn * inputSize.x + xIn) * inputSize.y + yIn; + + output[outputIdx] = input[inputIdx]; + } +} + +template +void runPaddPixels(const T* input, T *output, const PixelDataDim &inputSize, const PixelDataDim &outputSize, const PixelDataDim &padSize, cudaStream_t aStream) { + dim3 threadsPerBlock(1, 64, 1); + dim3 numBlocks((outputSize.x + threadsPerBlock.x - 1) / threadsPerBlock.x, + (outputSize.y + threadsPerBlock.y - 1) / threadsPerBlock.y, + (outputSize.z + threadsPerBlock.z - 1) / threadsPerBlock.z); + + paddPixels<<>>(input, output, inputSize, outputSize, padSize); +} + +template +__global__ void unpaddPixels(const T* input, T *output, const PixelDataDim inputSize, const PixelDataDim outputSize, const PixelDataDim padSize) { + size_t yIdx = blockIdx.y * blockDim.y + threadIdx.y; + size_t xIdx = blockIdx.x * blockDim.x + threadIdx.x; + size_t zIdx = blockIdx.z * blockDim.z + threadIdx.z; + + // copy data to output (unpadded) cube + if (yIdx < outputSize.y && xIdx < outputSize.x && zIdx < outputSize.z) { + + // output cube index + size_t outputIdx = (zIdx * outputSize.x + xIdx) * outputSize.y + yIdx; + + // input cube index (map coordinates of output cube to internal cube of padded cube) + int yIn = yIdx + padSize.y; + int xIn = xIdx + padSize.x; + int zIn = zIdx + padSize.z; + size_t inputIdx = (zIn * inputSize.x + xIn) * inputSize.y + yIn; + + output[outputIdx] = input[inputIdx]; + } +} + +template +void runUnpaddPixels(const T* input, T *output, const PixelDataDim &inputSize, const PixelDataDim &outputSize, const PixelDataDim &padSize, cudaStream_t aStream) { + dim3 threadsPerBlock(1, 64, 1); + dim3 numBlocks((outputSize.x + threadsPerBlock.x - 1) / threadsPerBlock.x, + (outputSize.y + threadsPerBlock.y - 1) / threadsPerBlock.y, + (outputSize.z + threadsPerBlock.z - 1) / threadsPerBlock.z); + + unpaddPixels<<>>(input, output, inputSize, outputSize, 
padSize); +} + +#endif diff --git a/src/misc/CudaTools.cuh b/src/misc/CudaTools.cuh index 558f730a..bb17e5fa 100644 --- a/src/misc/CudaTools.cuh +++ b/src/misc/CudaTools.cuh @@ -94,12 +94,18 @@ public: // Useful type for keeping CUDA allocated memory (which is released with cudaFree) -template +cudaError_t CUDARTAPI deleter(void *devPtr) { + //std::cout << "cudaFree() called...\n"; + return cudaFree(devPtr); +} + +template struct CudaMemoryUniquePtr : public std::unique_ptr { using std::unique_ptr::unique_ptr; // inheriting other constructors - explicit CudaMemoryUniquePtr(T *aMemory = nullptr) : std::unique_ptr(aMemory, &cudaFree) {} + explicit CudaMemoryUniquePtr(T *aMemory = nullptr) : std::unique_ptr(aMemory, &deleter) {} }; + /** * Directions for sending data between Host and Device */ diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index d2ca284b..2fa4f60c 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -4,7 +4,7 @@ #include "algorithm/LocalIntensityScaleCuda.h" #include "algorithm/LocalIntensityScale.hpp" #include "TestTools.hpp" - +#include "data_structures/Mesh/PixelDataCuda.h" namespace { @@ -522,52 +522,41 @@ namespace { } } + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { + APRTimer timer(false); + + for (int boundary = 0; boundary <= 1; ++boundary) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = (boundary > 0); + bool useRandomNumbers = (r > 0); + + PixelData m = getRandInitializedMesh(31, 33, 32, 25, 10, !useRandomNumbers); + + APRParameters params; + params.sigma_th = 1; + params.sigma_th_max = 2; + params.reflect_bc_lis = hasBoundary; + + // Run on CPU + PixelData mCpu(m, true); + PixelData mCpuTemp(m, false); + timer.start_timer("CPU LIS FULL"); + LocalIntensityScale().get_local_intensity_scale(mCpu, mCpuTemp, params); + timer.stop_timer(); - // ------------------------------------------------------------------------ - // Below tests are not yet fixed. - // ------------------------------------------------------------------------ - - -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { -// APRTimer timer(true); -// PixelData m = getRandInitializedMesh(5, 5, 1, 25, 10, true); -// -// APRParameters params; -// params.sigma_th = 1; -// params.sigma_th_max = 2; -// params.reflect_bc_lis = true; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. -// -// // Run on CPU -// PixelData mCpu(m, true); -// PixelData mCpuTemp(m, false); -// timer.start_timer("CPU LIS FULL"); -// -// LocalIntensityScale localIntensityScale; -// -// localIntensityScale.get_local_intensity_scale(mCpu, mCpuTemp, params); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// PixelData mGpuTemp(m, false); -// timer.start_timer("GPU LIS ALL-DIR"); -// getLocalIntensityScale(mGpu, mGpuTemp, params); -// timer.stop_timer(); -// -// m.printMeshT(1); -// mCpu.printMeshT(1); -// mGpu.printMeshT(1); -// -// // Compare results -// //EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0.01), 0); //this is not needed these values are not required. 
-// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0000), 0); -// -// -// PixelData padd; -// paddPixels(m, padd, 2, 2, 0); -// m.printMeshT(1); -// padd.printMeshT(1); -// } + // Run on GPU + PixelData mGpu(m, true); + PixelData mGpuTemp(m, false); + timer.start_timer("GPU LIS FULL"); + getLocalIntensityScale(mGpu, mGpuTemp, params); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } #endif // APR_USE_CUDA diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 4ec15afe..b533674d 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -159,7 +159,7 @@ inline PixelData getRandInitializedMesh(int y, int x, int z, float multiplier #pragma omp parallel for default(shared) #endif for (size_t i = 0; i < m.mesh.size(); ++i) { - m.mesh[i] = useIdxNumbers ? i : dist(mt) * multiplier + offset; + m.mesh[i] = useIdxNumbers ? i + 1 : dist(mt) * multiplier + offset; } return m; } From e093c01acd691b6ecf2e3a9adf25f0233b67bdbf Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 16 Mar 2023 14:13:48 +0100 Subject: [PATCH 17/80] Quick fix of linking error --- src/misc/CudaTools.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/misc/CudaTools.cuh b/src/misc/CudaTools.cuh index bb17e5fa..155ce317 100644 --- a/src/misc/CudaTools.cuh +++ b/src/misc/CudaTools.cuh @@ -94,7 +94,7 @@ public: // Useful type for keeping CUDA allocated memory (which is released with cudaFree) -cudaError_t CUDARTAPI deleter(void *devPtr) { +static cudaError_t CUDARTAPI deleter(void *devPtr) { //std::cout << "cudaFree() called...\n"; return cudaFree(devPtr); } From 053380d267bc06491e8404fa6a2d60f7a65903b7 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 16 Mar 2023 15:26:58 +0100 Subject: [PATCH 18/80] maximum error diff. 
GPU vs CPU for compute gradient set to 0 --- test/ComputeGradientCudaTest.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index 8bb06106..83502a62 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ -56,7 +56,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } } } @@ -98,7 +98,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } } } @@ -140,7 +140,7 @@ namespace { timer.stop_timer(); //Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0001, 2), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } } } @@ -173,7 +173,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } @@ -201,7 +201,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Z_RND_CUDA) { @@ -224,7 +224,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_RND_CUDA) { @@ -247,7 +247,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } TEST(ComputeInverseBspline, CALC_INV_BSPLINE_FULL_XYZ_DIR_RND_CUDA) { @@ -272,7 +272,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } // ======================================================================== @@ -301,7 +301,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(grad, gradCuda, 0.0000001), 0); + EXPECT_EQ(compareMeshes(grad, gradCuda, 0), 0); } @@ -354,9 +354,9 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpuImage, mGpuImage, 0.0000001), 0); - EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0.0000001), 0); - EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0.0000001), 0); + EXPECT_EQ(compareMeshes(mCpuImage, mGpuImage, 0), 0); + EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0), 0); + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); } #endif // APR_USE_CUDA From 97cf75e2b82ff09612b41218272b7e8e445f936e Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 17 Mar 2023 10:31:48 +0100 Subject: [PATCH 19/80] rescaleAndThreshold in now only rescaling (to reflect changed in CPU side) --- src/algorithm/LocalIntensityScale.cu | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index ee563e33..11eb0275 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -439,23 +439,19 @@ void runAbsDiff1D(T *data, const T *reference, size_t len, cudaStream_t aStream) } template -__global__ void rescaleAndThreshold(T *data, size_t len, float varRescale, float sigmaThreshold, float sigmaThresholdMax) { - const float max_th = 60000.0; +__global__ void rescale(T *data, size_t len, 
float varRescale) { size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { float rescaled = varRescale * data[idx]; -// if (rescaled < sigmaThreshold) { -// rescaled = (rescaled < sigmaThresholdMax) ? max_th : sigmaThreshold; -// } data[idx] = rescaled; } } template -void runRescaleAndThreshold(T *data, size_t len, float varRescale, float sigma, float sigmaMax, cudaStream_t aStream) { +void runRescale(T *data, size_t len, float varRescale, cudaStream_t aStream) { dim3 threadsPerBlock(64); dim3 numBlocks((len + threadsPerBlock.x - 1) / threadsPerBlock.x); - rescaleAndThreshold <<< numBlocks, threadsPerBlock, 0, aStream >>> (data, len, varRescale, sigma, sigmaMax); + rescale <<< numBlocks, threadsPerBlock, 0, aStream >>>(data, len, varRescale); } template @@ -472,7 +468,7 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete // TODO: !!!!!!!!!! handle constant_intensity_scale parameter - it is another thing that changed since last GPU pipeline impl. - // rescaleAndThreshold - currently there is no thresholding as in new CPU code (should it be permanent?) + // rescale - currently there is no thresholding as in new CPU code (should it be permanent?) // --------- CUDA ---------------- @@ -509,7 +505,7 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete runMean(ci, dim, win_x, win_y, win_z, MEAN_ALL_DIR, aStream, false); runAbsDiff1D(ci, ct, dim.size(), aStream); runMean(ci, dim, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream, false); - runRescaleAndThreshold(ci, dim.size(), var_rescale, par.sigma_th, par.sigma_th_max, aStream); + runRescale(ci, dim.size(), var_rescale, aStream); if (par.reflect_bc_lis) { // unpadd From 83c2a3104be1253573fe26d76d69c767c3e08f9b Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 17 Mar 2023 10:33:25 +0100 Subject: [PATCH 20/80] rescaleAndThreshold in now only rescaling (to reflect changed in CPU side) --- src/algorithm/LocalIntensityScale.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 11eb0275..cf4b29f1 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -468,7 +468,6 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete // TODO: !!!!!!!!!! handle constant_intensity_scale parameter - it is another thing that changed since last GPU pipeline impl. - // rescale - currently there is no thresholding as in new CPU code (should it be permanent?) 
// --------- CUDA ---------------- From 5b5a719411b98cf0d0141eca8e8452ee6f7957c2 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 17 Mar 2023 16:20:24 +0100 Subject: [PATCH 21/80] constant_intensity_scale handling in LIS added for GPU --- src/algorithm/LocalIntensityScale.cu | 119 +++++++++++++++++--------- src/algorithm/LocalIntensityScale.hpp | 2 + test/LocalIntensityScaleCudaTest.cpp | 37 ++++++++ 3 files changed, 119 insertions(+), 39 deletions(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index cf4b29f1..64e4c710 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -454,11 +454,43 @@ void runRescale(T *data, size_t len, float varRescale, cudaStream_t aStream) { rescale <<< numBlocks, threadsPerBlock, 0, aStream >>>(data, len, varRescale); } +template +__global__ void constantScale(S *image, size_t len) { + // This is totally naive and slow implementation (only 1 thread is used) just to have CPU + // code implemented in CUDA. This code will not be run in any normal usage of APR + // and it is just here for sanity check and or super small images cases (like few pixels) + // so DO NOT TRY TO OPTIMIZE IT - use your time for something more productive or have + // some beers... still better than writing fast version of this code. + + float min_val = 660000; + double sum = 0; + + for (size_t i = 0; i < len; ++i) { + float tmp = image[i]; + + sum += tmp; + if (tmp < min_val) min_val = tmp; + } + + float scale_val = (float) (sum / (float)len - min_val); + + for (size_t i = 0; i < len; ++i) { + image[i] = scale_val; + } +} + +template +void runConstantScale(S *image, PixelDataDim &dim) { + // Check kernel description for further info! + constantScale<<<1, 1>>>(image, dim.size()); +} + template void runLocalIntensityScalePipeline(const PixelData &image, const APRParameters &par, S *cudaImage, S *cudaTemp, cudaStream_t aStream) { float var_rescale; std::vector var_win; - LocalIntensityScale().get_window_alt(var_rescale, var_win, par,image); + auto lis = LocalIntensityScale(); + lis.get_window_alt(var_rescale, var_win, par, image); size_t win_y = var_win[0]; size_t win_x = var_win[1]; size_t win_z = var_win[2]; @@ -467,51 +499,60 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete size_t win_z2 = var_win[5]; - // TODO: !!!!!!!!!! handle constant_intensity_scale parameter - it is another thing that changed since last GPU pipeline impl. 
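// For reference (illustration only; the helper name and use of std::vector are assumptions, not
// part of the library): the constantScale kernel added above replaces every pixel by the value
// (mean - min) of the whole image, i.e. roughly the host-side equivalent of:
//
//   template <typename S>
//   void constantScaleHost(std::vector<S> &img) {
//       double sum = 0;
//       float min_val = std::numeric_limits<float>::max();
//       for (auto v : img) { sum += v; min_val = std::min(min_val, (float)v); }
//       const float scale_val = (float)(sum / img.size() - min_val);
//       std::fill(img.begin(), img.end(), (S)scale_val);
//   }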
- // --------- CUDA ---------------- + bool constant_scale = false; - // padd - CudaMemoryUniquePtr paddedImage; - CudaMemoryUniquePtr paddedTemp; - PixelDataDim paddSize(std::max(win_y, win_y2), std::max(win_x, win_x2), std::max(win_z, win_z2)); - PixelDataDim imageSize = image.getDimension(); - PixelDataDim paddedImageSize = imageSize + paddSize + paddSize; // padding on both ends of each dimension - - S *ci = cudaImage; - S *ct = cudaTemp; - PixelDataDim dim = image.getDimension(); - - if (par.reflect_bc_lis) { - // padd - S *mem = nullptr; - checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); - paddedImage.reset(mem); - mem = nullptr; - checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); - paddedTemp.reset(mem); - - runPaddPixels(cudaImage, paddedImage.get(), imageSize, paddedImageSize, paddSize, aStream); - runPaddPixels(cudaTemp, paddedTemp.get(), imageSize, paddedImageSize, paddSize, aStream); - - ci = paddedImage.get(); - ct = paddedTemp.get(); - dim = paddedImageSize; + if (par.constant_intensity_scale || (lis.number_active_dimensions == 0)) { + // include the case where the local intensity scale doesn't make sense due to the image being to small. + // (This is for just edge cases and sanity checking) + constant_scale = true; } + PixelDataDim imageSize = image.getDimension(); - runCopy1D(ci, ct, dim.size(), aStream); - runMean(ci, dim, win_x, win_y, win_z, MEAN_ALL_DIR, aStream, false); - runAbsDiff1D(ci, ct, dim.size(), aStream); - runMean(ci, dim, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream, false); - runRescale(ci, dim.size(), var_rescale, aStream); + if (!constant_scale) { + CudaMemoryUniquePtr paddedImage; + CudaMemoryUniquePtr paddedTemp; + PixelDataDim paddSize(std::max(win_y, win_y2), std::max(win_x, win_x2), std::max(win_z, win_z2)); + PixelDataDim paddedImageSize = imageSize + paddSize + paddSize; // padding on both ends of each dimension + + S *ci = cudaImage; + S *ct = cudaTemp; + PixelDataDim dim = image.getDimension(); + + if (par.reflect_bc_lis) { + // padd + S *mem = nullptr; + checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); + paddedImage.reset(mem); + mem = nullptr; + checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); + paddedTemp.reset(mem); + + runPaddPixels(cudaImage, paddedImage.get(), imageSize, paddedImageSize, paddSize, aStream); + runPaddPixels(cudaTemp, paddedTemp.get(), imageSize, paddedImageSize, paddSize, aStream); + + ci = paddedImage.get(); + ct = paddedTemp.get(); + dim = paddedImageSize; + } - if (par.reflect_bc_lis) { - // unpadd - runUnpaddPixels(ci, cudaImage, paddedImageSize, imageSize, paddSize, aStream); - runUnpaddPixels(ct, cudaTemp, paddedImageSize, imageSize, paddSize, aStream); + // Run LIS pipeline + runCopy1D(ci, ct, dim.size(), aStream); + runMean(ci, dim, win_x, win_y, win_z, MEAN_ALL_DIR, aStream, false); + runAbsDiff1D(ci, ct, dim.size(), aStream); + runMean(ci, dim, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream, false); + runRescale(ci, dim.size(), var_rescale, aStream); + + if (par.reflect_bc_lis) { + // unpadd + runUnpaddPixels(ci, cudaImage, paddedImageSize, imageSize, paddSize, aStream); + runUnpaddPixels(ct, cudaTemp, paddedImageSize, imageSize, paddSize, aStream); + } + } + else { + runConstantScale(cudaImage, imageSize); } - } template void runLocalIntensityScalePipeline(const PixelData&, const APRParameters&, float*, float*, cudaStream_t); diff --git a/src/algorithm/LocalIntensityScale.hpp b/src/algorithm/LocalIntensityScale.hpp index 3f7fffef..e576efd5 100644 --- 
a/src/algorithm/LocalIntensityScale.hpp +++ b/src/algorithm/LocalIntensityScale.hpp @@ -16,6 +16,8 @@ class LocalIntensityScale { bool active_x = true; bool active_z = true; +public: + int number_active_dimensions = 3; diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index 2fa4f60c..ce6ff111 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -558,6 +558,43 @@ namespace { } } + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE_CONSTANT_SCALE) { + APRTimer timer(false); + + for (int boundary = 0; boundary <= 1; ++boundary) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = (boundary > 0); + bool useRandomNumbers = (r > 0); + + PixelData m = getRandInitializedMesh(31, 33, 32, 25, 10, !useRandomNumbers); + + APRParameters params; + params.sigma_th = 1; + params.sigma_th_max = 2; + params.reflect_bc_lis = hasBoundary; + params.constant_intensity_scale = true; + + // Run on CPU + PixelData mCpu(m, true); + PixelData mCpuTemp(m, false); + timer.start_timer("CPU LIS FULL"); + LocalIntensityScale().get_local_intensity_scale(mCpu, mCpuTemp, params); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + PixelData mGpuTemp(m, false); + timer.start_timer("GPU LIS FULL"); + getLocalIntensityScale(mGpu, mGpuTemp, params); + timer.stop_timer(); + + // Compare results + // NOTE: mCpuTemp and mGpuTemp are not checked since in case of + // constant_intensity_scale they are not set to any value + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } #endif // APR_USE_CUDA } From 5d0375ad59f6d9d9a2223084aecc2cc325a62dbe Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 20 Mar 2023 11:24:04 +0100 Subject: [PATCH 22/80] Removed unused threshold functions --- src/algorithm/ComputeGradientCuda.cu | 62 --------------------------- src/algorithm/ComputeGradientCuda.hpp | 4 -- test/ComputeGradientTest.cpp | 54 ----------------------- 3 files changed, 120 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 0a6e5507..bc7beed7 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -158,49 +158,6 @@ namespace { } } -/** - * Thresholds output basing on input values. When input is <= thresholdLevel then output is set to 0 and is not changed otherwise. - * @param input - * @param output - * @param length - len of input/output arrays - * @param thresholdLevel - */ -template -__global__ void threshold(const T *input, S *output, size_t length, float thresholdLevel) { - size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; - if (idx < length) { - if (input[idx] <= thresholdLevel) { output[idx] = 0; } - } -} - -template -void runThreshold(ImgType *cudaImage, T *cudaGrad, size_t x_num, size_t y_num, size_t z_num, float Ip_th, cudaStream_t aStream) { - dim3 threadsPerBlock(64); - dim3 numBlocks((x_num * y_num * z_num + threadsPerBlock.x - 1)/threadsPerBlock.x); - threshold<<>>(cudaImage, cudaGrad, x_num * y_num * z_num, Ip_th); -}; - -/** - * Thresholds input array to have minimum thresholdLevel. 
- * @param input - * @param length - len of input/output arrays - * @param thresholdLevel - */ -template -__global__ void thresholdImg(T *input, size_t length, float thresholdLevel) { - size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; - if (idx < length) { - if (input[idx] < thresholdLevel) { input[idx] = thresholdLevel; } - } -} - -template -void runThresholdImg(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, float Ip_th_offset, cudaStream_t aStream) { - dim3 threadsPerBlock(64); - dim3 numBlocks((x_num * y_num * z_num + threadsPerBlock.x - 1) / threadsPerBlock.x); - thresholdImg<<< numBlocks, threadsPerBlock, 0, aStream >>> (cudaImage, x_num * y_num * z_num, Ip_th_offset); -}; - template void getGradientCuda(const PixelData &image, PixelData &local_scale_temp, ImgType *cudaImage, ImgType *cudaGrad, float *cudalocal_scale_temp, @@ -497,25 +454,6 @@ void getGradient(PixelData &image, PixelData &grad_temp, Pixel splineCudaX, splineCudaY, splineCudaZ, boundary.get(), bspline_offset, par, 0); } -// explicit instantiation of handled types -template void thresholdImg(PixelData &, const float); -template -void thresholdImg(PixelData &image, const float threshold) { - ScopedCudaMemHandler, H2D | D2H> cudaImage(image); - - runThresholdImg(cudaImage.get(), image.x_num, image.y_num, image.z_num, threshold, 0); -} - -// explicit instantiation of handled types -template void thresholdGradient(PixelData &, const PixelData &, const float); -template -void thresholdGradient(PixelData &output, const PixelData &input, const float Ip_th) { - ScopedCudaMemHandler, H2D> cudaInput(input); - ScopedCudaMemHandler, H2D | D2H> cudaOutput(output); - - runThreshold(cudaInput.get(), cudaOutput.get(), input.x_num, input.y_num, input.z_num, Ip_th, 0); -} - void cudaDownsampledGradient(PixelData &input, PixelData &grad, const float hx, const float hy, const float hz) { ScopedCudaMemHandler, H2D | D2H> cudaInput(input); ScopedCudaMemHandler, D2H> cudaGrad(grad); diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp index 36bb70b1..1fcf088b 100644 --- a/src/algorithm/ComputeGradientCuda.hpp +++ b/src/algorithm/ComputeGradientCuda.hpp @@ -32,10 +32,6 @@ template void computeLevelsCuda(const PixelData &grad_temp, PixelData &local_scale_temp, int maxLevel, float relError, float dx = 1, float dy = 1, float dz = 1); template void getGradient(PixelData &image, PixelData &grad_temp, PixelData &local_scale_temp, PixelData &local_scale_temp2, float bspline_offset, const APRParameters &par); -template -void thresholdImg(PixelData &image, const float threshold); -template -void thresholdGradient(PixelData &output, const PixelData &input, const float Ip_th); void cudaDownsampledGradient(PixelData &input, PixelData &grad, const float hx, const float hy, const float hz); template diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index 527815f0..9ba510e6 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -648,60 +648,6 @@ namespace { #ifdef APR_USE_CUDA - TEST(ComputeThreshold, CALC_THRESHOLD_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(31, 33, 13); - PixelData g = getRandInitializedMesh(31, 33, 13); - float thresholdLevel = 1; - - // Calculate bspline on CPU - PixelData mCpu(g, true); - timer.start_timer("CPU threshold"); - ComputeGradient().threshold_gradient(mCpu, m, thresholdLevel); - - timer.stop_timer(); - - // Calculate bspline on GPU - 
PixelData mGpu(g, true); - timer.start_timer("GPU threshold"); - thresholdGradient(mGpu, m, thresholdLevel); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeThreshold, CALC_THRESHOLD_IMG_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData g = getRandInitializedMesh(31, 33, 13, 1, true); - - float thresholdLevel = 10; - - // Calculate bspline on CPU - PixelData mCpu(g, true); - timer.start_timer("CPU threshold"); - for (size_t i = 0; i < mCpu.mesh.size(); ++i) { - if (mCpu.mesh[i] <= (thresholdLevel)) { mCpu.mesh[i] = thresholdLevel; } - } - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(g, true); - timer.start_timer("GPU threshold"); - thresholdImg(mGpu, thresholdLevel); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - // TODO: This test will be fixed as soon as CUDA pipeline is updated. // Currently turning it off to have testable rest of CUDA impl. // TEST(ComputeThreshold, FULL_PIPELINE_TEST) { From 53ef94baa6c748bcebb35ce11cdc5d1ddb267823 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 20 Mar 2023 13:39:51 +0100 Subject: [PATCH 23/80] FullPipeline test moved to new file --- src/algorithm/ComputeGradientCuda.cu | 3 +- test/CMakeLists.txt | 1 + test/ComputeGradientCudaTest.cpp | 2 +- test/ComputeGradientTest.cpp | 74 -------------------------- test/FullPipelineCudaTest.cpp | 79 ++++++++++++++++++++++++++++ 5 files changed, 83 insertions(+), 76 deletions(-) create mode 100644 test/FullPipelineCudaTest.cpp diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index bc7beed7..87ebdaa5 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -266,6 +266,7 @@ public: CurrentTime ct; uint64_t start = ct.microseconds(); image.copyH2D(); + checkCuda(cudaStreamSynchronize(iStream)); std::cout << "SEND time: " << ct.microseconds() - start << std::endl; } @@ -273,7 +274,7 @@ public: CurrentTime ct; uint64_t start = ct.microseconds(); local_scale_temp.copyD2H(); - cudaStreamSynchronize(iStream); + checkCuda(cudaStreamSynchronize(iStream)); std::cout << "RCV time: " << ct.microseconds() - start << std::endl; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 2918f2c5..193ce405 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -16,6 +16,7 @@ if(APR_USE_CUDA) buildTarget(testAPRCuda APRTestCuda.cpp) buildTarget(testComputeGradientCuda ComputeGradientCudaTest.cpp) buildTarget(testLocalIntensityScaleCuda LocalIntensityScaleCudaTest.cpp) + buildTarget(testFullPipelineCuda FullPipelineCudaTest.cpp) endif() if(APR_BUILD_EXAMPLES) diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index 83502a62..588c5ea3 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ -314,7 +314,7 @@ namespace { // Generate random mesh using ImageType = uint16_t; - PixelData input_image = getRandInitializedMesh(11, 13, 15, 15, 20); + PixelData input_image = getRandInitializedMesh(33, 35, 37, 15, 20); PixelData &image_temp = input_image; PixelData grad_temp; // should be a down-sampled image diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index 9ba510e6..ca60fca3 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -2,13 +2,10 @@ * Created by Krzysztof Gonciarz 2018 */ #include -#include #include #include 
"data_structures/Mesh/PixelData.hpp" #include "algorithm/ComputeGradient.hpp" -#include "algorithm/ComputeGradientCuda.hpp" #include -#include "algorithm/APRConverter.hpp" #include "TestTools.hpp" namespace { @@ -641,77 +638,6 @@ namespace { ASSERT_TRUE(compare(m, expect, 0.01)); } - - // ======================= CUDA ======================================= - // ======================= CUDA ======================================= - // ======================= CUDA ======================================= - -#ifdef APR_USE_CUDA - - // TODO: This test will be fixed as soon as CUDA pipeline is updated. - // Currently turning it off to have testable rest of CUDA impl. -// TEST(ComputeThreshold, FULL_PIPELINE_TEST) { -// APRTimer timer(true); -// -// // Generate random mesh -// using ImageType = float; -// PixelData input_image = getRandInitializedMesh(310, 330, 32, 25); -// int maxLevel = ceil(std::log2(330)); -// -// PixelData &image_temp = input_image; -// -// PixelData grad_temp; // should be a down-sampled image -// grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); -// PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors -// local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// PixelData local_scale_temp2; -// local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// -// PixelData grad_temp_GPU; // should be a down-sampled image -// grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); -// PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors -// local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// PixelData local_scale_temp2_GPU; -// local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// -// -// APRParameters par; -// par.lambda = 3; -// par.Ip_th = 10; -// par.sigma_th = 0; -// par.sigma_th_max = 0; -// par.dx = 1; -// par.dy = 1; -// par.dz = 1; -// -// ComputeGradient computeGradient; -// LocalIntensityScale localIntensityScale; -// LocalParticleCellSet localParticleSet; -// -// // Calculate bspline on CPU -// PixelData mCpuImage(image_temp, true); -// timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); -// computeGradient.get_gradient(mCpuImage, grad_temp, local_scale_temp, par); -// localIntensityScale.get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); -// localParticleSet.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); -// timer.stop_timer(); -// -// // Calculate bspline on GPU -// PixelData mGpuImage(image_temp, true); -// timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); -// GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); -// gpt.doAll(); -// timer.stop_timer(); -// -// // Compare GPU vs CPU -// // allow some differences since float point diffs -// // TODO: It would be much better to count number of diffs with delta==1 and allow some of these -// EXPECT_TRUE(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0.01) < 29); -// } - - -#endif // APR_USE_CUDA - } int main(int argc, char **argv) { diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp new file mode 100644 index 00000000..31312080 --- 
/dev/null +++ b/test/FullPipelineCudaTest.cpp @@ -0,0 +1,79 @@ + +#include + +#include "algorithm/LocalIntensityScaleCuda.h" +#include "algorithm/LocalIntensityScale.hpp" +#include "algorithm/ComputeGradient.hpp" +#include "algorithm/ComputeGradientCuda.hpp" +#include "TestTools.hpp" +#include "data_structures/Mesh/PixelDataCuda.h" +#include "algorithm/APRConverter.hpp" + +namespace { +#ifdef APR_USE_CUDA + + TEST(ComputeThreshold, FULL_PIPELINE_TEST) { + APRTimer timer(true); + + // Generate random mesh + using ImageType = float; + PixelData input_image = getRandInitializedMesh(310, 330, 32, 25); + int maxLevel = ceil(std::log2(330)); + + PixelData grad_temp; // should be a down-sampled image + grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); + PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + + PixelData grad_temp_GPU; // should be a down-sampled image + grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); + PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + PixelData local_scale_temp2_GPU; + local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + + // Prepare parameters + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate bspline on CPU + PixelData mCpuImage(input_image, true); + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpuImage(input_image, true); + +// timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); +// GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); +// gpt.doAll(); + + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + } + + +#endif // APR_USE_CUDA +} + + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file From ac2c22e54f32f40afb6fe5a9030cb9c6bf57ea96 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 20 Mar 2023 13:49:25 +0100 Subject: [PATCH 24/80] PixelDataDim updated with maximum dimension lenght and nuber of dimensions --- src/data_structures/Mesh/PixelData.hpp | 2 ++ test/MeshDataTest.cpp | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git 
a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index 68de3b00..d3867ed7 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -37,6 +37,8 @@ struct PixelDataDim { constexpr PixelDataDim(size_t y, size_t x, size_t z) : y(y), x(x), z(z) {} size_t size() const { return y * x * z; } + size_t maxDimSize() const { return std::max(x, std::max(y, z)); } + int numOfDimensions() const { return (int)(x > 1) + (int)(y > 1) + (int)(z > 1); } PixelDataDim operator+(const PixelDataDim &rhs) const { return {y + rhs.y, x + rhs.x, z + rhs.z}; } PixelDataDim operator-(const PixelDataDim &rhs) const { return {y - rhs.y, x - rhs.x, z - rhs.z}; } diff --git a/test/MeshDataTest.cpp b/test/MeshDataTest.cpp index a3c4bec6..20b1bbe3 100644 --- a/test/MeshDataTest.cpp +++ b/test/MeshDataTest.cpp @@ -35,6 +35,7 @@ namespace { ASSERT_EQ(d.x, 20); ASSERT_EQ(d.z, 30); ASSERT_EQ(d.size(), 10*20*30); + ASSERT_EQ(d.maxDimSize(), 30); } { // adding int to all dims @@ -81,6 +82,16 @@ namespace { ASSERT_FALSE(x == z); ASSERT_TRUE(x != z); } + { // number of dimensions + const PixelDataDim x = {2, 3, 5}; + const PixelDataDim y = {2, 1, 5}; + const PixelDataDim z = {1, 4, 1}; + const PixelDataDim w = {1, 1, 1}; + ASSERT_EQ(x.numOfDimensions(), 3); + ASSERT_EQ(y.numOfDimensions(), 2); + ASSERT_EQ(z.numOfDimensions(), 1); + ASSERT_EQ(w.numOfDimensions(), 0); + } } TEST_F(VectorDataTest, InitTest) { From 122a96a13524e956e9383f4263f1dc041cf081fe Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 20 Mar 2023 16:06:41 +0100 Subject: [PATCH 25/80] GradLisLevels test working now --- test/FullPipelineCudaTest.cpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 31312080..ede7ee12 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -9,16 +9,17 @@ #include "data_structures/Mesh/PixelDataCuda.h" #include "algorithm/APRConverter.hpp" + namespace { #ifdef APR_USE_CUDA - TEST(ComputeThreshold, FULL_PIPELINE_TEST) { + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS) { APRTimer timer(true); - // Generate random mesh + // Generate random mesh - keep it large enough to catch all possible computation errors using ImageType = float; - PixelData input_image = getRandInitializedMesh(310, 330, 32, 25); - int maxLevel = ceil(std::log2(330)); + PixelData input_image = getRandInitializedMesh(1000, 1000, 1000, 13); + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); PixelData grad_temp; // should be a down-sampled image grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); @@ -54,11 +55,7 @@ namespace { // Calculate bspline on GPU PixelData mGpuImage(input_image, true); - -// timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); -// GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); -// gpt.doAll(); - + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); From 6a5db358d6a79092bb669212458125a7c9efec6f Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 24 Mar 2023 13:02:12 +0100 Subject: [PATCH 26/80] full pipeline tests fixed --- src/algorithm/ComputeGradientCuda.cu | 90 
+++++++++++++------------ src/algorithm/ComputeGradientCuda.hpp | 2 +- src/algorithm/LocalIntensityScale.cu | 16 +++-- src/algorithm/LocalIntensityScaleCuda.h | 1 - src/algorithm/bsplineXdir.cuh | 2 +- src/algorithm/bsplineYdir.cuh | 2 +- src/algorithm/bsplineZdir.cuh | 2 +- src/data_structures/Mesh/PixelData.hpp | 8 +++ test/FullPipelineCudaTest.cpp | 58 +++++++++++++++- 9 files changed, 125 insertions(+), 56 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 87ebdaa5..4db49d4d 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -130,11 +130,11 @@ namespace { }; } - auto transferSpline(BsplineParams &aParams) { - ScopedCudaMemHandler bc1(aParams.bc1.get(), aParams.k0); - ScopedCudaMemHandler bc2(aParams.bc2.get(), aParams.k0); - ScopedCudaMemHandler bc3(aParams.bc3.get(), aParams.k0); - ScopedCudaMemHandler bc4(aParams.bc4.get(), aParams.k0); + auto transferSpline(BsplineParams &aParams, cudaStream_t aStream) { + ScopedCudaMemHandler bc1(aParams.bc1.get(), aParams.k0, aStream); + ScopedCudaMemHandler bc2(aParams.bc2.get(), aParams.k0, aStream); + ScopedCudaMemHandler bc3(aParams.bc3.get(), aParams.k0, aStream); + ScopedCudaMemHandler bc4(aParams.bc4.get(), aParams.k0, aStream); return std::pair { BsplineParamsCuda { @@ -164,10 +164,13 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc BsplineParamsCuda &px, BsplineParamsCuda &py, BsplineParamsCuda &pz, float *boundary, float bspline_offset, const APRParameters &par, cudaStream_t aStream) { + // TODO: Used PixelDataDim in all methods below and change input parameter from image to imageDim + runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream); runBsplineXdir(cudaImage, image.getDimension(), px, aStream); runBsplineZdir(cudaImage, image.getDimension(), pz, aStream); + runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream); @@ -195,6 +198,7 @@ public: } }; + template template class GpuProcessingTask::GpuProcessingTaskImpl { @@ -234,11 +238,11 @@ class GpuProcessingTask::GpuProcessingTaskImpl { public: - GpuProcessingTaskImpl(const PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) : - iCpuImage(image), + GpuProcessingTaskImpl(const PixelData &inputImage, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) : + iCpuImage(inputImage), iCpuLevels(levels), iStream(getStream()), - image (image, iStream), + image (inputImage, iStream), gradient (levels, iStream), local_scale_temp (levels, iStream), local_scale_temp2 (levels, iStream), @@ -247,19 +251,17 @@ public: iMaxLevel(maxLevel), // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. // Should be fixed when other parts of pipeline are ready. 
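// (i.e. separate BsplineParams for y_num, x_num and z_num - the compute step below already builds
//  them per call via prepareBsplineStuff()/transferSpline(); doing this once in the constructor
//  would avoid recomputing them for every image of the same size)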
- params(prepareBsplineStuff((size_t)image.x_num, parameters.lambda, tolerance)), + params(prepareBsplineStuff((size_t)inputImage.x_num, parameters.lambda, tolerance)), bc1(params.bc1.get(), params.k0, iStream), bc2(params.bc2.get(), params.k0, iStream), bc3(params.bc3.get(), params.k0, iStream), bc4(params.bc4.get(), params.k0, iStream), - boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)image.x_num * (size_t)image.z_num}, + boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num}, boundary{nullptr, boundaryLen, iStream} { // std::cout << "\n=============== GpuProcessingTaskImpl ===================\n\n"; std::cout << iCpuImage << std::endl; std::cout << iCpuLevels << std::endl; - std::cout << "\n\n\n"; - } void sendDataToGpu() { @@ -286,13 +288,13 @@ public: // In principle this is OK and correct but would be faster (for processing series of same size images) if // they would be calculated in constructor of GpuProcessingTaskImpl class (once). BsplineParams px = prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance); - auto cudax = transferSpline(px); + auto cudax = transferSpline(px, iStream); auto splineCudaX = cudax.first; BsplineParams py = prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance); - auto cuday = transferSpline(py); + auto cuday = transferSpline(py, iStream); auto splineCudaY = cuday.first; BsplineParams pz = prepareBsplineStuff(iCpuImage.z_num, iParameters.lambda, tolerance); - auto cudaz = transferSpline(pz); + auto cudaz = transferSpline(pz, iStream); auto splineCudaZ = cudaz.first; getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), @@ -315,7 +317,7 @@ public: }; template -GpuProcessingTask::GpuProcessingTask(PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) +GpuProcessingTask::GpuProcessingTask(const PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) : impl{new GpuProcessingTaskImpl(image, levels, parameters, bspline_offset, maxLevel)} {std::cout << "GpuProcessingTask\n";} template @@ -359,28 +361,27 @@ template void cudaFilterBsplineFull(PixelData &input, float lambda, float tolerance, TypeOfRecBsplineFlags flags, int maxFilterLen) { cudaStream_t aStream = 0; - - ScopedCudaMemHandler, D2H | H2D> cudaInput(input); + ScopedCudaMemHandler, D2H | H2D> cudaInput(input, aStream); APRTimer timer(false); timer.start_timer("GpuDeviceTimeFull"); if (flags & BSPLINE_Y_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.y_num, lambda, tolerance, maxFilterLen); - auto cuda = transferSpline(p); + auto cuda = transferSpline(p, aStream); auto splineCuda = cuda.first; int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * input.x_num * input.z_num; - ScopedCudaMemHandler boundary(nullptr, boundaryLen); // allocate memory on device + ScopedCudaMemHandler boundary(nullptr, boundaryLen, aStream); // allocate memory on device runBsplineYdir(cudaInput.get(), input.getDimension(), splineCuda, boundary.get(), aStream); } if (flags & BSPLINE_X_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.x_num, lambda, tolerance, maxFilterLen); - auto cuda = transferSpline(p); + auto cuda = transferSpline(p, aStream); auto splineCuda = cuda.first; runBsplineXdir(cudaInput.get(), input.getDimension(), splineCuda, aStream); } if (flags & BSPLINE_Z_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.z_num, 
lambda, tolerance, maxFilterLen); - auto cuda = transferSpline(p); + auto cuda = transferSpline(p, aStream); auto splineCuda = cuda.first; runBsplineZdir(cudaInput.get(), input.getDimension(), splineCuda, aStream); } @@ -391,16 +392,18 @@ void cudaFilterBsplineFull(PixelData &input, float lambda, float tolera template void cudaInverseBspline(PixelData &, TypeOfInvBsplineFlags); template void cudaInverseBspline(PixelData &input, TypeOfInvBsplineFlags flags) { - ScopedCudaMemHandler, H2D | D2H> cudaInput(input); + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D | D2H> cudaInput(input, aStream); if (flags & INV_BSPLINE_Y_DIR) { - runInvBsplineYdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, 0); + runInvBsplineYdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, aStream); } if (flags & INV_BSPLINE_X_DIR) { - runInvBsplineXdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, 0); + runInvBsplineXdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, aStream); } if (flags & INV_BSPLINE_Z_DIR) { - runInvBsplineZdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, 0); + runInvBsplineZdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, aStream); } } @@ -408,13 +411,14 @@ void cudaInverseBspline(PixelData &input, TypeOfInvBsplineFlags flags) template void computeLevelsCuda(const PixelData &, PixelData &, int, float, float, float, float); template void computeLevelsCuda(const PixelData &grad_temp, PixelData &local_scale_temp, int maxLevel, float relError, float dx, float dy, float dz) { - ScopedCudaMemHandler, H2D> cudaGrad(grad_temp); - ScopedCudaMemHandler, D2H | H2D> cudaLis(local_scale_temp); + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D> cudaGrad(grad_temp, aStream); + ScopedCudaMemHandler, D2H | H2D> cudaLis(local_scale_temp, aStream); float min_dim = std::min(dy, std::min(dx, dz)); float level_factor = pow(2, maxLevel) * min_dim; const float mult_const = level_factor/relError; - cudaStream_t aStream = 0; runComputeLevels(cudaGrad.get(), cudaLis.get(), grad_temp.mesh.size(), mult_const, aStream); } @@ -424,17 +428,17 @@ template void getGradient(PixelData &, PixelData &, PixelDat template void getGradient(PixelData &image, PixelData &grad_temp, PixelData &local_scale_temp, PixelData &local_scale_temp2, float bspline_offset, const APRParameters &par) { - ScopedCudaMemHandler, D2H | H2D> cudaImage(image); - ScopedCudaMemHandler, D2H | H2D> cudaGrad(grad_temp); - ScopedCudaMemHandler, D2H> cudalocal_scale_temp(local_scale_temp); - ScopedCudaMemHandler, D2H> cudalocal_scale_temp2(local_scale_temp2); + cudaStream_t aStream = 0; + ScopedCudaMemHandler, D2H | H2D> cudaImage(image, aStream); + ScopedCudaMemHandler, D2H | H2D> cudaGrad(grad_temp, aStream); + ScopedCudaMemHandler, D2H> cudalocal_scale_temp(local_scale_temp, aStream); + ScopedCudaMemHandler, D2H> cudalocal_scale_temp2(local_scale_temp2, aStream); int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * image.x_num * image.z_num; - ScopedCudaMemHandler boundary(nullptr, boundaryLen); + ScopedCudaMemHandler boundary(nullptr, boundaryLen, aStream); float tolerance = 0.0001; - // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. // Should be fixed when other parts of pipeline are ready. 
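// Usage sketch for the cudaFilterBsplineFull entry point above (illustration only; assumes the
// (y, x, z) PixelData constructor and leaves maxFilterLen at its default):
//
//   PixelData<float> img(128, 128, 128);
//   cudaFilterBsplineFull(img, /*lambda*/ 3.0f, /*tolerance*/ 0.0001f,
//                         BSPLINE_Y_DIR | BSPLINE_X_DIR | BSPLINE_Z_DIR);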
@@ -442,22 +446,24 @@ void getGradient(PixelData &image, PixelData &grad_temp, Pixel BsplineParams px = prepareBsplineStuff(image.x_num, par.lambda, tolerance); - auto cudax = transferSpline(px); + auto cudax = transferSpline(px, aStream); auto splineCudaX = cudax.first; BsplineParams py = prepareBsplineStuff(image.y_num, par.lambda, tolerance); - auto cuday = transferSpline(py); + auto cuday = transferSpline(py, aStream); auto splineCudaY = cuday.first; BsplineParams pz = prepareBsplineStuff(image.z_num, par.lambda, tolerance); - auto cudaz = transferSpline(pz); + auto cudaz = transferSpline(pz, aStream); auto splineCudaZ = cudaz.first; getGradientCuda(image, local_scale_temp, cudaImage.get(), cudaGrad.get(), cudalocal_scale_temp.get(), - splineCudaX, splineCudaY, splineCudaZ, boundary.get(), bspline_offset, par, 0); + splineCudaX, splineCudaY, splineCudaZ, boundary.get(), bspline_offset, par, aStream); } void cudaDownsampledGradient(PixelData &input, PixelData &grad, const float hx, const float hy, const float hz) { - ScopedCudaMemHandler, H2D | D2H> cudaInput(input); - ScopedCudaMemHandler, D2H> cudaGrad(grad); + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D | D2H> cudaInput(input, aStream); + ScopedCudaMemHandler, D2H> cudaGrad(grad, aStream); - runKernelGradient(cudaInput.get(), cudaGrad.get(), input.getDimension(), grad.getDimension(), hx, hy, hz, 0); + runKernelGradient(cudaInput.get(), cudaGrad.get(), input.getDimension(), grad.getDimension(), hx, hy, hz, aStream); } diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp index 1fcf088b..a8ebe1bf 100644 --- a/src/algorithm/ComputeGradientCuda.hpp +++ b/src/algorithm/ComputeGradientCuda.hpp @@ -42,7 +42,7 @@ class GpuProcessingTask { public: - GpuProcessingTask(PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel); + GpuProcessingTask(const PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel); ~GpuProcessingTask(); GpuProcessingTask(GpuProcessingTask&&); diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 64e4c710..2b5c186d 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -563,11 +563,11 @@ template void runLocalIntensityScalePipeline(const PixelData // TODO: should be moved somewhere template void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags, bool boundaryReflect) { - ScopedCudaMemHandler, H2D | D2H> cudaImage(image); - APRTimer timer(true); -// timer.start_timer("GpuDeviceTimeFull"); + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D | D2H> cudaImage(image, aStream); + runMean(cudaImage.get(), image.getDimension(), offset, offset, offset, flags, 0, boundaryReflect); -// timer.stop_timer(); } // explicit instantiation of handled types @@ -577,9 +577,11 @@ template void calcMean(PixelData&, int, TypeOfMeanFlags, bool); template void getLocalIntensityScale(PixelData &image, PixelData &temp, const APRParameters &par) { - ScopedCudaMemHandler, H2D | D2H> cudaImage(image); - ScopedCudaMemHandler, D2H> cudaTemp(temp); + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D | D2H> cudaImage(image, aStream); + ScopedCudaMemHandler, D2H> cudaTemp(temp, aStream); - runLocalIntensityScalePipeline(image, par, cudaImage.get(), cudaTemp.get(), 0); + runLocalIntensityScalePipeline(image, par, cudaImage.get(), cudaTemp.get(), aStream); } template void getLocalIntensityScale(PixelData&, 
PixelData&, const APRParameters&); diff --git a/src/algorithm/LocalIntensityScaleCuda.h b/src/algorithm/LocalIntensityScaleCuda.h index 135e5927..f572d5e5 100644 --- a/src/algorithm/LocalIntensityScaleCuda.h +++ b/src/algorithm/LocalIntensityScaleCuda.h @@ -15,7 +15,6 @@ constexpr TypeOfMeanFlags MEAN_X_DIR = 0x02; constexpr TypeOfMeanFlags MEAN_Z_DIR = 0x04; constexpr TypeOfMeanFlags MEAN_ALL_DIR = MEAN_Y_DIR | MEAN_X_DIR | MEAN_Z_DIR; -// TODO: remember to revert by default boundaryReflect=true (or check with CPU code what is current 'default'). template void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags = MEAN_ALL_DIR, bool boundaryReflect = false); diff --git a/src/algorithm/bsplineXdir.cuh b/src/algorithm/bsplineXdir.cuh index 89fd3fc6..1df52a80 100644 --- a/src/algorithm/bsplineXdir.cuh +++ b/src/algorithm/bsplineXdir.cuh @@ -137,7 +137,7 @@ void runBsplineXdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaSt // access it but this is enough for us to know that somewhere in one on more kernels overflow was detected. bool isErrorDetected = false; { - ScopedCudaMemHandler error(&isErrorDetected, 1); + ScopedCudaMemHandler error(&isErrorDetected, 1, aStream); bsplineXdir <<>>(cudaImage, dim, p, error.get()); } diff --git a/src/algorithm/bsplineYdir.cuh b/src/algorithm/bsplineYdir.cuh index e8aa5bdf..b487cb63 100644 --- a/src/algorithm/bsplineYdir.cuh +++ b/src/algorithm/bsplineYdir.cuh @@ -249,7 +249,7 @@ void runBsplineYdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, float size_t sharedMemSize = (2 /*bc vectors*/) * (p.k0) * sizeof(float) + numOfThreads * (p.k0) * sizeof(float); bool isErrorDetected = false; { - ScopedCudaMemHandler error(&isErrorDetected, 1); + ScopedCudaMemHandler error(&isErrorDetected, 1, aStream); bsplineYdirBoundary <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get()); sharedMemSize = numOfThreads * blockWidth * sizeof(float); bsplineYdirProcess <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get()); diff --git a/src/algorithm/bsplineZdir.cuh b/src/algorithm/bsplineZdir.cuh index c8ba6688..43550ff8 100644 --- a/src/algorithm/bsplineZdir.cuh +++ b/src/algorithm/bsplineZdir.cuh @@ -139,7 +139,7 @@ void runBsplineZdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaSt // access it but this is enough for us to know that somewhere in one on more kernels overflow was detected. 
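// (Same overflow-flag pattern as in bsplineXdir.cuh and bsplineYdir.cuh above: the flag is kept in
//  device memory only for the lifetime of the scoped handler below, and the kernel raises it when
//  rounding a result to the output type would overflow.)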
bool isErrorDetected = false; { - ScopedCudaMemHandler error(&isErrorDetected, 1); + ScopedCudaMemHandler error(&isErrorDetected, 1, aStream); bsplineZdir <<>> (cudaImage, dim, p, error.get()); } diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index d3867ed7..9b68458d 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -725,6 +725,10 @@ public : init(y_num_ds, x_num_ds, z_num_ds, aUsePinnedMemory); } + void initDownsampled(const PixelDataDim &dim, bool aUsePinnedMemory) { + initDownsampled(dim.y, dim.x, dim.z, aUsePinnedMemory); + } + /** * Initializes mesh with size of half of provided dimensions (rounding up if not divisible by 2) and initialize values * @param aSizeOfY @@ -740,6 +744,10 @@ public : initWithValue(y_num_ds, x_num_ds, z_num_ds, aInitVal, aUsePinnedMemory); } + void initDownsampled(const PixelDataDim &dim, T aInitVal, bool aUsePinnedMemory) { + initDownsampled(dim.y, dim.x, dim.z, aInitVal, aUsePinnedMemory); + } + /** * Initializes mesh with size of half of provided mesh dimensions (rounding up if not divisible by 2) * @param aMesh - mesh used to get dimensions diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index ede7ee12..6528227a 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -18,7 +18,7 @@ namespace { // Generate random mesh - keep it large enough to catch all possible computation errors using ImageType = float; - PixelData input_image = getRandInitializedMesh(1000, 1000, 1000, 13); + PixelData input_image = getRandInitializedMesh(100, 100, 100, 13); int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); PixelData grad_temp; // should be a down-sampled image @@ -65,7 +65,61 @@ namespace { EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); } + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_GPT) { + APRTimer timer(true); + + // Generate random mesh - keep it large enough to catch all possible computation errors + using ImageType = float; + constexpr PixelDataDim dim{333, 1000, 333}; + PixelData input_image = getRandInitializedMesh(dim, 99, 0, false); + int maxLevel = ceil(std::log2(dim.maxDimSize())); + PixelData grad_temp; // should be a down-sampled image + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp.initDownsampled(dim,false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + PixelData grad_temp_GPU; // should be a down-sampled image + grad_temp_GPU.initDownsampled(dim, 0, false); + PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp_GPU.initDownsampled(dim, false); + PixelData local_scale_temp2_GPU; + local_scale_temp2_GPU.initDownsampled(dim, false); + + // Prepare parameters + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate bspline on CPU + PixelData mCpuImage(input_image, true); + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + 
LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + timer.stop_timer(); + + + // Calculate bspline on GPU + PixelData mGpuImage(input_image, true); + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + + { + GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); + gpt.doAll(); + } + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + } #endif // APR_USE_CUDA } @@ -73,4 +127,4 @@ namespace { int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -} \ No newline at end of file +} From 4088e9d8bc043af0451807804e9dc0715c8fab89 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 20 Jul 2023 13:31:38 +0200 Subject: [PATCH 27/80] Changes from old branches added + modified to GenInfo instead of APRAccess --- src/algorithm/OVPC.h | 4 +- test/CMakeLists.txt | 1 + test/PullingSchemeTest.cpp | 303 +++++++++++++++++++++++++++++-------- 3 files changed, 246 insertions(+), 62 deletions(-) diff --git a/src/algorithm/OVPC.h b/src/algorithm/OVPC.h index f8e975ac..6925f325 100644 --- a/src/algorithm/OVPC.h +++ b/src/algorithm/OVPC.h @@ -9,7 +9,7 @@ #include #include "data_structures/Mesh/PixelData.hpp" -#include "data_structures/APR/APRAccess.hpp" +#include "data_structures/APR/GenInfo.hpp" #include "algorithm/PullingScheme.hpp" @@ -33,7 +33,7 @@ class OVPC { public: template - OVPC(const APRAccess &aAprAccess, const PixelData &aInputLevels) { + OVPC(const GenInfo &aAprAccess, const PixelData &aInputLevels) { // Level Max is one less since we are working on downsampled version iLevelMax = aAprAccess.l_max - 1; iLevelMin = aAprAccess.l_min; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 193ce405..aeb66421 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -17,6 +17,7 @@ if(APR_USE_CUDA) buildTarget(testComputeGradientCuda ComputeGradientCudaTest.cpp) buildTarget(testLocalIntensityScaleCuda LocalIntensityScaleCudaTest.cpp) buildTarget(testFullPipelineCuda FullPipelineCudaTest.cpp) + buildTarget(testPullingSchemeCuda PullingSchemeTest.cpp) endif() if(APR_BUILD_EXAMPLES) diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index f72897cd..a84fcdf4 100644 --- a/test/PullingSchemeTest.cpp +++ b/test/PullingSchemeTest.cpp @@ -4,13 +4,13 @@ #include #include "data_structures/Mesh/PixelData.hpp" -//TODO: only APRAccess.hpp should be included here but currently because of dependencies it does not work :( -#include "data_structures/APR/APR.hpp" -#include "algorithm/APRConverter.hpp" -//#include "data_structures/APR/APRAccess.hpp" +#include "data_structures/APR/access/APRAccessStructures.hpp" #include "algorithm/PullingScheme.hpp" +#include "algorithm/OVPC.h" #include "TestTools.hpp" + #ifdef APR_USE_CUDA +#include "algorithm/PullingSchemeCuda.hpp" #include "algorithm/ComputeGradientCuda.hpp" #endif @@ -18,7 +18,7 @@ namespace { template PixelData generateLevels(const PixelData &dimsMesh, int maxLevel) { PixelData levels(dimsMesh, false); - for (size_t i = 0; i < levels.mesh.size(); ++i) { + for (int i = 0; i < levels.mesh.size(); ++i) { levels.mesh[i] = ( i/2 ) % (maxLevel + 2); } // std::cout << "LEVELS: " << std::endl; @@ -26,86 +26,269 @@ namespace { return levels; } -// void printParticleCellTree(const std::vector> &particleCellTree) { -// for (int l = 0; l < particleCellTree.size(); ++l) { -// auto &tree = particleCellTree[l]; -// std::cout 
<< "------ 1level=" << l << " " << tree << std::endl; -// tree.printMesh(3,0); -// } -// } + template + void printParticleCellTree(const std::vector> &particleCellTree) { + for (int l = 0; l < particleCellTree.size(); ++l) { + auto &tree = particleCellTree[l]; +// std::cout << "-- level = " << l << ", " << tree << std::endl; + tree.printMeshT(3,0); + } + } - TEST(PullingSchemeTest, Init) { + template + inline int compareParticleCellTrees(const PixelData &expected, const PixelData &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { + int cnt = 0; + int numOfParticles = 0; + for (size_t i = 0; i < expected.mesh.size(); ++i) { + if (expected.mesh[i] < 8) { + if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError || std::isnan(expected.mesh[i]) || + std::isnan(tested.mesh[i])) { + if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { + std::cout << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; + } + cnt++; + } + if (expected.mesh[i] > 0) numOfParticles++; + } + } + std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << " Particles:" << numOfParticles << std::endl; + return cnt; + } - GenInfo aprInfo; + TEST(PullingSchemeTest, NEWvsOLD) { + GenInfo access; + access.l_max = 9; + access.l_min = 1; + access.org_dims[0] = std::pow(2, access.l_max); + access.org_dims[1] = std::pow(2, access.l_max); + access.org_dims[2] = std::pow(2, access.l_max); + int l = access.l_max - 1; - aprInfo.l_max = 4; - aprInfo.l_min = 2; - aprInfo.org_dims[0] = 8; - aprInfo.org_dims[1] = 16; - aprInfo.org_dims[2] = 1; + PixelData levels = getRandInitializedMesh(access.org_dims[0]/2,access.org_dims[1]/2,access.org_dims[2]/2, access.l_max + 1); + PixelData levels2(levels, true); +// float values[] = {1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1}; +// initFromZYXarray(levels, values); - PullingScheme ps; - ps.initialize_particle_cell_tree(aprInfo); - std::vector> &pctree = ps.getParticleCellTree(); - - // TEST: check if zeroed and correct number of levels - ASSERT_EQ(aprInfo.l_max, pctree.size()); // all levels [0, access.level_max - 1] - for (size_t l = 0; l < pctree.size(); ++l) { - auto &tree = pctree[l]; - for (auto &e : tree.mesh) { - ASSERT_EQ(0, e); - } - } +// levels.printMeshT(3, 1); - // Generate mesh with test levels - PixelData levels = generateLevels(pctree[aprInfo.l_max - 1], aprInfo.l_max); + APRTimer t(true); - // Fill particle cell tree with levels - int l_max = aprInfo.l_max - 1; - int l_min = aprInfo.l_min; + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(access); + int l_max = access.l_max - 1; + int l_min = access.l_min; ps.fill(l_max, levels); - - PixelData levelsDS; + PixelData levelsDS; for(int l_ = l_max - 1; l_ >= l_min; l_--){ - //down sample the resolution level k, using a max reduction downsample(levels, levelsDS, [](const float &x, const float &y) -> float { return std::max(x, y); }, [](const float &x) -> float { return x; }, true); ps.fill(l_,levelsDS); levels.swap(levelsDS); } + ps.pulling_scheme_main(); + t.stop_timer(); + + t.start_timer("OVPC1"); + OVPC nps(access, levels2); + t.stop_timer(); + t.start_timer("OVPC2"); + nps.generateTree(); + t.stop_timer(); + +// printParticleCellTree(nps.getParticleCellTree()); +// printParticleCellTree(ps.getParticleCellTree()); + + for (l = l_min; l <= l_max; ++l) + compareParticleCellTrees(ps.getParticleCellTree()[l], 
nps.getParticleCellTree()[l]); + + } + +// TEST(PullingSchemeTest, Init) { +// +// GenInfo access; +// access.l_max = 5; +// access.l_min = 1; +// access.org_dims[0] = 32; +// access.org_dims[1] = 1; +// access.org_dims[2] = 1; // +// PullingScheme ps; +// ps.initialize_particle_cell_tree(access); +// std::vector> &pctree = ps.getParticleCellTree(); +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> Initialized tree:\n"; // printParticleCellTree(pctree); -// ps.fill_neighbours(l_max); -// pctree[l_max].printMesh(3, 0); +// std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; +// +// // TEST: check if zeroed and correct number of levels +// ASSERT_EQ(access.l_max, pctree.size()); // all levels [0, access.level_max - 1] +// for (int l = 0; l < pctree.size(); ++l) { +// auto &tree = pctree[l]; +// for (auto &e : tree.mesh) { +// ASSERT_EQ(0, e); +// } +// } +// +// // Generate mesh with test levels +// PixelData levels(pctree.back(), false);// = generateLevels(pctree[access.l_max - 1], access.l_max); +//// float values[] = {4, 1, 1, 1, 1, 1, 1, 2}; +// float values[] = {1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1}; +// initFromZYXarray(levels, values); +// +// +// OVPC nps(access, levels); +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS1:\n"; +// printParticleCellTree(nps.getParticleCellTree()); +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS1:\n"; +// nps.generateTree(); +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS2:\n"; +// printParticleCellTree(nps.getParticleCellTree()); +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS2:\n"; +// // Fill particle cell tree with levels +// int l_max = access.l_max - 1; +// int l_min = access.l_min; +// ps.fill(l_max, levels); +// +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> LEVELS:\n"; +// levels.printMeshT(3,0); +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> LEVELS:\n"; +// +// PixelData levelsDS; +// for(int l_ = l_max - 1; l_ >= l_min; l_--){ +// //down sample the resolution level k, using a max reduction +// downsample(levels, levelsDS, +// [](const float &x, const float &y) -> float { return std::max(x, y); }, +// [](const float &x) -> float { return x; }, true); +// levelsDS.printMeshT(3, 0); +// ps.fill(l_,levelsDS); +// levelsDS.printMeshT(3,0); +// levels.swap(levelsDS); +// } +// +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> Filled tree:\n"; +// printParticleCellTree(pctree); +// std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; +// +//// ps.fill_neighbours(l_max); +//// pctree[l_max].printMesh(3, 0); +// +// // ps.pulling_scheme_main(); +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> MAIN tree:\n"; // printParticleCellTree(pctree); - } +// std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; +// +// access.initialize_structure_from_particle_cell_tree(false, ps.getParticleCellTree()); +// std::cout << "NUM OF PARTICLES: " << access.get_total_number_particles() << std::endl; +// +// +// APRIterator apr_iterator(access); +// std::cout << "Total number of particles: " << apr_iterator.total_number_particles() << std::endl; +// +// int prev = 0; +// for (unsigned int level = apr_iterator.level_min(); level <= apr_iterator.level_max(); ++level) { +// std::cout << "Level: " << level << std::endl; +// int w = (int) (std::pow(2, 5-level) * 3); +// for (int z = 0; z < apr_iterator.spatial_index_z_max(level); ++z) { +// for (int x = 0; x < apr_iterator.spatial_index_x_max(level); ++x) { +// for (apr_iterator.set_new_lzx(level, z, x); apr_iterator.global_index() < apr_iterator.end_index; apr_iterator.set_iterator_to_particle_next_particle()) { +// for 
(int i = prev; i < apr_iterator.y(); ++i ) std::cout << std::setw(w) << "."; +// std::cout << std::setw(w) << apr_iterator.y(); +// prev = apr_iterator.y() + 1; +// } +// for (int pp = prev; pp < apr_iterator.spatial_index_y_max(level); ++pp) +// std::cout << std::setw(w) << "."; +// +// prev = 0; +// std::cout << std::endl; +// } +// std::cout << std::endl; +// } +// } +// +// } + #ifdef APR_USE_CUDA - TEST(PullingSchemeTest, computeLevels) { - using ImgType = float; - const int maxLevel = 3; - const float relError = 0.1; +// TEST(PullingSchemeTest, computeLevels) { +// using ImgType = float; +// const int maxLevel = 3; +// const float relError = 0.1; +// +// PixelData grad = getRandInitializedMesh(10, 20, 33); +// PixelData localIntensityScaleCpu = getRandInitializedMesh(10, 20, 33); +// +// PixelData localIntensityScaleGpu(localIntensityScaleCpu, true); +// PixelData elo(localIntensityScaleCpu, true); +// APRTimer timer(true); +// +// timer.start_timer("CPU Levels"); +// APRConverter().computeLevels(grad, localIntensityScaleCpu, maxLevel, relError); +// timer.stop_timer(); +// +// timer.start_timer("GPU Levels"); +// computeLevelsCuda(grad, localIntensityScaleGpu, maxLevel, relError); +// timer.stop_timer(); +// +// EXPECT_EQ(compareMeshes(localIntensityScaleCpu, localIntensityScaleGpu), 0); +// } - PixelData grad = getRandInitializedMesh(10, 20, 33); - PixelData localIntensityScaleCpu = getRandInitializedMesh(10, 20, 33); - PixelData localIntensityScaleGpu(localIntensityScaleCpu, true); - PixelData elo(localIntensityScaleCpu, true); - APRTimer timer(true); - LocalParticleCellSet localParticleCellSet; + TEST(PullingSchemeTest, DS) { + GenInfo access; + access.l_max = 11; + access.l_min = 1; + access.org_dims[0] = std::pow(2, access.l_max)/2; + access.org_dims[1] = std::pow(2, access.l_max)/2; + access.org_dims[2] = std::pow(2, access.l_max); - timer.start_timer("CPU PS FULL"); - localParticleCellSet.computeLevels(grad, localIntensityScaleCpu, maxLevel, relError,1,1,1); - timer.stop_timer(); - timer.start_timer("GPU PS FULL"); - computeLevelsCuda(grad, localIntensityScaleGpu, maxLevel, relError); - timer.stop_timer(); + PixelData levels = getRandInitializedMesh(access.org_dims[0]/2,access.org_dims[1]/2,access.org_dims[2]/2, access.l_max + 1); + PixelData levels2(levels, true); + + // PixelData levels(16,1,1); +// float values[] = {4, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 2}; +// initFromZYXarray(levels, values); + + APRTimer t(true); + if (false) { + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(access); + int l_max = access.l_max - 1; + int l_min = access.l_min; + ps.fill(l_max, levels2); + PixelData levelsDS; + for (int l_ = l_max - 1; l_ >= l_min; l_--) { + downsample(levels, levelsDS, + [](const float &x, const float &y) -> float { return std::max(x, y); }, + [](const float &x) -> float { return x; }, true); + ps.fill(l_, levelsDS); + levels2.swap(levelsDS); + } + t.stop_timer(); + } + { + t.start_timer("CUDA"); + int levelMax = access.l_max - 1; + int levelMin = access.l_min; + PixelData ds(levels.y_num, levels.x_num, levels.z_num * (levelMax - levelMin + 1), 0); + std::cout << levels << std::endl; +// std::cout << ds << std::endl; + computeOVPC(levels, ds, levelMin, levelMax); +// ds.printMeshT(3,1); + t.stop_timer(); + } + { + t.start_timer("OVPC1"); + OVPC nps(access, levels); + nps.generateTree(); + t.stop_timer(); +// printParticleCellTree(nps.getParticleCellTree()); + } + } - EXPECT_EQ(compareMeshes(localIntensityScaleCpu, 
localIntensityScaleGpu), 0); - } #endif } From b8f250404dfe697eaea8a5730927f2dfde00c668 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 21 Jul 2023 13:00:56 +0200 Subject: [PATCH 28/80] Added debug printout to GenInfo --- src/data_structures/APR/GenInfo.hpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/data_structures/APR/GenInfo.hpp b/src/data_structures/APR/GenInfo.hpp index f8fd090e..ba8ccb3a 100644 --- a/src/data_structures/APR/GenInfo.hpp +++ b/src/data_structures/APR/GenInfo.hpp @@ -5,6 +5,11 @@ #ifndef LIBAPR_GENINFO_HPP #define LIBAPR_GENINFO_HPP + +#include +#include +#include + //Note this function sets up the domain for the APR for a given input size. class GenInfo { @@ -97,6 +102,25 @@ class GenInfo { z_num[l] = ceil(z_org / cellSize); } } + + friend std::ostream & operator<<(std::ostream &os, const GenInfo &gi) { + os << "GenInfo {\n"; + os << " Original dimensions(y/x/z): [" << gi.org_dims[0] << ", " << gi.org_dims[1] << ", " << gi.org_dims[2] << "]\n"; + os << " Number of dimensions: " << static_cast(gi.number_dimensions) << "\n"; + os << " l_min, l_max: {" << gi.l_min << " - " << gi.l_max << "}\n"; + os << " total number of particles: " << gi.total_number_particles << "\n"; + os << " y_num, x_num, z_num:\n"; + for (int l = gi.l_min; l <= gi.l_max; ++l) { + os << " level [" << l << "] = " << gi.y_num[l] << ", " << gi.x_num[l] << ", " << gi.z_num[l] << "\n"; + } + os << " level_size:\n"; + for (int l = gi.l_min; l <= gi.l_max; ++l) { + os << " level " << l << ": " << gi.level_size[l] << "\n"; + } + os << "}"; + + return os; + } }; From 6400a9a83a07343cdc1b7c979d4bf1478e858137 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 11 Aug 2023 15:35:05 +0200 Subject: [PATCH 29/80] Moved old CUDA tests to new file --- test/CMakeLists.txt | 2 +- test/PullingSchemeCudaTest.cpp | 93 ++++++++++++++++++++++++++++++++++ test/PullingSchemeTest.cpp | 84 ------------------------------ 3 files changed, 94 insertions(+), 85 deletions(-) create mode 100644 test/PullingSchemeCudaTest.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index aeb66421..e1f2817e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -17,7 +17,7 @@ if(APR_USE_CUDA) buildTarget(testComputeGradientCuda ComputeGradientCudaTest.cpp) buildTarget(testLocalIntensityScaleCuda LocalIntensityScaleCudaTest.cpp) buildTarget(testFullPipelineCuda FullPipelineCudaTest.cpp) - buildTarget(testPullingSchemeCuda PullingSchemeTest.cpp) + buildTarget(testPullingSchemeCuda PullingSchemeCudaTest.cpp) endif() if(APR_BUILD_EXAMPLES) diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp new file mode 100644 index 00000000..5dedc414 --- /dev/null +++ b/test/PullingSchemeCudaTest.cpp @@ -0,0 +1,93 @@ +#include + +#include "algorithm/PullingScheme.hpp" +#include "algorithm/OVPC.h" + +#include "algorithm/PullingSchemeCuda.hpp" +#include "algorithm/ComputeGradientCuda.hpp" + +#include "TestTools.hpp" + +// TEST(PullingSchemeTest, computeLevels) { +// using ImgType = float; +// const int maxLevel = 3; +// const float relError = 0.1; +// +// PixelData grad = getRandInitializedMesh(10, 20, 33); +// PixelData localIntensityScaleCpu = getRandInitializedMesh(10, 20, 33); +// +// PixelData localIntensityScaleGpu(localIntensityScaleCpu, true); +// PixelData elo(localIntensityScaleCpu, true); +// APRTimer timer(true); +// +// timer.start_timer("CPU Levels"); +// APRConverter().computeLevels(grad, localIntensityScaleCpu, maxLevel, relError); +// 
timer.stop_timer(); +// +// timer.start_timer("GPU Levels"); +// computeLevelsCuda(grad, localIntensityScaleGpu, maxLevel, relError); +// timer.stop_timer(); +// +// EXPECT_EQ(compareMeshes(localIntensityScaleCpu, localIntensityScaleGpu), 0); +// } + + + +TEST(PullingSchemeTest, DS) { + GenInfo access; + access.l_max = 11; + access.l_min = 1; + access.org_dims[0] = std::pow(2, access.l_max)/2; + access.org_dims[1] = std::pow(2, access.l_max)/2; + access.org_dims[2] = std::pow(2, access.l_max); + + + PixelData levels = getRandInitializedMesh(access.org_dims[0]/2,access.org_dims[1]/2,access.org_dims[2]/2, access.l_max + 1); + PixelData levels2(levels, true); + + // PixelData levels(16,1,1); + // float values[] = {4, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 2}; + // initFromZYXarray(levels, values); + + APRTimer t(true); + if (false) { + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(access); + int l_max = access.l_max - 1; + int l_min = access.l_min; + ps.fill(l_max, levels2); + PixelData levelsDS; + for (int l_ = l_max - 1; l_ >= l_min; l_--) { + downsample(levels, levelsDS, + [](const float &x, const float &y) -> float { return std::max(x, y); }, + [](const float &x) -> float { return x; }, true); + ps.fill(l_, levelsDS); + levels2.swap(levelsDS); + } + t.stop_timer(); + } + { + t.start_timer("CUDA"); + int levelMax = access.l_max - 1; + int levelMin = access.l_min; + PixelData ds(levels.y_num, levels.x_num, levels.z_num * (levelMax - levelMin + 1), 0); + std::cout << levels << std::endl; + // std::cout << ds << std::endl; + computeOVPC(levels, ds, levelMin, levelMax); + // ds.printMeshT(3,1); + t.stop_timer(); + } + { + t.start_timer("OVPC1"); + OVPC nps(access, levels); + nps.generateTree(); + t.stop_timer(); + // printParticleCellTree(nps.getParticleCellTree()); + } +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index a84fcdf4..50fef13c 100644 --- a/test/PullingSchemeTest.cpp +++ b/test/PullingSchemeTest.cpp @@ -9,10 +9,6 @@ #include "algorithm/OVPC.h" #include "TestTools.hpp" -#ifdef APR_USE_CUDA -#include "algorithm/PullingSchemeCuda.hpp" -#include "algorithm/ComputeGradientCuda.hpp" -#endif namespace { template @@ -209,87 +205,7 @@ namespace { // // } -#ifdef APR_USE_CUDA -// TEST(PullingSchemeTest, computeLevels) { -// using ImgType = float; -// const int maxLevel = 3; -// const float relError = 0.1; -// -// PixelData grad = getRandInitializedMesh(10, 20, 33); -// PixelData localIntensityScaleCpu = getRandInitializedMesh(10, 20, 33); -// -// PixelData localIntensityScaleGpu(localIntensityScaleCpu, true); -// PixelData elo(localIntensityScaleCpu, true); -// APRTimer timer(true); -// -// timer.start_timer("CPU Levels"); -// APRConverter().computeLevels(grad, localIntensityScaleCpu, maxLevel, relError); -// timer.stop_timer(); -// -// timer.start_timer("GPU Levels"); -// computeLevelsCuda(grad, localIntensityScaleGpu, maxLevel, relError); -// timer.stop_timer(); -// -// EXPECT_EQ(compareMeshes(localIntensityScaleCpu, localIntensityScaleGpu), 0); -// } - - - - TEST(PullingSchemeTest, DS) { - GenInfo access; - access.l_max = 11; - access.l_min = 1; - access.org_dims[0] = std::pow(2, access.l_max)/2; - access.org_dims[1] = std::pow(2, access.l_max)/2; - access.org_dims[2] = std::pow(2, access.l_max); - - - PixelData levels = getRandInitializedMesh(access.org_dims[0]/2,access.org_dims[1]/2,access.org_dims[2]/2, access.l_max + 1); 
- PixelData levels2(levels, true); - - // PixelData levels(16,1,1); -// float values[] = {4, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 2}; -// initFromZYXarray(levels, values); - - APRTimer t(true); - if (false) { - t.start_timer("PS1"); - PullingScheme ps; - ps.initialize_particle_cell_tree(access); - int l_max = access.l_max - 1; - int l_min = access.l_min; - ps.fill(l_max, levels2); - PixelData levelsDS; - for (int l_ = l_max - 1; l_ >= l_min; l_--) { - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, true); - ps.fill(l_, levelsDS); - levels2.swap(levelsDS); - } - t.stop_timer(); - } - { - t.start_timer("CUDA"); - int levelMax = access.l_max - 1; - int levelMin = access.l_min; - PixelData ds(levels.y_num, levels.x_num, levels.z_num * (levelMax - levelMin + 1), 0); - std::cout << levels << std::endl; -// std::cout << ds << std::endl; - computeOVPC(levels, ds, levelMin, levelMax); -// ds.printMeshT(3,1); - t.stop_timer(); - } - { - t.start_timer("OVPC1"); - OVPC nps(access, levels); - nps.generateTree(); - t.stop_timer(); -// printParticleCellTree(nps.getParticleCellTree()); - } - } -#endif } int main(int argc, char **argv) { From 4b35b8eac84364ca56b2ea3fecd085c722556101 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 11 Aug 2023 15:35:49 +0200 Subject: [PATCH 30/80] Moved old CUDA tests to new file --- test/PullingSchemeCudaTest.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index 5dedc414..c956a53f 100644 --- a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -8,7 +8,7 @@ #include "TestTools.hpp" -// TEST(PullingSchemeTest, computeLevels) { +// TEST(PullingSchemeCudaTest, computeLevels) { // using ImgType = float; // const int maxLevel = 3; // const float relError = 0.1; @@ -33,7 +33,7 @@ -TEST(PullingSchemeTest, DS) { +TEST(PullingSchemeCudaTest, DS) { GenInfo access; access.l_max = 11; access.l_min = 1; From 1ed5d4f91ca708ef75f408a348d40fb9578ac415 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 30 Oct 2023 12:43:34 +0100 Subject: [PATCH 31/80] Added CUDA_ARCHITECTURES set to OFF (keep current behaviour) to suppress warning --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4513e07f..d0aee009 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -237,6 +237,7 @@ if(APR_BUILD_STATIC_LIB) # generate static library used as a intermediate step in generating fat lib set(STATIC_TARGET_NAME staticLib) add_library(${STATIC_TARGET_NAME} STATIC $ ${APR_CUDA_SOURCE_FILES}) + set_property(TARGET ${STATIC_TARGET_NAME} PROPERTY CUDA_ARCHITECTURES OFF) target_compile_features(${STATIC_TARGET_NAME} PUBLIC cxx_std_14) set_target_properties(${STATIC_TARGET_NAME} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) set_target_properties(${STATIC_TARGET_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION OFF) @@ -258,7 +259,7 @@ if(APR_BUILD_SHARED_LIB) # generate fat shared library set(SHARED_TARGET_NAME sharedLib) add_library(${SHARED_TARGET_NAME} SHARED $ ${APR_CUDA_SOURCE_FILES}) - + set_property(TARGET ${SHARED_TARGET_NAME} PROPERTY CUDA_ARCHITECTURES OFF) target_include_directories(${SHARED_TARGET_NAME} PUBLIC $ $) set_target_properties(${SHARED_TARGET_NAME} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) set_target_properties(${SHARED_TARGET_NAME} PROPERTIES LIBRARY_OUTPUT_NAME ${LIBRARY_NAME}) From 
93ac1206537dec79cc193571134474581ecaa410 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 8 Nov 2023 17:07:39 +0100 Subject: [PATCH 32/80] Temporary test updated to print particles using LinearAccess iterator --- src/data_structures/APR/GenInfo.hpp | 10 ++ test/PullingSchemeTest.cpp | 204 +++++++++++++++++++++++++++- test/TestTools.hpp | 2 +- 3 files changed, 214 insertions(+), 2 deletions(-) diff --git a/src/data_structures/APR/GenInfo.hpp b/src/data_structures/APR/GenInfo.hpp index ba8ccb3a..e506100a 100644 --- a/src/data_structures/APR/GenInfo.hpp +++ b/src/data_structures/APR/GenInfo.hpp @@ -34,6 +34,11 @@ class GenInfo { std::vector level_size; // precomputation of the size of each level, used by the iterators. + //initialize the information given the original dimensions + void init(const PixelDataDim &dim) { + init(dim.y, dim.x, dim.z); + } + //initialize the information given the original dimensions void init(uint64_t y_org,uint64_t x_org,uint64_t z_org){ @@ -69,6 +74,11 @@ class GenInfo { } } + //initialize the information given the original dimensions + void init_tree(const PixelDataDim &dim){ + init_tree(dim.y, dim.x, dim.z); + } + //initialize the information given the original dimensions void init_tree(uint64_t y_org,uint64_t x_org,uint64_t z_org){ diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index 50fef13c..83d97366 100644 --- a/test/PullingSchemeTest.cpp +++ b/test/PullingSchemeTest.cpp @@ -8,6 +8,7 @@ #include "algorithm/PullingScheme.hpp" #include "algorithm/OVPC.h" #include "TestTools.hpp" +#include "algorithm/APRConverter.hpp" namespace { @@ -24,7 +25,7 @@ namespace { template void printParticleCellTree(const std::vector> &particleCellTree) { - for (int l = 0; l < particleCellTree.size(); ++l) { + for (uint64_t l = 0; l < particleCellTree.size(); ++l) { auto &tree = particleCellTree[l]; // std::cout << "-- level = " << l << ", " << tree << std::endl; tree.printMeshT(3,0); @@ -51,6 +52,207 @@ namespace { return cnt; } + // ------------------------------------------------------------------------ + + TEST(PullingSchemeTest, DeleteMeAfterDeevelopment) { + // TODO: delete me after development + // Full 'get apr' pipeline to test imp. 
on different stages + // Useful during debugging and can be removed once finished + + // Prepare input data (image) + int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; + +// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; + // PS values for above 'image': int values[] = {4,0,0,0, 0,0,0,0, 4,0,0,0, 0,0,0,0}; + + int len = sizeof(values)/sizeof(int); + PixelData data(len, 1, 1); + initFromZYXarray(data, values); + std::cout << "----- Input image:\n"; + data.printMeshT(3, 1); + + // Produce APR + APR apr; + APRConverter aprConverter; + aprConverter.par.rel_error = 0.01; + aprConverter.par.lambda = 0.1; + aprConverter.get_apr(apr, data); + + // Print information about APR and all particles + std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; + for (int l = apr.level_min(); l <= apr.level_max(); ++l) { + std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; + } + std::cout << "APR particles z x y level:\n"; + auto it = apr.iterator(); + for (int level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + } + } + } + } + std::cout << std::endl; + + // Sample input + ParticleData particleIntensities; + particleIntensities.sample_image(apr, data); + + // Reconstruct image from particles + PixelData reconstructImg; + APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); + std::cout << "----- Reconstructed image:"< levelImg; + APRReconstruction::reconstruct_level(apr, levelImg); + std::cout << "----- Image levels:" << std::endl; + levelImg.printMeshT(3, 1); + + // Show intensities and levels of each particle + std::cout << "----- Particle intensities:\n"; + for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; + std::cout << std::endl; + + particleIntensities.fill_with_levels(apr); + + std::cout << "----- Particle levels:\n"; + for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; + std::cout << std::endl; + + // Show some general information about generated APR + double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); + std::cout << std::endl; + std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; + std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; + } + + TEST(PullingSchemeTest, PullingScheme1D) { + + //int values[] = {4,4,1,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 }; +// int values[] = {3,2,2,2, 2,2,1,1}; +// int values[] = {3,0,0,0, 0,0,0,0}; +// int values[] = {3,0,0,0, 0,0,0,0}; +// int values[] = {4,0,0,0, 0,0,0,0, 4,0,0,0, 0,0,0,0}; + int values[] = {0,2,2,3, 4,5,6,7}; + int len = sizeof(values)/sizeof(int); + PixelData levels(len ,1, 1); + initFromZYXarray(levels, values); + levels.printMeshT(3, 1); + + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y * 2, dim.x, dim.z); // time two in y-direction since PS container is downsized. 
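+        // The 'levels' mesh handed to the Pulling Scheme is the downsampled particle-cell
+        // representation, so the image it describes is twice as large along y; GenInfo is
+        // therefore initialized with dim.y * 2, while the singleton x and z dimensions stay as-is.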
+ std::cout << gi << std::endl; + + APRTimer t(true); + + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + int l_max = gi.l_max - 1; + int l_min = gi.l_min; + std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; + ps.fill(l_max, levels); + std::cout << "LEVEL: " << l_max << std::endl; levels.printMeshT(3, 1); + PixelData levelsDS; + for(int l = l_max - 1; l >= l_min; l--){ + downsample(levels, levelsDS, + [](const float &x, const float &y) -> float { return std::max(x, y); }, + [](const float &x) -> float { return x; }, true); + ps.fill(l, levelsDS); + std::cout << "LEVEL: " << l << std::endl; levelsDS.printMeshT(3, 1); + levels.swap(levelsDS); + } + printParticleCellTree(ps.getParticleCellTree()); + ps.pulling_scheme_main(); + t.stop_timer(); + + std::cout << "----------PS:\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "-------------\n"; + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + std::cout << "1\n"; + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + std::cout << "2\n"; + LinearIterator it(linearAccess, gi); + + std::cout << "===========================\n"; + for (int level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + } + } + } + } + std::cout << std::endl; + } + + TEST(PullingSchemeTest, Simple) { + GenInfo gi; + // TODO: Investigate why OVPC fails if one of the dimension is equal to 1 + // Investigate why sub-dimension in printParticleCellTree is different in OVPC nad PS + gi.init(8, 1, 2); + + std::cout << gi << std::endl; + + PixelData levels = getRandInitializedMesh( + std::ceil(gi.org_dims[0]/2), + std::ceil(gi.org_dims[1]/2), + std::ceil(gi.org_dims[2]/2), + gi.l_max + 1); + PixelData levels2(levels, true); +// float values[] = {1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1}; +// initFromZYXarray(levels, values); + +// levels.printMeshT(3, 1); + + APRTimer t(true); + + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + int l_max = gi.l_max - 1; + int l_min = gi.l_min; + ps.fill(l_max, levels); + PixelData levelsDS; + for(int l_ = l_max - 1; l_ >= l_min; l_--){ + downsample(levels, levelsDS, + [](const float &x, const float &y) -> float { return std::max(x, y); }, + [](const float &x) -> float { return x; }, true); + ps.fill(l_,levelsDS); + levels.swap(levelsDS); + } + ps.pulling_scheme_main(); + t.stop_timer(); + + t.start_timer("OVPC1"); + OVPC nps(gi, levels2); + t.stop_timer(); + t.start_timer("OVPC2"); + nps.generateTree(); + t.stop_timer(); + + std::cout << "----------OVPC:\n"; + printParticleCellTree(nps.getParticleCellTree()); + std::cout << "----------PS:\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "-------------\n"; + + for (int l = l_min; l <= l_max; ++l) + compareParticleCellTrees(ps.getParticleCellTree()[l], nps.getParticleCellTree()[l]); + + } + + TEST(PullingSchemeTest, NEWvsOLD) { GenInfo access; access.l_max = 9; diff --git a/test/TestTools.hpp b/test/TestTools.hpp index b533674d..491599aa 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -46,7 +46,7 @@ inline bool compare(PixelData &mesh, const float *data, const 
float epsilon) } template -inline bool initFromZYXarray(PixelData &mesh, T *data) { +inline bool initFromZYXarray(PixelData &mesh, const T *data) { size_t dataIdx = 0; for (int z = 0; z < mesh.z_num; ++z) { for (int y = 0; y < mesh.y_num; ++y) { From 70543d2e6e3677616e6e0a192e809c9633d14d01 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 30 Nov 2023 11:28:44 +0100 Subject: [PATCH 33/80] TODO about some problems with edge case --- src/data_structures/APR/access/LinearAccess.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/data_structures/APR/access/LinearAccess.hpp b/src/data_structures/APR/access/LinearAccess.hpp index 5f92c0ef..b170fd2c 100644 --- a/src/data_structures/APR/access/LinearAccess.hpp +++ b/src/data_structures/APR/access/LinearAccess.hpp @@ -226,6 +226,10 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet initialize_xz_linear(); //edge case + // TODO: Don't know why we need that edge case but it would be good if it run properly + // For example 'genInfo->total_number_particles' is not set, maybe other values are not set either but + // it need to be investigated or this edge case removed (?) - if level_max() <= 2 then there are no many particles + // anyway so any code should be fast enough... if(level_max()<=2){ // For performance reasons and clarity of the code, it doesn't make sense here to handle these cases. Below assumes there is atleast levels <=2; From dd3d448e0851807b1852b33cb4dd29dde4c65566 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 6 Dec 2023 17:22:25 +0100 Subject: [PATCH 34/80] Fixed test where out of range idx was given --- test/ComputeGradientTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index ca60fca3..0d822357 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -369,7 +369,7 @@ namespace { 0.0000000000, 0.2193282992, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.2930246294, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000 }; // put values in corners - m(1, 1, 4) = 1; + m(0, 1, 2) = 1; // Calculate bspline on CPU PixelData mCpu(m, true); From 1a112ecf9515524c24b90980bad7663fc13a0abe Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 13 Dec 2023 18:28:38 +0100 Subject: [PATCH 35/80] Pulling Scheme tests (and OVPC on CPU) finished. --- src/data_structures/Mesh/PixelData.hpp | 10 + test/PullingSchemeCudaTest.cpp | 144 +++++ test/PullingSchemeTest.cpp | 799 +++++++++++++++---------- 3 files changed, 626 insertions(+), 327 deletions(-) diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index 9b68458d..e0a037f0 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -513,6 +513,16 @@ public : * @return element @(y, x, z) */ T& operator()(int y, int x, int z) { + // TODO: In number of places during running tests below check shows problems. + // Investigate and try to fix. Such check in future probably should be permanent + // to discover all problems rather than hiding them. +#ifndef NDEBUG // with Cmake we need to use double neg. 
condition since there is not ifdef DEBUG defined :( + if ((y < 0 || y >= y_num) || (x < 0 || x >= x_num) || (z < 0 || z >= z_num)) { +// std::cerr << "Provided coordinates=(" << y << ", " << x << ", " << z; +// std::cerr << ") while PixelData size=(" << y_num << ", " << x_num << ", " << z_num << ")" << std::endl; +// throw std::runtime_error("Provided (y,x,z) coordinates are out of range!"); + } +#endif y = std::min(y, y_num-1); x = std::min(x, x_num-1); z = std::min(z, z_num-1); diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index c956a53f..5ca6f3cc 100644 --- a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -5,9 +5,153 @@ #include "algorithm/PullingSchemeCuda.hpp" #include "algorithm/ComputeGradientCuda.hpp" +#include "algorithm/APRConverter.hpp" #include "TestTools.hpp" + + +TEST(PullingSchemeTest, DeleteMeAfterDevelopment) { + // TODO: delete me after development + // Full 'get apr' pipeline to test imp. on different stages + // Useful during debugging and can be removed once finished + + // Prepare input data (image) + int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; + // PS input values = 5 0 0 0 0 0 0 0 + +// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; +// PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0}; + + int len = sizeof(values)/sizeof(int); + PixelData data(len, 1, 1); + initFromZYXarray(data, values); + std::cout << "----- Input image:\n"; + data.printMeshT(3, 1); + + // Produce APR + APR apr; + APRConverter aprConverter; + aprConverter.par.rel_error = 0.1; + aprConverter.par.lambda = 0.1; + aprConverter.par.sigma_th = 0.0001; + aprConverter.par.neighborhood_optimization = true; + aprConverter.get_apr(apr, data); + + // Print information about APR and all particles + std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; + for (int l = apr.level_min(); l <= apr.level_max(); ++l) { + std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; + } + std::cout << "APR particles z x y level:\n"; + auto it = apr.iterator(); + for (int level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + } + } + } + } + std::cout << std::endl; + + // Sample input + ParticleData particleIntensities; + particleIntensities.sample_image(apr, data); + + // Reconstruct image from particles + PixelData reconstructImg; + APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); + std::cout << "----- Reconstructed image:"< levelImg; + APRReconstruction::reconstruct_level(apr, levelImg); + std::cout << "----- Image levels:" << std::endl; + levelImg.printMeshT(3, 1); + + // Show intensities and levels of each particle + std::cout << "----- Particle intensities:\n"; + for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; + std::cout << std::endl; + + particleIntensities.fill_with_levels(apr); + + std::cout << "----- Particle levels:\n"; + for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; + std::cout << std::endl; + + // Show some general information about generated APR + double computational_ratio = (1.0 * apr.org_dims(0) * 
apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); + std::cout << std::endl; + std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; + std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; +} + + + +TEST(PullingSchemeTest, PullingScheme1D) { + + int values[] = {0,0,0,5, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + + PixelData levels(3,3,3, 0); + levels(2,2,2) = 11; + +// initFromZYXarray(levels, values); + levels.printMeshT(3, 1); + + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + std::cout << "Levels dim: " << dim << std::endl; + gi.init(dim.y * 2, dim.x * 2, dim.z * 2); // time two in y-direction since PS container is downsized. + std::cout << gi << std::endl; + + APRTimer t(true); + + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + int l_max = gi.l_max - 1; + int l_min = gi.l_min; + std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; + + fillPS(ps, levels); + + std::cout << "---------- Filled PS tree\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "---------------\n"; + + ps.pulling_scheme_main(); + t.stop_timer(); + + std::cout << "----------PS:\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "-------------\n"; + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + std::cout << "1\n"; + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + std::cout << "2\n"; + LinearIterator it(linearAccess, gi); + + std::cout << "===========================\n"; + for (int level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + } + } + } + } + std::cout << std::endl; +} + // TEST(PullingSchemeCudaTest, computeLevels) { // using ImgType = float; // const int maxLevel = 3; diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index 83d97366..e1347b1c 100644 --- a/test/PullingSchemeTest.cpp +++ b/test/PullingSchemeTest.cpp @@ -8,21 +8,19 @@ #include "algorithm/PullingScheme.hpp" #include "algorithm/OVPC.h" #include "TestTools.hpp" -#include "algorithm/APRConverter.hpp" + namespace { - template - PixelData generateLevels(const PixelData &dimsMesh, int maxLevel) { - PixelData levels(dimsMesh, false); - for (int i = 0; i < levels.mesh.size(); ++i) { - levels.mesh[i] = ( i/2 ) % (maxLevel + 2); - } -// std::cout << "LEVELS: " << std::endl; - levels.printMesh(3, 0); - return levels; - } + // ================================================================================================================= + // ======== Some test helpers + // ================================================================================================================= + + /** + * Prints PCT + * @param particleCellTree + */ template void printParticleCellTree(const std::vector> &particleCellTree) { for (uint64_t l = 0; l < particleCellTree.size(); ++l) { @@ -32,381 +30,528 @@ namespace { } } - template - inline int compareParticleCellTrees(const PixelData &expected, const PixelData &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { - int cnt = 0; - int 
numOfParticles = 0; - for (size_t i = 0; i < expected.mesh.size(); ++i) { - if (expected.mesh[i] < 8) { - if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError || std::isnan(expected.mesh[i]) || - std::isnan(tested.mesh[i])) { - if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; - } - cnt++; + // Class for storing expected values for one element of Particle Cell Tree (output of Pulling Scheme) + class LevelData { + public: + int level; + int y; + int x; + int z; + uint8_t expectedType; // seed, boundary, filler... + }; + + /** + * Verify computed Particle Cell Tree (PCT) vs expected values + * Expected values should list all data for types=1,2,3 (seed, boundary filler) which are used to generate particles: + * {levels, y,x,z(position), type} + * All other values are ignored (and used by Pulling Scheme (PS) only for intermediate calculations) + * @param aPCT - PCT produces by PS (note: values in PCT will be changed during verification!) + * @param expectedValues expected values + * @return true if correct, false otherwise + */ + template + bool verifyParticleCellTree(std::vector> &aPCT, const std::vector &expectedValues) { + + const uint8_t AlreadyCheckedMark = 255; + const uint8_t MaxValueOfImportantType = FILLER_TYPE; // All types above are used by PS during computation phase only + + for (const auto &r : expectedValues) { + // std::cout << r.level << " " << r.y << "," << r.x << "," << r.z << " " << (int)r.expectedType << std::endl; + + auto &v = aPCT[r.level](r.y, r.x, r.z); + // Add dim. checks for accessing pct + if (v == r.expectedType) { + v = AlreadyCheckedMark; } - if (expected.mesh[i] > 0) numOfParticles++; + else { + std::cout << "Error! Data at (" << r.y << "," << r.x << "," << r.z << ") expected = " << (int)r.expectedType << " got = " << (int)v << std::endl; + return false; } } - std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << " Particles:" << numOfParticles << std::endl; - return cnt; - } - - // ------------------------------------------------------------------------ - - TEST(PullingSchemeTest, DeleteMeAfterDeevelopment) { - // TODO: delete me after development - // Full 'get apr' pipeline to test imp. on different stages - // Useful during debugging and can be removed once finished - // Prepare input data (image) - int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; + for (int level = 0; level < aPCT.size(); level++) { + auto &d = aPCT[level]; + auto y_num = d.y_num; + auto x_num = d.x_num; + auto z_num = d.z_num; + + for (int j = 0; j < z_num; j++) { + for (int i = 0; i < x_num; i++) { + for (int k = 0; k < y_num; k++) { + const auto &v = d(k, i, j); + if (v != AlreadyCheckedMark && v <= MaxValueOfImportantType && v > 0) { + std::cout << "Error! Data on level = " << level << " at (" << k << "," << i << "," << j << ") with value = " << (int)v << " not verified or bad!" 
<< std::endl; + return false; + } + } + } + } + } -// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; - // PS values for above 'image': int values[] = {4,0,0,0, 0,0,0,0, 4,0,0,0, 0,0,0,0}; + return true; + } - int len = sizeof(values)/sizeof(int); - PixelData data(len, 1, 1); - initFromZYXarray(data, values); - std::cout << "----- Input image:\n"; - data.printMeshT(3, 1); - - // Produce APR - APR apr; - APRConverter aprConverter; - aprConverter.par.rel_error = 0.01; - aprConverter.par.lambda = 0.1; - aprConverter.get_apr(apr, data); - - // Print information about APR and all particles - std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; - for (int l = apr.level_min(); l <= apr.level_max(); ++l) { - std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; - } - std::cout << "APR particles z x y level:\n"; - auto it = apr.iterator(); - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + /** + * Compare + * @param expected - expected levels + * @param tested - levels to verify + * @param maxError + * @param maxNumOfErrPrinted - how many error outputs should be printed + * @return + */ + template + inline int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, int maxNumOfErrPrinted = 3) { + int cntGlobal = 0; + for (int level = 0; level < expected.size(); level++) { + int cnt = 0; + int numOfParticles = 0; + for (size_t i = 0; i < expected[level].mesh.size(); ++i) { + if (expected[level].mesh[i] < 8 && tested[level].mesh[i] <= FILLER_TYPE) { + if (std::abs(expected[level].mesh[i] - tested[level].mesh[i]) > 0 || std::isnan(expected[level].mesh[i]) || + std::isnan(tested[level].mesh[i])) { + if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { + std::cout << "Level: " << level <<" ERROR expected vs tested mesh: " << (float) expected[level].mesh[i] << " vs " + << (float) tested[level].mesh[i] << " IDX:" << tested[level].getStrIndex(i) << std::endl; + } + cnt++; } + if (expected[level].mesh[i] > 0) numOfParticles++; } } + cntGlobal += cnt; + if (cnt > 0) std::cout << "Level: " << level << ", Number of errors / all points: " << cnt << " / " << expected[level].mesh.size() << " Particles:" << numOfParticles << std::endl; + } + return cntGlobal; + } + + template + void fillPS(PullingScheme &aPS, PixelData &levels) { + auto l_max = aPS.pct_level_max(); + auto l_min = aPS.pct_level_min(); + +// std::cout << "LEVEL: " << l_max << std::endl; levels.printMeshT(3, 1); + + aPS.fill(l_max, levels); + PixelData levelsDS; + for (int l = l_max - 1; l >= l_min; l--) { + downsample(levels, levelsDS, + [](const float &x, const float &y) -> float { return std::max(x, y); }, + [](const float &x) -> float { return x; }, true); + aPS.fill(l, levelsDS); +// std::cout << "LEVEL: " << l << std::endl; levelsDS.printMeshT(3, 1); + levels.swap(levelsDS); } - std::cout << std::endl; - - // Sample input - ParticleData particleIntensities; - particleIntensities.sample_image(apr, data); - - // Reconstruct image from particles - PixelData reconstructImg; - APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); - std::cout << "----- Reconstructed image:"< levelImg; - APRReconstruction::reconstruct_level(apr, levelImg); 
- std::cout << "----- Image levels:" << std::endl; - levelImg.printMeshT(3, 1); - - // Show intensities and levels of each particle - std::cout << "----- Particle intensities:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - particleIntensities.fill_with_levels(apr); - - std::cout << "----- Particle levels:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - // Show some general information about generated APR - double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); - std::cout << std::endl; - std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; - std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; } - TEST(PullingSchemeTest, PullingScheme1D) { + // ================================================================================================================= + // ======== Pulling Scheme algorithm tests + // ================================================================================================================= + TEST(PullingSchemeTest, PullingScheme1D_Ydir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(len, 1, 1); // <-- Y-dir + initFromZYXarray(levels, values); // <-- Y-dir + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, dim.x, dim.z); + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("PS - initialize with data"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + fillPS(ps, levels); + t.stop_timer(); + + t.start_timer("PS - compute"); + ps.pulling_scheme_main(); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 1,0,0, 2}, + {3, 2,0,0, 3}, + {3, 3,0,0, 3}, - //int values[] = {4,4,1,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 }; -// int values[] = {3,2,2,2, 2,2,1,1}; -// int values[] = {3,0,0,0, 0,0,0,0}; -// int values[] = {3,0,0,0, 0,0,0,0}; -// int values[] = {4,0,0,0, 0,0,0,0, 4,0,0,0, 0,0,0,0}; - int values[] = {0,2,2,3, 4,5,6,7}; + {2, 2,0,0, 3}, + {2, 3,0,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } + + TEST(PullingSchemeTest, PullingScheme1D_Xdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; int len = sizeof(values)/sizeof(int); - PixelData levels(len ,1, 1); + PixelData levels(1, len, 1); // <-- X-dir initFromZYXarray(levels, values); - levels.printMeshT(3, 1); + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 GenInfo gi; const PixelDataDim dim = levels.getDimension(); - gi.init(dim.y * 2, dim.x, dim.z); // time two in y-direction since PS container is downsized. 
- std::cout << gi << std::endl; + gi.init(dim.y, 2 * dim.x, dim.z); // <-- X-dir - APRTimer t(true); + // Initialize all needed objects + APRTimer t(false); - t.start_timer("PS1"); + t.start_timer("PS - initialize with data"); PullingScheme ps; ps.initialize_particle_cell_tree(gi); - int l_max = gi.l_max - 1; - int l_min = gi.l_min; - std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; - ps.fill(l_max, levels); - std::cout << "LEVEL: " << l_max << std::endl; levels.printMeshT(3, 1); - PixelData levelsDS; - for(int l = l_max - 1; l >= l_min; l--){ - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, true); - ps.fill(l, levelsDS); - std::cout << "LEVEL: " << l << std::endl; levelsDS.printMeshT(3, 1); - levels.swap(levelsDS); - } - printParticleCellTree(ps.getParticleCellTree()); + fillPS(ps, levels); + t.stop_timer(); + + t.start_timer("PS - compute"); ps.pulling_scheme_main(); t.stop_timer(); - std::cout << "----------PS:\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "-------------\n"; - - LinearAccess linearAccess; - linearAccess.genInfo = &gi; - APRParameters par; - std::cout << "1\n"; - linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); - std::cout << "2\n"; - LinearIterator it(linearAccess, gi); - - std::cout << "===========================\n"; - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; - } - } - } - } - std::cout << std::endl; + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,1,0, 2}, + {3, 0,2,0, 3}, + {3, 0,3,0, 3} , + + {2, 0,2,0, 3}, + {2, 0,3,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); } - TEST(PullingSchemeTest, Simple) { + TEST(PullingSchemeTest, PullingScheme1D_Zdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, 1, len); // <-- Z-dir + initFromZYXarray(levels, values); + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 GenInfo gi; - // TODO: Investigate why OVPC fails if one of the dimension is equal to 1 - // Investigate why sub-dimension in printParticleCellTree is different in OVPC nad PS - gi.init(8, 1, 2); + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, dim.x, 2 * dim.z); // <-- Z-dir + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("PS - initialize with data"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + fillPS(ps, levels); + t.stop_timer(); + + t.start_timer("PS - compute"); + ps.pulling_scheme_main(); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,0,1, 2}, + {3, 0,0,2, 3}, + {3, 0,0,3, 3} , - std::cout << gi << std::endl; + {2, 0,0,2, 3}, + {2, 0,0,3, 3} + }; - PixelData levels = getRandInitializedMesh( - std::ceil(gi.org_dims[0]/2), - std::ceil(gi.org_dims[1]/2), - std::ceil(gi.org_dims[2]/2), - gi.l_max + 1); - PixelData levels2(levels, true); -// float values[] = {1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 
1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1}; -// initFromZYXarray(levels, values); + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } -// levels.printMeshT(3, 1); + TEST(PullingSchemeTest, PullingScheme3D_smallCube) { + // Prepare input data for PS + PixelData levels(3, 3, 3); + levels(2, 2, 2) = 3; - APRTimer t(true); + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, 2 * dim.x, 2 * dim.z); - t.start_timer("PS1"); + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("PS - initialize with data"); PullingScheme ps; ps.initialize_particle_cell_tree(gi); - int l_max = gi.l_max - 1; - int l_min = gi.l_min; - ps.fill(l_max, levels); - PixelData levelsDS; - for(int l_ = l_max - 1; l_ >= l_min; l_--){ - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, true); - ps.fill(l_,levelsDS); - levels.swap(levelsDS); - } + fillPS(ps, levels); + t.stop_timer(); + + t.start_timer("PS - compute"); ps.pulling_scheme_main(); t.stop_timer(); - t.start_timer("OVPC1"); - OVPC nps(gi, levels2); + // List of expected types + std::vector ev = { + {2, 0,0,0, 3}, + {2, 0,1,0, 3}, + {2, 0,2,0, 3}, + {2, 1,0,0, 3}, + {2, 1,1,0, 3}, + {2, 1,2,0, 3}, + {2, 2,0,0, 3}, + {2, 2,1,0, 3}, + {2, 2,2,0, 3}, + + {2, 0,0,1, 3}, + {2, 0,1,1, 3}, + {2, 0,2,1, 3}, + {2, 1,0,1, 3}, + {2, 1,1,1, 2}, + {2, 1,2,1, 2}, + {2, 2,0,1, 3}, + {2, 2,1,1, 2}, + {2, 2,2,1, 2}, + + {2, 0,0,2, 3}, + {2, 0,1,2, 3}, + {2, 0,2,2, 3}, + {2, 1,0,2, 3}, + {2, 1,1,2, 2}, + {2, 1,2,2, 2}, + {2, 2,0,2, 3}, + {2, 2,1,2, 2}, + {2, 2,2,2, 1}, + + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } + + // ================================================================================================================= + // ======== OVPC - Optimal Valid Particle Cell - alternative version of original Pulling Scheme algorithm + // ================================================================================================================= + TEST(PullingSchemeTest, OVPC_Ydir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(len, 1, 1); // <-- Y-dir + initFromZYXarray(levels, values); // <-- Y-dir + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, dim.x, dim.z); + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPC - initialize"); + OVPC ps(gi, levels); t.stop_timer(); - t.start_timer("OVPC2"); - nps.generateTree(); + t.start_timer("OVPC - compute"); + ps.generateTree(); t.stop_timer(); - std::cout << "----------OVPC:\n"; - printParticleCellTree(nps.getParticleCellTree()); - std::cout << "----------PS:\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "-------------\n"; + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 1,0,0, 2}, + {3, 2,0,0, 3}, + {3, 3,0,0, 3}, - for (int l = l_min; l <= l_max; ++l) - compareParticleCellTrees(ps.getParticleCellTree()[l], nps.getParticleCellTree()[l]); + {2, 2,0,0, 3}, + 
{2, 3,0,0, 3} + }; + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); } + TEST(PullingSchemeTest, OVPC_Xdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, len, 1); // <-- X-dir + initFromZYXarray(levels, values); - TEST(PullingSchemeTest, NEWvsOLD) { - GenInfo access; - access.l_max = 9; - access.l_min = 1; - access.org_dims[0] = std::pow(2, access.l_max); - access.org_dims[1] = std::pow(2, access.l_max); - access.org_dims[2] = std::pow(2, access.l_max); - int l = access.l_max - 1; + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, 2 * dim.x, dim.z); // <-- X-dir - PixelData levels = getRandInitializedMesh(access.org_dims[0]/2,access.org_dims[1]/2,access.org_dims[2]/2, access.l_max + 1); - PixelData levels2(levels, true); -// float values[] = {1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1}; -// initFromZYXarray(levels, values); + // Initialize all needed objects + APRTimer t(false); -// levels.printMeshT(3, 1); + t.start_timer("OVPC - initialize"); + OVPC ps(gi, levels); + t.stop_timer(); + t.start_timer("OVPC - compute"); + ps.generateTree(); + t.stop_timer(); - APRTimer t(true); + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,1,0, 2}, + {3, 0,2,0, 3}, + {3, 0,3,0, 3} , - t.start_timer("PS1"); - PullingScheme ps; - ps.initialize_particle_cell_tree(access); - int l_max = access.l_max - 1; - int l_min = access.l_min; - ps.fill(l_max, levels); - PixelData levelsDS; - for(int l_ = l_max - 1; l_ >= l_min; l_--){ - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, true); - ps.fill(l_,levelsDS); - levels.swap(levelsDS); - } - ps.pulling_scheme_main(); - t.stop_timer(); + {2, 0,2,0, 3}, + {2, 0,3,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } + + TEST(PullingSchemeTest, OVPC_Zdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, 1, len); // <-- Z-dir + initFromZYXarray(levels, values); + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, dim.x, 2 * dim.z); // <-- Z-dir + + // Initialize all needed objects + APRTimer t(false); - t.start_timer("OVPC1"); - OVPC nps(access, levels2); + t.start_timer("OVPC - initialize"); + OVPC ps(gi, levels); t.stop_timer(); - t.start_timer("OVPC2"); - nps.generateTree(); + t.start_timer("OVPC - compute"); + ps.generateTree(); t.stop_timer(); -// printParticleCellTree(nps.getParticleCellTree()); -// printParticleCellTree(ps.getParticleCellTree()); + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,0,1, 2}, + {3, 0,0,2, 3}, + {3, 0,0,3, 3} , - for (l = l_min; l <= l_max; ++l) - compareParticleCellTrees(ps.getParticleCellTree()[l], nps.getParticleCellTree()[l]); + {2, 0,0,2, 3}, + {2, 0,0,3, 3} + }; + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); } -// 
TEST(PullingSchemeTest, Init) { -// -// GenInfo access; -// access.l_max = 5; -// access.l_min = 1; -// access.org_dims[0] = 32; -// access.org_dims[1] = 1; -// access.org_dims[2] = 1; -// -// PullingScheme ps; -// ps.initialize_particle_cell_tree(access); -// std::vector> &pctree = ps.getParticleCellTree(); -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> Initialized tree:\n"; -// printParticleCellTree(pctree); -// std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; -// -// // TEST: check if zeroed and correct number of levels -// ASSERT_EQ(access.l_max, pctree.size()); // all levels [0, access.level_max - 1] -// for (int l = 0; l < pctree.size(); ++l) { -// auto &tree = pctree[l]; -// for (auto &e : tree.mesh) { -// ASSERT_EQ(0, e); -// } -// } -// -// // Generate mesh with test levels -// PixelData levels(pctree.back(), false);// = generateLevels(pctree[access.l_max - 1], access.l_max); -//// float values[] = {4, 1, 1, 1, 1, 1, 1, 2}; -// float values[] = {1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1}; -// initFromZYXarray(levels, values); -// -// -// OVPC nps(access, levels); -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS1:\n"; -// printParticleCellTree(nps.getParticleCellTree()); -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS1:\n"; -// nps.generateTree(); -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS2:\n"; -// printParticleCellTree(nps.getParticleCellTree()); -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS2:\n"; -// // Fill particle cell tree with levels -// int l_max = access.l_max - 1; -// int l_min = access.l_min; -// ps.fill(l_max, levels); -// -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> LEVELS:\n"; -// levels.printMeshT(3,0); -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> LEVELS:\n"; -// -// PixelData levelsDS; -// for(int l_ = l_max - 1; l_ >= l_min; l_--){ -// //down sample the resolution level k, using a max reduction -// downsample(levels, levelsDS, -// [](const float &x, const float &y) -> float { return std::max(x, y); }, -// [](const float &x) -> float { return x; }, true); -// levelsDS.printMeshT(3, 0); -// ps.fill(l_,levelsDS); -// levelsDS.printMeshT(3,0); -// levels.swap(levelsDS); -// } -// -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> Filled tree:\n"; -// printParticleCellTree(pctree); -// std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; -// -//// ps.fill_neighbours(l_max); -//// pctree[l_max].printMesh(3, 0); -// -// -// ps.pulling_scheme_main(); -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> MAIN tree:\n"; -// printParticleCellTree(pctree); -// std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; -// -// access.initialize_structure_from_particle_cell_tree(false, ps.getParticleCellTree()); -// std::cout << "NUM OF PARTICLES: " << access.get_total_number_particles() << std::endl; -// -// -// APRIterator apr_iterator(access); -// std::cout << "Total number of particles: " << apr_iterator.total_number_particles() << std::endl; -// -// int prev = 0; -// for (unsigned int level = apr_iterator.level_min(); level <= apr_iterator.level_max(); ++level) { -// std::cout << "Level: " << level << std::endl; -// int w = (int) (std::pow(2, 5-level) * 3); -// for (int z = 0; z < apr_iterator.spatial_index_z_max(level); ++z) { -// for (int x = 0; x < apr_iterator.spatial_index_x_max(level); ++x) { -// for (apr_iterator.set_new_lzx(level, z, x); apr_iterator.global_index() < apr_iterator.end_index; apr_iterator.set_iterator_to_particle_next_particle()) { -// for (int i = prev; i < apr_iterator.y(); ++i ) std::cout << std::setw(w) << "."; -// std::cout << std::setw(w) << 
apr_iterator.y(); -// prev = apr_iterator.y() + 1; -// } -// for (int pp = prev; pp < apr_iterator.spatial_index_y_max(level); ++pp) -// std::cout << std::setw(w) << "."; -// -// prev = 0; -// std::cout << std::endl; -// } -// std::cout << std::endl; -// } -// } -// -// } + TEST(PullingSchemeTest, OVPC_smallCube) { + // Prepare input data for PS + PixelData levels(3, 3, 3); + levels(2, 2, 2) = 3; + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, 2 * dim.x, 2 * dim.z); + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPC - initialize"); + OVPC ps(gi, levels); + t.stop_timer(); + t.start_timer("OVPC - compute"); + ps.generateTree(); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {2, 0,0,0, 3}, + {2, 0,1,0, 3}, + {2, 0,2,0, 3}, + {2, 1,0,0, 3}, + {2, 1,1,0, 3}, + {2, 1,2,0, 3}, + {2, 2,0,0, 3}, + {2, 2,1,0, 3}, + {2, 2,2,0, 3}, + + {2, 0,0,1, 3}, + {2, 0,1,1, 3}, + {2, 0,2,1, 3}, + {2, 1,0,1, 3}, + {2, 1,1,1, 2}, + {2, 1,2,1, 2}, + {2, 2,0,1, 3}, + {2, 2,1,1, 2}, + {2, 2,2,1, 2}, + + {2, 0,0,2, 3}, + {2, 0,1,2, 3}, + {2, 0,2,2, 3}, + {2, 1,0,2, 3}, + {2, 1,1,2, 2}, + {2, 1,2,2, 2}, + {2, 2,0,2, 3}, + {2, 2,1,2, 2}, + {2, 2,2,2, 1}, + + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } + + + // ================================================================================================================= + // ======== PS vs OVPC + // ================================================================================================================= + + TEST(PullingSchemeTest, PSvsOVPC) { + // Generates random levels in a 3D cube and then compares generated output levels in PS and OVPC + GenInfo gi; + gi.init(255, 257, 199); + + // Generate random levels for PS and OVPC + PixelData levels(std::ceil(gi.org_dims[0]/2.0), + std::ceil(gi.org_dims[1]/2.0), + std::ceil(gi.org_dims[2]/2.0), + 0); + // Add a few particles only - it will end up with Pulling Scheme generate particles on (almost) all + // levels - good case to compare with OVPC + const int numOfParticles = 3; + std::srand(std::time(nullptr)); + for (int i = 0; i < numOfParticles; ++i) { + levels(std::rand() % levels.y_num, std::rand() % levels.x_num, std::rand() % levels.z_num) = gi.l_max; + } + PixelData levelsOVPC(levels, true); // just copy 'levels' + APRTimer t(false); + + // Run test methods and compare results + t.start_timer("OVPC - init"); + OVPC nps(gi, levelsOVPC); + t.stop_timer(); + t.start_timer("OVPC compute"); + nps.generateTree(); + t.stop_timer(); + + + t.start_timer("PS - init"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + fillPS(ps, levels); + t.stop_timer(); + t.start_timer("PS - compute"); + ps.pulling_scheme_main(); + t.stop_timer(); + ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), nps.getParticleCellTree()), 0); + } } From 64ca641a49da879a66ce04a750388d9ca1cf1b5e Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 14 Dec 2023 09:25:08 +0100 Subject: [PATCH 36/80] Fixes for tests --- test/PullingSchemeCudaTest.cpp | 62 ++++++++++++++++++++++++++++++++++ test/PullingSchemeTest.cpp | 12 +++---- 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index 5ca6f3cc..afeb59f1 100644 --- 
a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -9,7 +9,69 @@ #include "TestTools.hpp" +/** + * Prints PCT + * @param particleCellTree + */ +template +void printParticleCellTree(const std::vector> &particleCellTree) { + for (uint64_t l = 0; l < particleCellTree.size(); ++l) { + auto &tree = particleCellTree[l]; +// std::cout << "-- level = " << l << ", " << tree << std::endl; + tree.printMeshT(3,0); + } +} +/** + * Compare + * @param expected - expected levels + * @param tested - levels to verify + * @param maxError + * @param maxNumOfErrPrinted - how many error outputs should be printed + * @return + */ +template +int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, int maxNumOfErrPrinted = 3) { + int cntGlobal = 0; + for (size_t level = 0; level < expected.size(); level++) { + int cnt = 0; + int numOfParticles = 0; + for (size_t i = 0; i < expected[level].mesh.size(); ++i) { + if (expected[level].mesh[i] < 8 && tested[level].mesh[i] <= FILLER_TYPE) { + if (std::abs(expected[level].mesh[i] - tested[level].mesh[i]) > 0 || std::isnan(expected[level].mesh[i]) || + std::isnan(tested[level].mesh[i])) { + if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { + std::cout << "Level: " << level <<" ERROR expected vs tested mesh: " << (float) expected[level].mesh[i] << " vs " + << (float) tested[level].mesh[i] << " IDX:" << tested[level].getStrIndex(i) << std::endl; + } + cnt++; + } + if (expected[level].mesh[i] > 0) numOfParticles++; + } + } + cntGlobal += cnt; + if (cnt > 0) std::cout << "Level: " << level << ", Number of errors / all points: " << cnt << " / " << expected[level].mesh.size() << " Particles:" << numOfParticles << std::endl; + } + return cntGlobal; +} +template +void fillPS(PullingScheme &aPS, PixelData &levels) { + auto l_max = aPS.pct_level_max(); + auto l_min = aPS.pct_level_min(); + +// std::cout << "LEVEL: " << l_max << std::endl; levels.printMeshT(3, 1); + + aPS.fill(l_max, levels); + PixelData levelsDS; + for (int l = l_max - 1; l >= l_min; l--) { + downsample(levels, levelsDS, + [](const float &x, const float &y) -> float { return std::max(x, y); }, + [](const float &x) -> float { return x; }, true); + aPS.fill(l, levelsDS); +// std::cout << "LEVEL: " << l << std::endl; levelsDS.printMeshT(3, 1); + levels.swap(levelsDS); + } +} TEST(PullingSchemeTest, DeleteMeAfterDevelopment) { // TODO: delete me after development diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index e1347b1c..c7c66b63 100644 --- a/test/PullingSchemeTest.cpp +++ b/test/PullingSchemeTest.cpp @@ -64,12 +64,12 @@ namespace { v = AlreadyCheckedMark; } else { - std::cout << "Error! Data at (" << r.y << "," << r.x << "," << r.z << ") expected = " << (int)r.expectedType << " got = " << (int)v << std::endl; + std::cout << "Error! 
Data on level=" << r.level << " at (" << r.y << "," << r.x << "," << r.z << ") expected=" << (int)r.expectedType << " got=" << (int)v << std::endl; return false; } } - for (int level = 0; level < aPCT.size(); level++) { + for (size_t level = 0; level < aPCT.size(); level++) { auto &d = aPCT[level]; auto y_num = d.y_num; auto x_num = d.x_num; @@ -100,9 +100,9 @@ namespace { * @return */ template - inline int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, int maxNumOfErrPrinted = 3) { + int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, int maxNumOfErrPrinted = 3) { int cntGlobal = 0; - for (int level = 0; level < expected.size(); level++) { + for (size_t level = 0; level < expected.size(); level++) { int cnt = 0; int numOfParticles = 0; for (size_t i = 0; i < expected[level].mesh.size(); ++i) { @@ -271,7 +271,7 @@ namespace { TEST(PullingSchemeTest, PullingScheme3D_smallCube) { // Prepare input data for PS - PixelData levels(3, 3, 3); + PixelData levels(3, 3, 3, 0); levels(2, 2, 2) = 3; // Prepare GenInfo structure - @@ -450,7 +450,7 @@ namespace { TEST(PullingSchemeTest, OVPC_smallCube) { // Prepare input data for PS - PixelData levels(3, 3, 3); + PixelData levels(3, 3, 3, 0); levels(2, 2, 2) = 3; // Prepare GenInfo structure - From 9f31bfda2863740ebcb331060e34e1b81abd88cb Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 9 Jan 2024 11:27:04 +0100 Subject: [PATCH 37/80] Fixed OVPC - clamping values of input levels is necessary --- src/algorithm/OVPC.cu | 53 ++++++++++--------- src/algorithm/OVPC.h | 3 +- .../APR/access/RandomAccess.hpp | 4 +- src/data_structures/Mesh/ImagePatch.hpp | 2 +- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index f568212b..9794df2b 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -1,39 +1,28 @@ #include "PullingSchemeCuda.hpp" #include -#include -//#include -#include #include "misc/CudaTools.cuh" #include "data_structures/Mesh/downsample.cuh" +#include "algorithm/OVPC.h" -namespace { - using ElementType = uint8_t; - static constexpr int BIT_SHIFT = 6; - static constexpr ElementType OVPC_SEED = 1; - static constexpr ElementType OVPC_BOUNDARY = 2; - static constexpr ElementType OVPC_FILLER = 3; - - static constexpr ElementType SEED_MASK = OVPC_SEED << BIT_SHIFT; - static constexpr ElementType BOUNDARY_MASK = OVPC_BOUNDARY << BIT_SHIFT; - static constexpr ElementType FILLER_MASK = OVPC_FILLER << BIT_SHIFT; - static constexpr ElementType MASK = 0x03 << BIT_SHIFT; -} template -__global__ void copy1D(const T *input, S *output, size_t length) { +__global__ void copyAndClampLevels(const T *input, S *output, size_t length, int levelMin, int levelMax) { size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; if (idx < length) { - output[idx] = input[idx]; + T v = input[idx]; + if (v > levelMax) v = levelMax; + if (v < levelMin) v = levelMin; + output[idx] = v; } } template -void runCopy1D(T *inputData, S *outputData, size_t lenght, cudaStream_t aStream) { +void runCopyAndClampLevels(T *inputData, S *outputData, size_t lenght, int levelMin, int levelMax, cudaStream_t aStream) { dim3 threadsPerBlock(128); dim3 numBlocks((lenght + threadsPerBlock.x - 1)/threadsPerBlock.x); - copy1D<<>>(inputData, outputData, lenght); + copyAndClampLevels<<>>(inputData, outputData, lenght, levelMin, levelMax); }; @@ -57,7 +46,7 @@ __global__ void oneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int lev for (int x = xmin; 
x <= xmax; ++x) { for (int y = ymin; y <= ymax; ++y) { const size_t idx = z * xLen * yLen + x * yLen + y; - T currentLevel = ~MASK & data[idx]; + T currentLevel = ~OVPC::MASK & data[idx]; if (currentLevel > level) { ok = false; break; } else if (currentLevel == level) neig = true; } @@ -66,9 +55,9 @@ __global__ void oneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int lev if (ok) { const size_t idx = zi * xLen * yLen + xi * yLen + yi; T status = data[idx]; - if (status == level) data[idx] |= SEED_MASK; - else if (neig) data[idx] |= BOUNDARY_MASK; - else data[idx] |= FILLER_MASK; + if (status == level) data[idx] |= OVPC::SEED; + else if (neig) data[idx] |= OVPC::BOUNDARY; + else data[idx] |= OVPC::FILLER; } } @@ -103,11 +92,11 @@ __global__ void secondPhase(T *data, T *child, size_t xLen, size_t yLen, size_t for (int x = xmin; x <= xmax; ++x) { for (int y = ymin; y <= ymax; ++y) { size_t children_index = z * xLenc * yLenc + x * yLenc + y; - child[children_index] = status >= (OVPC_SEED << BIT_SHIFT) ? 0 : child[children_index] >> BIT_SHIFT; + child[children_index] = status >= (OVPC::OVPC_SEED << OVPC::BIT_SHIFT) ? 0 : child[children_index] >> OVPC::BIT_SHIFT; } } } - if (isLevelMax) data[zi * xLen * yLen + xi * yLen + yi] = status >> BIT_SHIFT; + if (isLevelMax) data[zi * xLen * yLen + xi * yLen + yi] = status >> OVPC::BIT_SHIFT; } template @@ -124,9 +113,19 @@ template void computeOVPC(const PixelData&, PixelData&, template void computeOVPC(const PixelData &input, PixelData &output, int levelMin, int levelMax) { + + ScopedCudaMemHandler, H2D> in(input); ScopedCudaMemHandler, D2H> mem(output); + + CudaTimer t(true, "OVPCCUDA"); + + t.start_timer("wait"); + waitForCuda(); + t.stop_timer(); + + t.start_timer("ALL"); // TODO: This is not needed later - just for having clear debug //cudaMemset(mem.get(), 0, mem.getNumOfBytes()); @@ -157,7 +156,7 @@ void computeOVPC(const PixelData &input, PixelData &output, int levelMin, zDS = ceil(zDS/2.0); } - runCopy1D(in.get(), levels[levelMax], in.getSize(), 0); + runCopyAndClampLevels(in.get(), levels[levelMax], in.getSize(), levelMin, levelMax, 0); for (int l = levelMax - 1; l >= levelMin; --l) { runDownsampleMax(levels[l + 1], levels[l], xSize[l + 1], ySize[l + 1], zSize[l + 1], 0); @@ -172,4 +171,6 @@ void computeOVPC(const PixelData &input, PixelData &output, int levelMin, for (int l = levelMax - 1; l >= levelMin; --l) { runSecondPhase(levels[l], levels[l+1], xSize[l], ySize[l], zSize[l], xSize[l+1], ySize[l+1], zSize[l+1], l == levelMin, 0); } + waitForCuda(); + t.stop_timer(); }; diff --git a/src/algorithm/OVPC.h b/src/algorithm/OVPC.h index 6925f325..e0c8a67b 100644 --- a/src/algorithm/OVPC.h +++ b/src/algorithm/OVPC.h @@ -14,6 +14,8 @@ class OVPC { + +public: // Element big enouth to keep all the levels + 2 highest bits for type // for uint8_t we have [ 2 bit - type(empty, seed, boundary, filler) | 6 bit - level(0-63) ] using ElementType = uint8_t; @@ -31,7 +33,6 @@ class OVPC { int iLevelMin; std::vector> iParticleCellTree; -public: template OVPC(const GenInfo &aAprAccess, const PixelData &aInputLevels) { // Level Max is one less since we are working on downsampled version diff --git a/src/data_structures/APR/access/RandomAccess.hpp b/src/data_structures/APR/access/RandomAccess.hpp index 0daf7a54..aa8f67bc 100644 --- a/src/data_structures/APR/access/RandomAccess.hpp +++ b/src/data_structures/APR/access/RandomAccess.hpp @@ -1210,7 +1210,7 @@ inline void RandomAccess::initialize_tree_access(RandomAccess& APROwn_access, st } -void 
RandomAccess::init_data_structure_tree(RandomAccess& APROwn_access, SparseGaps>& y_begin){ +inline void RandomAccess::init_data_structure_tree(RandomAccess& APROwn_access, SparseGaps>& y_begin){ uint64_t cumsum = 0; APRTimer apr_timer(false); @@ -1423,7 +1423,7 @@ inline void RandomAccess::initialize_tree_access_sparse(RandomAccess& APROwn_acc } -void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(APRParameters& apr_parameters, SparseGaps &p_map) { +inline void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(APRParameters& apr_parameters, SparseGaps &p_map) { // // Initialize the new structure; // diff --git a/src/data_structures/Mesh/ImagePatch.hpp b/src/data_structures/Mesh/ImagePatch.hpp index a249efdd..01d27fd3 100644 --- a/src/data_structures/Mesh/ImagePatch.hpp +++ b/src/data_structures/Mesh/ImagePatch.hpp @@ -38,7 +38,7 @@ struct ImagePatch { }; -void initPatchGlobal(ImagePatch& patch, int z_begin_global, int z_end_global, int x_begin_global, int x_end_global, int y_begin_global, int y_end_global) { +inline void initPatchGlobal(ImagePatch& patch, int z_begin_global, int z_end_global, int x_begin_global, int x_end_global, int y_begin_global, int y_end_global) { patch.z_begin_global = z_begin_global; patch.x_begin_global = x_begin_global; patch.y_begin_global = y_begin_global; From 2707207b90cbeb6a11425d3b1ada1b65b8566de8 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 5 Feb 2024 16:18:14 +0100 Subject: [PATCH 38/80] Updated OVPC (PS) for CUDA - now it gives correct ans same results as PS CPU ver. --- src/algorithm/LocalParticleCellSet.hpp | 3 + src/algorithm/OVPC.cu | 78 ++++-- src/algorithm/OVPC.h | 4 +- src/algorithm/PullingScheme.hpp | 35 ++- src/algorithm/PullingSchemeCuda.hpp | 3 +- test/PullingSchemeCudaTest.cpp | 338 ++++++++++++++++++------- test/PullingSchemeTest.cpp | 23 +- 7 files changed, 339 insertions(+), 145 deletions(-) diff --git a/src/algorithm/LocalParticleCellSet.hpp b/src/algorithm/LocalParticleCellSet.hpp index 7935076b..f20e08c1 100644 --- a/src/algorithm/LocalParticleCellSet.hpp +++ b/src/algorithm/LocalParticleCellSet.hpp @@ -49,6 +49,9 @@ inline int __builtin_clz(unsigned int x) #endif +#include "algorithm/PullingScheme.hpp" +#include "algorithm/PullingSchemeSparse.hpp" + class LocalParticleCellSet { public: diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index 9794df2b..070c4d81 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -27,7 +27,7 @@ void runCopyAndClampLevels(T *inputData, S *outputData, size_t lenght, int level template -__global__ void oneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int level) { +__global__ void firstStep(T *data, size_t xLen, size_t yLen, size_t zLen, int level) { const int xi = (blockIdx.x * blockDim.x) + threadIdx.x; const int yi = (blockIdx.y * blockDim.y) + threadIdx.y; const int zi = (blockIdx.z * blockDim.z) + threadIdx.z; @@ -40,39 +40,38 @@ __global__ void oneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int lev int zmin = zi > 0 ? zi - 1 : 0; int zmax = zi < zLen - 1 ? 
zi + 1 : zLen - 1; - bool ok = true; - bool neig = false; + bool hasNeighHigherLevel = false; + bool hasNeighSameLevel = false; for (int z = zmin; z <= zmax; ++z) { for (int x = xmin; x <= xmax; ++x) { for (int y = ymin; y <= ymax; ++y) { const size_t idx = z * xLen * yLen + x * yLen + y; T currentLevel = ~OVPC::MASK & data[idx]; - if (currentLevel > level) { ok = false; break; } - else if (currentLevel == level) neig = true; + if (currentLevel > level) { hasNeighHigherLevel = true; break; } + else if (currentLevel == level) hasNeighSameLevel = true; } } } - if (ok) { + if (!hasNeighHigherLevel) { const size_t idx = zi * xLen * yLen + xi * yLen + yi; T status = data[idx]; if (status == level) data[idx] |= OVPC::SEED; - else if (neig) data[idx] |= OVPC::BOUNDARY; + else if (hasNeighSameLevel) data[idx] |= OVPC::BOUNDARY; else data[idx] |= OVPC::FILLER; } } template -void runOneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int level, cudaStream_t aStream) { +void runFirstStep(T *data, size_t xLen, size_t yLen, size_t zLen, int level, cudaStream_t aStream) { dim3 threadsPerBlock(1, 128, 1); dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x, (yLen + threadsPerBlock.y - 1) / threadsPerBlock.y, (zLen + threadsPerBlock.z - 1) / threadsPerBlock.z); -// dim3 numBlocks((xLen * yLen * zLen + threadsPerBlock.x - 1)/threadsPerBlock.x); - oneLevel<<>>(data, xLen, yLen, zLen, level); + firstStep<<>>(data, xLen, yLen, zLen, level); }; template -__global__ void secondPhase(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMax) { +__global__ void secondStep(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMin) { const int xi = (blockIdx.x * blockDim.x) + threadIdx.x; const int yi = (blockIdx.y * blockDim.y) + threadIdx.y; const int zi = (blockIdx.z * blockDim.z) + threadIdx.z; @@ -96,16 +95,16 @@ __global__ void secondPhase(T *data, T *child, size_t xLen, size_t yLen, size_t } } } - if (isLevelMax) data[zi * xLen * yLen + xi * yLen + yi] = status >> OVPC::BIT_SHIFT; + if (isLevelMin) data[zi * xLen * yLen + xi * yLen + yi] = status >> OVPC::BIT_SHIFT; } template -void runSecondPhase(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMax, cudaStream_t aStream) { +void runSecondStep(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMax, cudaStream_t aStream) { dim3 threadsPerBlock(1, 128, 1); dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x, (yLen + threadsPerBlock.y - 1) / threadsPerBlock.y, (zLen + threadsPerBlock.z - 1) / threadsPerBlock.z); - secondPhase<<>>(data, child, xLen, yLen, zLen, xLenc, yLenc, zLenc, isLevelMax); + secondStep<<>>(data, child, xLen, yLen, zLen, xLenc, yLenc, zLenc, isLevelMax); }; // explicit instantiation of handled types @@ -114,6 +113,8 @@ template void computeOVPC(const PixelData&, PixelData&, template void computeOVPC(const PixelData &input, PixelData &output, int levelMin, int levelMax) { + // TODO: Depending on implementation of computing particles (next step after OVPC) some port of this method + // might be useful. Leaving it here rigtht now just in case. If not needed in next steps DELETE IT. 
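// Note (illustration only): a minimal sketch of the status encoding that firstStep()/secondStep()
// above rely on, assuming OVPC::SEED, OVPC::BOUNDARY and OVPC::FILLER are the pre-shifted type
// masks and OVPC::MASK / OVPC::BIT_SHIFT select the two high bits (see OVPC.h, where one element
// packs [2-bit type | 6-bit level]):
//
//   OVPC::ElementType level = status & ~OVPC::MASK;                      // low 6 bits: clamped level
//   OVPC::ElementType type  = (status & OVPC::MASK) >> OVPC::BIT_SHIFT;  // 1 = seed, 2 = boundary, 3 = filler
//   status |= OVPC::SEED;                                                // firstStep(): tag a cell in place
//   status >>= OVPC::BIT_SHIFT;                                          // secondStep(): keep only the type
//
// firstStep() runs top-down and tags a cell as seed/boundary/filler unless one of its neighbours
// carries a higher level; secondStep() runs bottom-up, zeroing the children of every tagged cell
// and shifting the remaining entries down, so the finished tree holds plain type values (0-3).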
ScopedCudaMemHandler, H2D> in(input); ScopedCudaMemHandler, D2H> mem(output); @@ -156,6 +157,7 @@ void computeOVPC(const PixelData &input, PixelData &output, int levelMin, zDS = ceil(zDS/2.0); } + runCopyAndClampLevels(in.get(), levels[levelMax], in.getSize(), levelMin, levelMax, 0); for (int l = levelMax - 1; l >= levelMin; --l) { @@ -165,12 +167,54 @@ void computeOVPC(const PixelData &input, PixelData &output, int levelMin, // ================== Phase 1 - top to down for (int l = levelMin; l <= levelMax; ++l) { - runOneLevel(levels[l], xSize[l], ySize[l], zSize[l], l, 0); + runFirstStep(levels[l], xSize[l], ySize[l], zSize[l], l, 0); } // ================== Phase 1 - down to top for (int l = levelMax - 1; l >= levelMin; --l) { - runSecondPhase(levels[l], levels[l+1], xSize[l], ySize[l], zSize[l], xSize[l+1], ySize[l+1], zSize[l+1], l == levelMin, 0); + runSecondStep(levels[l], levels[l+1], xSize[l], ySize[l], zSize[l], xSize[l+1], ySize[l+1], zSize[l+1], l == levelMin, 0); } waitForCuda(); t.stop_timer(); -}; +} + +// explicit instantiation of handled types +template void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax); +template void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax); + +/** + * CUDA implementation of Pullin Scheme (OVPC - Optimal Valid Particle Cell set). + * @tparam T - type of input levels + * @tparam S - type of output Particle Cell Tree + * @param input - input levels computed in earlier stages + * @param pct - Particle Cell Tree - as input is used for dimensions of each level, will be filled with computed + * Pulling Scheme as a output + * @param levelMin - min level of APR + * @param levelMax - max level of APR + */ +template +void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax) { + // Copy input to CUDA mem and prepare CUDA representation of particle cell tree which will be filled after computing + // all steps + ScopedCudaMemHandler, H2D> in(input); + std::vector, D2H>> w; + for (int l = 0; l <= levelMax; ++l) { + w.push_back(std::move(ScopedCudaMemHandler, D2H>(pct[l]))); + } + + // feel the highes level of PCT with provided levels and clamp values to be within [levelMin, levelMax] range + runCopyAndClampLevels(in.get(), w[levelMax].get(), in.getSize(), levelMin, levelMax, 0); + + // Downsample with max reduction to levelMin to fill the rest of the tree + for (int l = levelMax - 1; l >= levelMin; --l) { + runDownsampleMax(w[l + 1].get(), w[l].get(), pct[l + 1].x_num, pct[l + 1].y_num, pct[l + 1].z_num, 0); + } + + // ================== Phase 1 - top to down + for (int l = levelMin; l <= levelMax; ++l) { + runFirstStep(w[l].get(), pct[l].x_num, pct[l].y_num, pct[l].z_num, l, 0); + } + // ================== Phase 1 - down to top + for (int l = levelMax - 1; l >= levelMin; --l) { + runSecondStep(w[l].get(), w[l+1].get(), pct[l].x_num, pct[l].y_num, pct[l].z_num, pct[l + 1].x_num, pct[l + 1].y_num, pct[l + 1].z_num, l == levelMin, 0); + } +} diff --git a/src/algorithm/OVPC.h b/src/algorithm/OVPC.h index e0c8a67b..f55bfee3 100644 --- a/src/algorithm/OVPC.h +++ b/src/algorithm/OVPC.h @@ -44,8 +44,8 @@ class OVPC { iParticleCellTree[iLevelMax].init(aInputLevels.y_num, aInputLevels.x_num, aInputLevels.z_num); fillLevel(iLevelMax, aInputLevels); - // Downsample with max reduction to levelMin to fill the rest of the tree - for(int level = iLevelMax - 1; level >= iLevelMin; --level) { + // Downsample with max reduction to levelMin to fill rest of the tree + 
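// Note (illustration only): a minimal sketch of what the max-reduction downsample in the loop
// below computes, assuming the usual 2x block reduction with ceil-sized output (border blocks
// are simply truncated):
//
//   // the parent level has dims (ceil(y/2), ceil(x/2), ceil(z/2)) of the child level
//   for (size_t z = 0; z < parent.z_num; ++z)
//     for (size_t x = 0; x < parent.x_num; ++x)
//       for (size_t y = 0; y < parent.y_num; ++y)
//         parent(y, x, z) = max of child(2y..2y+1, 2x..2x+1, 2z..2z+1);   // clipped to child dims
//
// so a high level anywhere in the input propagates to every coarser level down to iLevelMin.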
for (int level = iLevelMax - 1; level >= iLevelMin; --level) { downsample(iParticleCellTree[level + 1], iParticleCellTree[level], [](const float &x, const float &y) -> float { return std::max(x, y); }, [](const float &x) -> float { return x; }, true); diff --git a/src/algorithm/PullingScheme.hpp b/src/algorithm/PullingScheme.hpp index 58ae9ee2..c6756df6 100644 --- a/src/algorithm/PullingScheme.hpp +++ b/src/algorithm/PullingScheme.hpp @@ -51,13 +51,13 @@ for(jn = j * 2; jn < j * 2 + children_boundaries[0]; jn++) \ class PullingScheme { - double powr(uint64_t num,uint64_t pow2){ +public: + + static double powr(uint64_t num,uint64_t pow2){ //return (uint64_t) std::round(std::pow(num,pow2)); return std::round(pow(num,pow2)); } - -public: template void fill(float k, const PixelData &input); @@ -65,6 +65,7 @@ class PullingScheme { void fill_patch(float level, const PixelData &input, ImagePatch& patch); void pulling_scheme_main(); + static std::vector> generateParticleCellTree(const GenInfo &aprInfo); void initialize_particle_cell_tree(const GenInfo &aprInfo); std::vector>& getParticleCellTree() { return particle_cell_tree; } @@ -86,6 +87,25 @@ class PullingScheme { int l_max; }; + +inline std::vector> PullingScheme::generateParticleCellTree(const GenInfo &aprInfo) { + int l_max = aprInfo.l_max - 1; + int l_min = aprInfo.l_min; + + std::vector> pct; + pct.resize(l_max + 1); + + for (int l = l_min; l <= l_max; ++l) { + pct[l].initWithValue(ceil(aprInfo.org_dims[0] / PullingScheme::powr(2.0, l_max - l + 1)), + ceil(aprInfo.org_dims[1] / PullingScheme::powr(2.0, l_max - l + 1)), + ceil(aprInfo.org_dims[2] / PullingScheme::powr(2.0, l_max - l + 1)), + EMPTY); + + } + + return pct; +} + /** * Initializes particle_cell_tree up to level (max - 1) */ @@ -93,14 +113,7 @@ inline void PullingScheme::initialize_particle_cell_tree(const GenInfo &aprInfo) l_max = aprInfo.l_max - 1; l_min = aprInfo.l_min; - particle_cell_tree.resize(l_max + 1); - - for (int l = l_min; l <= l_max; ++l) { - particle_cell_tree[l].initWithValue(ceil(aprInfo.org_dims[0] / powr(2.0, l_max - l + 1)), - ceil(aprInfo.org_dims[1] / powr(2.0, l_max - l + 1)), - ceil(aprInfo.org_dims[2] / powr(2.0, l_max - l + 1)), - EMPTY); - } + particle_cell_tree = generateParticleCellTree(aprInfo); } /** diff --git a/src/algorithm/PullingSchemeCuda.hpp b/src/algorithm/PullingSchemeCuda.hpp index 79a23560..f98c0883 100644 --- a/src/algorithm/PullingSchemeCuda.hpp +++ b/src/algorithm/PullingSchemeCuda.hpp @@ -12,6 +12,7 @@ using TreeElementType = uint8_t; template void computeOVPC(const PixelData &input, PixelData &output, int levelMin, int levelMax); - +template +void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax); #endif //LIBAPR_PULLINGSCHEMECUDA_HPP diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index afeb59f1..20eae2e1 100644 --- a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -6,9 +6,72 @@ #include "algorithm/PullingSchemeCuda.hpp" #include "algorithm/ComputeGradientCuda.hpp" #include "algorithm/APRConverter.hpp" +#include "algorithm/LocalParticleCellSet.hpp" #include "TestTools.hpp" + +// Class for storing expected values for one element of Particle Cell Tree (output of Pulling Scheme) +class LevelData { +public: + int level; + int y; + int x; + int z; + uint8_t expectedType; // seed, boundary, filler... 
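// Note: an entry like {3, 0,1,0, 2} reads "on level 3 the cell at (y=0, x=1, z=0) is expected to
// be of type 2 (boundary)"; verifyParticleCellTree() below checks every listed entry and then
// requires that no other cell in the tree still carries a type value in the range 1..FILLER_TYPE.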
+}; + +/** + * Verify computed Particle Cell Tree (PCT) vs expected values + * Expected values should list all data for types=1,2,3 (seed, boundary filler) which are used to generate particles: + * {levels, y,x,z(position), type} + * All other values are ignored (and used by Pulling Scheme (PS) only for intermediate calculations) + * @param aPCT - PCT produces by PS (note: values in PCT will be changed during verification!) + * @param expectedValues expected values + * @return true if correct, false otherwise + */ +template +bool verifyParticleCellTree(std::vector> &aPCT, const std::vector &expectedValues) { + + const uint8_t AlreadyCheckedMark = 255; + const uint8_t MaxValueOfImportantType = FILLER_TYPE; // All types above are used by PS during computation phase only + + for (const auto &r : expectedValues) { + // std::cout << r.level << " " << r.y << "," << r.x << "," << r.z << " " << (int)r.expectedType << std::endl; + + auto &v = aPCT[r.level](r.y, r.x, r.z); + // Add dim. checks for accessing pct + if (v == r.expectedType) { + v = AlreadyCheckedMark; + } + else { + std::cout << "Error! Data on level=" << r.level << " at (" << r.y << "," << r.x << "," << r.z << ") expected=" << (int)r.expectedType << " got=" << (int)v << std::endl; + return false; + } + } + + for (size_t level = 0; level < aPCT.size(); level++) { + auto &d = aPCT[level]; + auto y_num = d.y_num; + auto x_num = d.x_num; + auto z_num = d.z_num; + + for (int j = 0; j < z_num; j++) { + for (int i = 0; i < x_num; i++) { + for (int k = 0; k < y_num; k++) { + const auto &v = d(k, i, j); + if (v != AlreadyCheckedMark && v <= MaxValueOfImportantType && v > 0) { + std::cout << "Error! Data on level = " << level << " at (" << k << "," << i << "," << j << ") with value = " << (int)v << " not verified or bad!" << std::endl; + return false; + } + } + } + } + } + + return true; +} + /** * Prints PCT * @param particleCellTree @@ -56,24 +119,11 @@ int compareParticleCellTrees(const std::vector> &expected, const st template void fillPS(PullingScheme &aPS, PixelData &levels) { - auto l_max = aPS.pct_level_max(); - auto l_min = aPS.pct_level_min(); - -// std::cout << "LEVEL: " << l_max << std::endl; levels.printMeshT(3, 1); - - aPS.fill(l_max, levels); - PixelData levelsDS; - for (int l = l_max - 1; l >= l_min; l--) { - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, true); - aPS.fill(l, levelsDS); -// std::cout << "LEVEL: " << l << std::endl; levelsDS.printMeshT(3, 1); - levels.swap(levelsDS); - } + PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); + LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); } -TEST(PullingSchemeTest, DeleteMeAfterDevelopment) { +TEST(PullingSchemeTest, DeleteMeAfterDevelopment_fullAprPipeline) { // TODO: delete me after development // Full 'get apr' pipeline to test imp. on different stages // Useful during debugging and can be removed once finished @@ -154,12 +204,14 @@ TEST(PullingSchemeTest, DeleteMeAfterDevelopment) { -TEST(PullingSchemeTest, PullingScheme1D) { - +TEST(PullingSchemeTest, DeleteMeAfterDevelopment_PS) { + // TODO: delete me after development + // Runs PS to test imp. 
on different stages + // Useful during debugging and can be removed once finished int values[] = {0,0,0,5, 0,0,0,0}; int len = sizeof(values)/sizeof(int); - PixelData levels(3,3,3, 0); + PixelData levels(3,3,3, 0); levels(2,2,2) = 11; // initFromZYXarray(levels, values); @@ -171,7 +223,7 @@ TEST(PullingSchemeTest, PullingScheme1D) { gi.init(dim.y * 2, dim.x * 2, dim.z * 2); // time two in y-direction since PS container is downsized. std::cout << gi << std::endl; - APRTimer t(true); + APRTimer t(false); t.start_timer("PS1"); PullingScheme ps; @@ -214,83 +266,177 @@ TEST(PullingSchemeTest, PullingScheme1D) { std::cout << std::endl; } -// TEST(PullingSchemeCudaTest, computeLevels) { -// using ImgType = float; -// const int maxLevel = 3; -// const float relError = 0.1; -// -// PixelData grad = getRandInitializedMesh(10, 20, 33); -// PixelData localIntensityScaleCpu = getRandInitializedMesh(10, 20, 33); -// -// PixelData localIntensityScaleGpu(localIntensityScaleCpu, true); -// PixelData elo(localIntensityScaleCpu, true); -// APRTimer timer(true); -// -// timer.start_timer("CPU Levels"); -// APRConverter().computeLevels(grad, localIntensityScaleCpu, maxLevel, relError); -// timer.stop_timer(); -// -// timer.start_timer("GPU Levels"); -// computeLevelsCuda(grad, localIntensityScaleGpu, maxLevel, relError); -// timer.stop_timer(); -// -// EXPECT_EQ(compareMeshes(localIntensityScaleCpu, localIntensityScaleGpu), 0); -// } - - - -TEST(PullingSchemeCudaTest, DS) { - GenInfo access; - access.l_max = 11; - access.l_min = 1; - access.org_dims[0] = std::pow(2, access.l_max)/2; - access.org_dims[1] = std::pow(2, access.l_max)/2; - access.org_dims[2] = std::pow(2, access.l_max); - - - PixelData levels = getRandInitializedMesh(access.org_dims[0]/2,access.org_dims[1]/2,access.org_dims[2]/2, access.l_max + 1); - PixelData levels2(levels, true); - - // PixelData levels(16,1,1); - // float values[] = {4, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 2}; - // initFromZYXarray(levels, values); - - APRTimer t(true); - if (false) { - t.start_timer("PS1"); - PullingScheme ps; - ps.initialize_particle_cell_tree(access); - int l_max = access.l_max - 1; - int l_min = access.l_min; - ps.fill(l_max, levels2); - PixelData levelsDS; - for (int l_ = l_max - 1; l_ >= l_min; l_--) { - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, true); - ps.fill(l_, levelsDS); - levels2.swap(levelsDS); - } - t.stop_timer(); - } - { - t.start_timer("CUDA"); - int levelMax = access.l_max - 1; - int levelMin = access.l_min; - PixelData ds(levels.y_num, levels.x_num, levels.z_num * (levelMax - levelMin + 1), 0); - std::cout << levels << std::endl; - // std::cout << ds << std::endl; - computeOVPC(levels, ds, levelMin, levelMax); - // ds.printMeshT(3,1); - t.stop_timer(); - } - { - t.start_timer("OVPC1"); - OVPC nps(access, levels); - nps.generateTree(); - t.stop_timer(); - // printParticleCellTree(nps.getParticleCellTree()); + +TEST(PullingSchemeTest, PSvsOVPCCUDA) { + // Generates random levels in a 3D cube and then compares generated output levels in PS and OVPC + GenInfo gi; + gi.init(255, 257, 199); + + // Generate random levels for PS and OVPC + PixelData levels(std::ceil(gi.org_dims[0]/2.0), + std::ceil(gi.org_dims[1]/2.0), + std::ceil(gi.org_dims[2]/2.0), + 0); + // Add a few particles only - it will end up with Pulling Scheme generate particles on (almost) all + // levels - good case to compare with OVPC + const int numOfParticles = 3; + 
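// Note: the generator is seeded with the current time below, so every run of this test places the
// few high-level particles at different random positions.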
std::srand(std::time(nullptr)); + for (int i = 0; i < numOfParticles; ++i) { + levels(std::rand() % levels.y_num, std::rand() % levels.x_num, std::rand() % levels.z_num) = gi.l_max; } + PixelData levelsOVPC(levels, true); // just copy 'levels' + PixelData levelsPS(levels, true); + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("PS - init"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + fillPS(ps, levelsPS); + t.stop_timer(); + t.start_timer("PS - compute"); + ps.pulling_scheme_main(); + t.stop_timer(); + + // Run test methods and compare results + t.start_timer("OVPCCUDA - init"); + int levelMax = gi.l_max - 1; + int levelMin = gi.l_min; + std::vector> pct = PullingScheme::generateParticleCellTree(gi); + t.stop_timer(); + t.start_timer("OVPCCUDA - compute"); + computeOvpcCuda(levelsOVPC, pct, levelMin, levelMax); + t.stop_timer(); + + // -------------- Verify result + ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), pct), 0); +} + +TEST(PullingSchemeTest, OVPCCUDA_Ydir) { + // Prepare input data for PS + float values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(len, 1, 1); // <-- Y-dir + initFromZYXarray(levels, values); // <-- Y-dir + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, dim.x, dim.z); // <-- Y-dir + + int levelMax = gi.l_max - 1; + int levelMin = gi.l_min; + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPCCUDA - initialize"); + std::vector> pct = PullingScheme::generateParticleCellTree(gi); + t.stop_timer(); + + t.start_timer("OVPCCUDA - compute"); + computeOvpcCuda(levels, pct, levelMin, levelMax); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 1,0,0, 2}, + {3, 2,0,0, 3}, + {3, 3,0,0, 3}, + + {2, 2,0,0, 3}, + {2, 3,0,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(pct, ev)); +} + +TEST(PullingSchemeTest, OVPCCUDA_Xdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, len, 1); // <-- X-dir + initFromZYXarray(levels, values); + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, 2 * dim.x, dim.z); // <-- X-dir + + int levelMax = gi.l_max - 1; + int levelMin = gi.l_min; + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPCCUDA - initialize"); + std::vector> pct = PullingScheme::generateParticleCellTree(gi); + t.stop_timer(); + + t.start_timer("OVPCCUDA - compute"); + computeOvpcCuda(levels, pct, levelMin, levelMax); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,1,0, 2}, + {3, 0,2,0, 3}, + {3, 0,3,0, 3} , + + {2, 0,2,0, 3}, + {2, 0,3,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(pct, ev)); +} + +TEST(PullingSchemeTest, OVPCCUDA_Zdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, 1, len); // <-- Z-dir + initFromZYXarray(levels, values); + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing 
image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, dim.x, 2 * dim.z); // <-- Z-dir + + int levelMax = gi.l_max - 1; + int levelMin = gi.l_min; + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPCCUDA - initialize"); + std::vector> pct = PullingScheme::generateParticleCellTree(gi); + t.stop_timer(); + + t.start_timer("OVPCCUDA - compute"); + computeOvpcCuda(levels, pct, levelMin, levelMax); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,0,1, 2}, + {3, 0,0,2, 3}, + {3, 0,0,3, 3} , + + {2, 0,0,2, 3}, + {2, 0,0,3, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(pct, ev)); } int main(int argc, char **argv) { diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index c7c66b63..be922d90 100644 --- a/test/PullingSchemeTest.cpp +++ b/test/PullingSchemeTest.cpp @@ -8,7 +8,7 @@ #include "algorithm/PullingScheme.hpp" #include "algorithm/OVPC.h" #include "TestTools.hpp" - +#include "algorithm/LocalParticleCellSet.hpp" namespace { @@ -126,21 +126,8 @@ namespace { template void fillPS(PullingScheme &aPS, PixelData &levels) { - auto l_max = aPS.pct_level_max(); - auto l_min = aPS.pct_level_min(); - -// std::cout << "LEVEL: " << l_max << std::endl; levels.printMeshT(3, 1); - - aPS.fill(l_max, levels); - PixelData levelsDS; - for (int l = l_max - 1; l >= l_min; l--) { - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, true); - aPS.fill(l, levelsDS); -// std::cout << "LEVEL: " << l << std::endl; levelsDS.printMeshT(3, 1); - levels.swap(levelsDS); - } + PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); + LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); } // ================================================================================================================= @@ -157,7 +144,7 @@ namespace { // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 GenInfo gi; const PixelDataDim dim = levels.getDimension(); - gi.init(2 * dim.y, dim.x, dim.z); + gi.init(2 * dim.y, dim.x, dim.z); // <-- Y-dir // Initialize all needed objects APRTimer t(false); @@ -345,7 +332,7 @@ namespace { // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 GenInfo gi; const PixelDataDim dim = levels.getDimension(); - gi.init(2 * dim.y, dim.x, dim.z); + gi.init(2 * dim.y, dim.x, dim.z); // <-- Y-dir // Initialize all needed objects APRTimer t(false); From 3cb4529d73a015d8f135b7fae5973c9aac15363c Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 16 Feb 2024 16:23:48 +0100 Subject: [PATCH 39/80] PullingSchemeCudaTest finished, added init file for LinearAcccess test --- src/algorithm/LocalParticleCellSet.hpp | 1 + .../APR/access/LinearAccess.hpp | 23 +-- test/CMakeLists.txt | 1 + test/LinearAccessTest.cpp | 185 ++++++++++++++++++ test/PullingSchemeCudaTest.cpp | 148 +------------- 5 files changed, 197 insertions(+), 161 deletions(-) create mode 100644 test/LinearAccessTest.cpp diff --git a/src/algorithm/LocalParticleCellSet.hpp b/src/algorithm/LocalParticleCellSet.hpp index f20e08c1..f834805a 100644 --- a/src/algorithm/LocalParticleCellSet.hpp +++ b/src/algorithm/LocalParticleCellSet.hpp @@ -51,6 +51,7 @@ 
inline int __builtin_clz(unsigned int x) #include "algorithm/PullingScheme.hpp" #include "algorithm/PullingSchemeSparse.hpp" +#include "io/TiffUtils.hpp" class LocalParticleCellSet { diff --git a/src/data_structures/APR/access/LinearAccess.hpp b/src/data_structures/APR/access/LinearAccess.hpp index b170fd2c..b00a02b0 100644 --- a/src/data_structures/APR/access/LinearAccess.hpp +++ b/src/data_structures/APR/access/LinearAccess.hpp @@ -226,40 +226,31 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet initialize_xz_linear(); //edge case - // TODO: Don't know why we need that edge case but it would be good if it run properly - // For example 'genInfo->total_number_particles' is not set, maybe other values are not set either but - // it need to be investigated or this edge case removed (?) - if level_max() <= 2 then there are no many particles - // anyway so any code should be fast enough... if(level_max()<=2){ // For performance reasons and clarity of the code, it doesn't make sense here to handle these cases. Below assumes there is atleast levels <=2; //just initialize full resolution const auto level_start = level_xz_vec[level_max()]; - uint64_t counter = 0; + uint64_t particleCounter = 0; for (int z = 0; z < z_num(level_max()); ++z) { for (int x = 0; x < x_num(level_max()); ++x) { const size_t offset_pc_data = z * x_num(level_max()) + x; - for (int y = 0; y < y_num(level_max()); ++y) { - - counter++; - } - xz_end_vec[level_start + offset_pc_data] = counter; + particleCounter += y_num(level_max()); + xz_end_vec[level_start + offset_pc_data] = particleCounter; } } - y_vec.resize(counter); - counter = 0; + genInfo->total_number_particles = xz_end_vec.back(); + y_vec.resize(genInfo->total_number_particles); + size_t idx = 0; for (int z = 0; z < z_num(level_max()); ++z) { for (int x = 0; x < x_num(level_max()); ++x) { - for (int y = 0; y < y_num(level_max()); ++y) { - y_vec[counter] = y; - counter++; + y_vec[idx++] = y; } } } - return; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6ac7e381..8df65d9d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -11,6 +11,7 @@ buildTarget(testComputeGradient ComputeGradientTest.cpp) buildTarget(testLocalIntensityScale LocalIntensityScaleTest.cpp) buildTarget(testPullingScheme PullingSchemeTest.cpp) buildTarget(testAPRParameters APRParametersTest.cpp) +buildTarget(testLinearAccess LinearAccessTest.cpp) #APR GPU Tests if(APR_USE_CUDA) diff --git a/test/LinearAccessTest.cpp b/test/LinearAccessTest.cpp new file mode 100644 index 00000000..766307b9 --- /dev/null +++ b/test/LinearAccessTest.cpp @@ -0,0 +1,185 @@ +#include + +#include "algorithm/PullingScheme.hpp" +#include "algorithm/LocalParticleCellSet.hpp" +#include "algorithm/APRConverter.hpp" + +#include "TestTools.hpp" + +template +void fillPS(PullingScheme &aPS, PixelData &levels) { + PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); + LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); +} + +/** + * Prints PCT + * @param particleCellTree + */ +template +void printParticleCellTree(const std::vector> &particleCellTree) { + for (uint64_t l = 0; l < particleCellTree.size(); ++l) { + auto &tree = particleCellTree[l]; +// std::cout << "-- level = " << l << ", " << tree << std::endl; + tree.printMeshT(3,0); + } +} + +TEST(PullingSchemeTest, DeleteMeAfterDevelopment_fullAprPipeline) { + // TODO: delete me after development + // Full 'get apr' pipeline to test imp. 
on different stages + // Useful during debugging and can be removed once finished + + // Prepare input data (image) + int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; + // PS input values = 5 0 0 0 0 0 0 0 + +// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; +// PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0}; + + int len = sizeof(values)/sizeof(int); + PixelData data(len, 1, 1); + initFromZYXarray(data, values); + std::cout << "----- Input image:\n"; + data.printMeshT(3, 1); + + // Produce APR + APR apr; + APRConverter aprConverter; + aprConverter.par.rel_error = 0.1; + aprConverter.par.lambda = 0.1; + aprConverter.par.sigma_th = 0.0001; + aprConverter.par.neighborhood_optimization = true; + aprConverter.get_apr(apr, data); + + // Print information about APR and all particles + std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; + for (int l = apr.level_min(); l <= apr.level_max(); ++l) { + std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; + } + std::cout << "APR particles z x y level:\n"; + auto it = apr.iterator(); + for (int level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + } + } + } + } + std::cout << std::endl; + + // Sample input + ParticleData particleIntensities; + particleIntensities.sample_image(apr, data); + + // Reconstruct image from particles + PixelData reconstructImg; + APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); + std::cout << "----- Reconstructed image:"< levelImg; + APRReconstruction::reconstruct_level(apr, levelImg); + std::cout << "----- Image levels:" << std::endl; + levelImg.printMeshT(3, 1); + + // Show intensities and levels of each particle + std::cout << "----- Particle intensities:\n"; + for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; + std::cout << std::endl; + + particleIntensities.fill_with_levels(apr); + + std::cout << "----- Particle levels:\n"; + for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; + std::cout << std::endl; + + // Show some general information about generated APR + double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); + std::cout << std::endl; + std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; + std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; +} + + + +TEST(PullingSchemeTest, DeleteMeAfterDevelopment_PS) { + // TODO: delete me after development + // Runs PS to test imp. 
on different stages + // Useful during debugging and can be removed once finished +// int values[] = {0,0,0,5, 0,0,0,0}; +// int len = sizeof(values)/sizeof(int); + + PixelData levels(2, 2, 2, 0); + levels(0,0,0) = 4; + +// initFromZYXarray(levels, values); + std::cout << "---------------\n"; + levels.printMeshT(3, 1); + std::cout << "---------------\n"; + + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + std::cout << "Levels dim: " << dim << std::endl; + gi.init(dim.y * 2, dim.x * 2, dim.z * 2); // time two in y-direction since PS container is downsized. + std::cout << gi << std::endl; + + APRTimer t(false); + + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + int l_max = gi.l_max - 1; + int l_min = gi.l_min; + std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; + + fillPS(ps, levels); + + std::cout << "---------- Filled PS tree\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "---------------\n"; + + ps.pulling_scheme_main(); + t.stop_timer(); + + std::cout << "----------PS:\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "-------------\n"; + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + + std::cout << gi << std::endl; + auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << " "; std::cout << std::endl; }; + prt(linearAccess.y_vec); + prt(linearAccess.xz_end_vec); + prt(linearAccess.level_xz_vec); + + LinearIterator it(linearAccess, gi); + for (int l = 0; l <= 3; l++) { + std::cout << it.particles_level_begin(l) << " " << it.particles_level_end(l) << std::endl; + } + std::cout << "NumOfParticles: " << gi.total_number_particles << std::endl; + + std::cout << "===========================\n"; + for (int level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + } + } + } + } + std::cout << std::endl; +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index 20eae2e1..53eec162 100644 --- a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -5,12 +5,10 @@ #include "algorithm/PullingSchemeCuda.hpp" #include "algorithm/ComputeGradientCuda.hpp" -#include "algorithm/APRConverter.hpp" #include "algorithm/LocalParticleCellSet.hpp" #include "TestTools.hpp" - // Class for storing expected values for one element of Particle Cell Tree (output of Pulling Scheme) class LevelData { public: @@ -84,6 +82,7 @@ void printParticleCellTree(const std::vector> &particleCellTree) { tree.printMeshT(3,0); } } + /** * Compare * @param expected - expected levels @@ -119,153 +118,12 @@ int compareParticleCellTrees(const std::vector> &expected, const st template void fillPS(PullingScheme &aPS, PixelData &levels) { - PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); + PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); LocalParticleCellSet().get_local_particle_cell_set(aPS, 
levels, levelsDS, APRParameters()); } -TEST(PullingSchemeTest, DeleteMeAfterDevelopment_fullAprPipeline) { - // TODO: delete me after development - // Full 'get apr' pipeline to test imp. on different stages - // Useful during debugging and can be removed once finished - - // Prepare input data (image) - int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; - // PS input values = 5 0 0 0 0 0 0 0 - -// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; -// PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0}; - - int len = sizeof(values)/sizeof(int); - PixelData data(len, 1, 1); - initFromZYXarray(data, values); - std::cout << "----- Input image:\n"; - data.printMeshT(3, 1); - - // Produce APR - APR apr; - APRConverter aprConverter; - aprConverter.par.rel_error = 0.1; - aprConverter.par.lambda = 0.1; - aprConverter.par.sigma_th = 0.0001; - aprConverter.par.neighborhood_optimization = true; - aprConverter.get_apr(apr, data); - - // Print information about APR and all particles - std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; - for (int l = apr.level_min(); l <= apr.level_max(); ++l) { - std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; - } - std::cout << "APR particles z x y level:\n"; - auto it = apr.iterator(); - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; - } - } - } - } - std::cout << std::endl; - - // Sample input - ParticleData particleIntensities; - particleIntensities.sample_image(apr, data); - - // Reconstruct image from particles - PixelData reconstructImg; - APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); - std::cout << "----- Reconstructed image:"< levelImg; - APRReconstruction::reconstruct_level(apr, levelImg); - std::cout << "----- Image levels:" << std::endl; - levelImg.printMeshT(3, 1); - - // Show intensities and levels of each particle - std::cout << "----- Particle intensities:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - particleIntensities.fill_with_levels(apr); - - std::cout << "----- Particle levels:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - // Show some general information about generated APR - double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); - std::cout << std::endl; - std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; - std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; -} - - - -TEST(PullingSchemeTest, DeleteMeAfterDevelopment_PS) { - // TODO: delete me after development - // Runs PS to test imp. 
on different stages - // Useful during debugging and can be removed once finished - int values[] = {0,0,0,5, 0,0,0,0}; - int len = sizeof(values)/sizeof(int); - - PixelData levels(3,3,3, 0); - levels(2,2,2) = 11; - -// initFromZYXarray(levels, values); - levels.printMeshT(3, 1); - - GenInfo gi; - const PixelDataDim dim = levels.getDimension(); - std::cout << "Levels dim: " << dim << std::endl; - gi.init(dim.y * 2, dim.x * 2, dim.z * 2); // time two in y-direction since PS container is downsized. - std::cout << gi << std::endl; - - APRTimer t(false); - - t.start_timer("PS1"); - PullingScheme ps; - ps.initialize_particle_cell_tree(gi); - int l_max = gi.l_max - 1; - int l_min = gi.l_min; - std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; - - fillPS(ps, levels); - - std::cout << "---------- Filled PS tree\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "---------------\n"; - - ps.pulling_scheme_main(); - t.stop_timer(); - - std::cout << "----------PS:\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "-------------\n"; - - LinearAccess linearAccess; - linearAccess.genInfo = &gi; - APRParameters par; - std::cout << "1\n"; - linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); - std::cout << "2\n"; - LinearIterator it(linearAccess, gi); - - std::cout << "===========================\n"; - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; - } - } - } - } - std::cout << std::endl; -} +// ------------------------------------------------------------------------------------------------------------------------------------------- TEST(PullingSchemeTest, PSvsOVPCCUDA) { // Generates random levels in a 3D cube and then compares generated output levels in PS and OVPC From 027e52aaac7068a0457a46d6f3c665d2136efe91 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 21 Feb 2024 18:29:49 +0100 Subject: [PATCH 40/80] Finished LinearAccess tests (for linear structure only), added draft for LinearAccessCuda --- test/CMakeLists.txt | 1 + test/LinearAccessCudaTest.cpp | 182 ++++++++++++++++++ test/LinearAccessTest.cpp | 339 +++++++++++++++++++--------------- test/TestTools.hpp | 6 +- 4 files changed, 381 insertions(+), 147 deletions(-) create mode 100644 test/LinearAccessCudaTest.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8df65d9d..d3377fb0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -20,6 +20,7 @@ if(APR_USE_CUDA) buildTarget(testLocalIntensityScaleCuda LocalIntensityScaleCudaTest.cpp) buildTarget(testFullPipelineCuda FullPipelineCudaTest.cpp) buildTarget(testPullingSchemeCuda PullingSchemeCudaTest.cpp) + buildTarget(testLinearAccessCuda LinearAccessCudaTest.cpp) endif() if(APR_BUILD_EXAMPLES) diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp new file mode 100644 index 00000000..bae93233 --- /dev/null +++ b/test/LinearAccessCudaTest.cpp @@ -0,0 +1,182 @@ +#include + +#include "algorithm/LocalParticleCellSet.hpp" +#include "algorithm/PullingScheme.hpp" +#include "algorithm/APRConverter.hpp" + +#include "TestTools.hpp" + + +template +void fillPS(PullingScheme &aPS, PixelData &levels) { + PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), 
ceil(levels.z_num/2.0));
+    LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters());
+}
+
+/**
+ * Prints PCT
+ * @param particleCellTree
+ */
+template
+void printParticleCellTree(const std::vector> &particleCellTree) {
+    for (uint64_t l = 0; l < particleCellTree.size(); ++l) {
+        auto &tree = particleCellTree[l];
+//        std::cout << "-- level = " << l << ", " << tree << std::endl;
+        tree.printMeshT(3,0);
+    }
+}
+
+TEST(LinearAccessTest, DeleteMeAfterDevelopment_fullAprPipeline) {
+    // TODO: delete me after development
+    // Full 'get apr' pipeline to test implementation on different stages
+    // Useful during debugging and can be removed once finished
+
+    // Prepare input data (image)
+    int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
+    // PS input values = 5 0 0 0 0 0 0 0
+
+//    int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, };
+//    PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0};
+
+    int len = sizeof(values)/sizeof(int);
+    PixelData data(len, 1, 1);
+    initFromZYXarray(data, values);
+    std::cout << "----- Input image:\n";
+    data.printMeshT(3, 1);
+
+    // Produce APR
+    APR apr;
+    APRConverter aprConverter;
+    aprConverter.par.rel_error = 0.1;
+    aprConverter.par.lambda = 0.1;
+    aprConverter.par.sigma_th = 0.0001;
+    aprConverter.par.neighborhood_optimization = true;
+    aprConverter.get_apr(apr, data);
+
+    // Print information about APR and all particles
+    std::cout << "APR level max/min: " << apr.level_max() << "/" << apr.level_min() << std::endl;
+    for (int l = apr.level_min(); l <= apr.level_max(); ++l) {
+        std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl;
+    }
+    std::cout << "APR particles z x y level:\n";
+    auto it = apr.iterator();
+    for (int level = it.level_min(); level <= it.level_max(); ++level) {
+        for (int z = 0; z < it.z_num(level); z++) {
+            for (int x = 0; x < it.x_num(level); ++x) {
+                for (it.begin(level, z, x); it < it.end(); it++) {
+                    std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl;
+                }
+            }
+        }
+    }
+    std::cout << std::endl;
+
+    // Sample input
+    ParticleData particleIntensities;
+    particleIntensities.sample_image(apr, data);
+
+    // Reconstruct image from particles
+    PixelData reconstructImg;
+    APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities);
+    std::cout << "----- Reconstructed image:" << std::endl;
+    reconstructImg.printMeshT(3, 1);
+
+    // Reconstruct image of particle levels
+    PixelData levelImg;
+    APRReconstruction::reconstruct_level(apr, levelImg);
+    std::cout << "----- Image levels:" << std::endl;
+    levelImg.printMeshT(3, 1);
+
+    // Show intensities and levels of each particle
+    std::cout << "----- Particle intensities:\n";
+    for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " ";
+    std::cout << std::endl;
+
+    particleIntensities.fill_with_levels(apr);
+
+    std::cout << "----- Particle levels:\n";
+    for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " ";
+    std::cout << std::endl;
+
+    // Show some general information about generated APR
+    double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles());
+    std::cout << std::endl;
+    std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl;
+    std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl;
+}
+
+
+
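+// Sketch only (editorial suggestion, not part of the original test): if this debug test is kept,
+// the printed quantities could also be asserted rather than only printed. For example, sampling
+// must produce exactly one intensity per particle, so the following check (using the names from
+// the test above) could be added before the summary output:
+//
+//     ASSERT_EQ(apr.total_number_particles(), particleIntensities.size());
+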
+TEST(LinearAccessTest, DeleteMeAfterDevelopment_PS) { + // TODO: delete me after development + // Runs PS to test imp. on different stages + // Useful during debugging and can be removed once finished +// int values[] = {0,0,0,5, 0,0,0,0}; +// int len = sizeof(values)/sizeof(int); + + PixelData levels(3, 4,4, 0); + levels(0,0,0) = 4; + +// initFromZYXarray(levels, values); + std::cout << "---------------\n"; + levels.printMeshT(3, 1); + std::cout << "---------------\n"; + + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + std::cout << "Levels dim: " << dim << std::endl; + gi.init(dim.y * 2, dim.x * 1, dim.z * 1); // time two in y-direction since PS container is downsized. + std::cout << gi << std::endl; + + APRTimer t(false); + + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + int l_max = gi.l_max - 1; + int l_min = gi.l_min; + std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; + + fillPS(ps, levels); + + std::cout << "---------- Filled PS tree\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "---------------\n"; + + ps.pulling_scheme_main(); + t.stop_timer(); + + std::cout << "----------PS:\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "-------------\n"; + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = false; + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + + std::cout << gi << std::endl; + auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; }; + prt(linearAccess.y_vec); + prt(linearAccess.xz_end_vec); + prt(linearAccess.level_xz_vec); + + LinearIterator it(linearAccess, gi); + for (int l = 0; l <= 3; l++) { + std::cout << it.particles_level_begin(l) << " " << it.particles_level_end(l) << std::endl; + } + std::cout << "NumOfParticles: " << gi.total_number_particles << std::endl; + + std::cout << "===========================\n"; + for (int level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + } + } + } + } + std::cout << std::endl; +} \ No newline at end of file diff --git a/test/LinearAccessTest.cpp b/test/LinearAccessTest.cpp index 766307b9..00ef84be 100644 --- a/test/LinearAccessTest.cpp +++ b/test/LinearAccessTest.cpp @@ -2,181 +2,230 @@ #include "algorithm/PullingScheme.hpp" #include "algorithm/LocalParticleCellSet.hpp" -#include "algorithm/APRConverter.hpp" #include "TestTools.hpp" -template -void fillPS(PullingScheme &aPS, PixelData &levels) { - PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); - LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); -} /** - * Prints PCT - * @param particleCellTree + * Create PCT with provided data + * @param aprInfo + * @param levels complete list of values from level min to level max in form { {level, min, values}, ..., {level, max, values} } + * @return Particle Cell Tree with values */ -template -void printParticleCellTree(const std::vector> &particleCellTree) { - for (uint64_t l = 0; l < particleCellTree.size(); ++l) { - auto &tree = particleCellTree[l]; 
-// std::cout << "-- level = " << l << ", " << tree << std::endl; - tree.printMeshT(3,0); - } -} +auto makePCT(const GenInfo &aprInfo, std::initializer_list> levels) { + auto pct = PullingScheme::generateParticleCellTree(aprInfo); + + + int l = aprInfo.l_min; -TEST(PullingSchemeTest, DeleteMeAfterDevelopment_fullAprPipeline) { - // TODO: delete me after development - // Full 'get apr' pipeline to test imp. on different stages - // Useful during debugging and can be removed once finished - - // Prepare input data (image) - int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; - // PS input values = 5 0 0 0 0 0 0 0 - -// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; -// PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0}; - - int len = sizeof(values)/sizeof(int); - PixelData data(len, 1, 1); - initFromZYXarray(data, values); - std::cout << "----- Input image:\n"; - data.printMeshT(3, 1); - - // Produce APR - APR apr; - APRConverter aprConverter; - aprConverter.par.rel_error = 0.1; - aprConverter.par.lambda = 0.1; - aprConverter.par.sigma_th = 0.0001; - aprConverter.par.neighborhood_optimization = true; - aprConverter.get_apr(apr, data); - - // Print information about APR and all particles - std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; - for (int l = apr.level_min(); l <= apr.level_max(); ++l) { - std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; + // PS levels range is [l_max - 1, l_min] + if (((aprInfo.l_max - 1) - aprInfo.l_min + 1) != (int) levels.size()) { + throw std::runtime_error("Wrong number of level data provided!"); } - std::cout << "APR particles z x y level:\n"; - auto it = apr.iterator(); - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; - } - } + for (auto &level : levels) { + if (pct[l].getDimension().size() != level.size()) { + std::cerr << "Provided data for level=" << l << " differs from level size " << pct[l].getDimension().size() << " vs. 
" << level.size() << std::endl; + std::cerr << aprInfo << std::endl; + throw std::runtime_error("Not this time..."); } + std::copy(level.begin(), level.end(), pct[l].mesh.begin()); + l++; } - std::cout << std::endl; - - // Sample input - ParticleData particleIntensities; - particleIntensities.sample_image(apr, data); - - // Reconstruct image from particles - PixelData reconstructImg; - APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); - std::cout << "----- Reconstructed image:"< levelImg; - APRReconstruction::reconstruct_level(apr, levelImg); - std::cout << "----- Image levels:" << std::endl; - levelImg.printMeshT(3, 1); - - // Show intensities and levels of each particle - std::cout << "----- Particle intensities:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - particleIntensities.fill_with_levels(apr); - - std::cout << "----- Particle levels:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - // Show some general information about generated APR - double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); - std::cout << std::endl; - std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; - std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; + + return pct; +} + +TEST(LinearAccessTest, optimizationForSmallLevels) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(4, 3, 2); + auto pct = makePCT(gi, {{1, 2, 3, 4}}); // In that case values of PCT are not important (all dense particle data will be generated anyway) + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = true; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; // all 'y' particles for each xz + std::vector expected_xz_end_vec = {0, 0, 0, 4, 8, 12, 16, 20, 24}; + std::vector expected_level_xz_vec = {1, 1, 3, 9}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); } +TEST(LinearAccessTest, yDirNeighbourhoodOptTrue) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(16, 1, 1); + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); -TEST(PullingSchemeTest, DeleteMeAfterDevelopment_PS) { - // TODO: delete me after development - // Runs PS to test imp. 
on different stages - // Useful during debugging and can be removed once finished -// int values[] = {0,0,0,5, 0,0,0,0}; -// int len = sizeof(values)/sizeof(int); + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = true; - PixelData levels(2, 2, 2, 0); - levels(0,0,0) = 4; + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); -// initFromZYXarray(levels, values); - std::cout << "---------------\n"; - levels.printMeshT(3, 1); - std::cout << "---------------\n"; + // ---- Verify output + std::vector expected_y_vec = {2, 3, 1, 2, 3, 0, 1}; + std::vector expected_xz_end_vec = {0, 0, 2, 5, 7}; + std::vector expected_level_xz_vec = {1, 1, 2, 3, 4, 5}; + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); +} + +TEST(LinearAccessTest, yDirNeighbourhoodOptFalse) { + + // --- Create input data structures and objects GenInfo gi; - const PixelDataDim dim = levels.getDimension(); - std::cout << "Levels dim: " << dim << std::endl; - gi.init(dim.y * 2, dim.x * 2, dim.z * 2); // time two in y-direction since PS container is downsized. - std::cout << gi << std::endl; + gi.init(16, 1, 1); + + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = false; - APRTimer t(false); + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); - t.start_timer("PS1"); - PullingScheme ps; - ps.initialize_particle_cell_tree(gi); - int l_max = gi.l_max - 1; - int l_min = gi.l_min; - std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; + // ---- Verify output + std::vector expected_y_vec = {2, 3, 2, 3, 0, 1, 2, 3}; + std::vector expected_xz_end_vec = {0, 0, 2, 4, 8}; + std::vector expected_level_xz_vec = {1, 1, 2, 3, 4, 5}; - fillPS(ps, levels); + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); +} - std::cout << "---------- Filled PS tree\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "---------------\n"; +TEST(LinearAccessTest, xDirNeighbourhoodOptTrue) { - ps.pulling_scheme_main(); - t.stop_timer(); + // --- Create input data structures and objects + GenInfo gi; + gi.init(1, 16, 1); - std::cout << "----------PS:\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "-------------\n"; + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); LinearAccess linearAccess; linearAccess.genInfo = &gi; APRParameters par; - linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + par.neighborhood_optimization = true; - std::cout << gi << std::endl; - auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << " "; std::cout << std::endl; }; - prt(linearAccess.y_vec); - prt(linearAccess.xz_end_vec); - prt(linearAccess.level_xz_vec); + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); - LinearIterator it(linearAccess, gi); - for (int l = 0; l <= 3; l++) { - std::cout << 
it.particles_level_begin(l) << " " << it.particles_level_end(l) << std::endl; - } - std::cout << "NumOfParticles: " << gi.total_number_particles << std::endl; - - std::cout << "===========================\n"; - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; - } - } - } - } - std::cout << std::endl; + // ---- Verify output + std::vector expected_y_vec = {0, 0, 0, 0, 0, 0, 0}; + std::vector expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; + std::vector expected_level_xz_vec = {1, 1, 3, 7, 15, 31}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); +} + +TEST(LinearAccessTest, xDirNeighbourhoodOptFalse) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(1, 16, 1); + + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = false; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 0, 0, 0, 0, 0, 0, 0}; + std::vector expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + std::vector expected_level_xz_vec = {1, 1, 3, 7, 15, 31}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); +} + +TEST(LinearAccessTest, zDirNeighbourhoodOptTrue) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(1, 1, 16); + + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = true; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 0, 0, 0, 0, 0, 0}; + std::vector expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; + std::vector expected_level_xz_vec = {1, 1, 3, 7, 15, 31}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); +} + +TEST(LinearAccessTest, zDirNeighbourhoodOptFalse) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(1, 1, 16); + + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = false; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 0, 0, 0, 0, 0, 0, 0}; + std::vector expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4, 5, 6, 7, 8, 8, 8, 8, 
8, 8, 8, 8, 8, 8, 8, 8, 8}; + std::vector expected_level_xz_vec = {1, 1, 3, 7, 15, 31}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); } int main(int argc, char **argv) { diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 491599aa..2baa2369 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -126,14 +126,16 @@ inline int64_t compareParticles(const ParticleTypeA &expected, const ParticleTyp } for (size_t i = 0; i < expected.size(); ++i) { - if (std::abs(expected[i] - tested[i]) > maxError) { + if (std::abs((double)(expected[i] - tested[i])) > maxError) { if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested particle: " << (float)expected[i] << " vs " << (float)tested[i] << " IDX:" << i << std::endl; } cnt++; } } - std::cout << "Number of errors / all points: " << cnt << " / " << expected.size() << std::endl; + if (cnt != 0) { + std::cout << "Number of errors / all points: " << cnt << " / " << expected.size() << std::endl; + } return cnt; } From e83b9527e2dec702776a71e00ffff6d81409b568 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 23 Feb 2024 10:25:53 +0100 Subject: [PATCH 41/80] Check also total_number_particles in LinearAccess test --- test/LinearAccessCudaTest.cpp | 16 ++++++++++------ test/LinearAccessTest.cpp | 14 ++++++++++++++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp index bae93233..24e961fd 100644 --- a/test/LinearAccessCudaTest.cpp +++ b/test/LinearAccessCudaTest.cpp @@ -26,7 +26,7 @@ void printParticleCellTree(const std::vector> &particleCellTree) { } } -TEST(LinearAccessTest, DeleteMeAfterDevelopment_fullAprPipeline) { +TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_fullAprPipeline) { // TODO: delete me after development // Full 'get apr' pipeline to test imp. on different stages // Useful during debugging and can be removed once finished @@ -106,15 +106,14 @@ TEST(LinearAccessTest, DeleteMeAfterDevelopment_fullAprPipeline) { } - -TEST(LinearAccessTest, DeleteMeAfterDevelopment_PS) { +TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_PS) { // TODO: delete me after development // Runs PS to test imp. 
on different stages // Useful during debugging and can be removed once finished // int values[] = {0,0,0,5, 0,0,0,0}; // int len = sizeof(values)/sizeof(int); - PixelData levels(3, 4,4, 0); + PixelData levels(3, 1, 1, 0); levels(0,0,0) = 4; // initFromZYXarray(levels, values); @@ -153,7 +152,7 @@ TEST(LinearAccessTest, DeleteMeAfterDevelopment_PS) { LinearAccess linearAccess; linearAccess.genInfo = &gi; APRParameters par; - par.neighborhood_optimization = false; + par.neighborhood_optimization = true; linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); std::cout << gi << std::endl; @@ -179,4 +178,9 @@ TEST(LinearAccessTest, DeleteMeAfterDevelopment_PS) { } } std::cout << std::endl; -} \ No newline at end of file +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/LinearAccessTest.cpp b/test/LinearAccessTest.cpp index 00ef84be..b6c67db8 100644 --- a/test/LinearAccessTest.cpp +++ b/test/LinearAccessTest.cpp @@ -58,6 +58,8 @@ TEST(LinearAccessTest, optimizationForSmallLevels) { EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); } TEST(LinearAccessTest, yDirNeighbourhoodOptTrue) { @@ -86,6 +88,8 @@ TEST(LinearAccessTest, yDirNeighbourhoodOptTrue) { EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); } TEST(LinearAccessTest, yDirNeighbourhoodOptFalse) { @@ -114,6 +118,8 @@ TEST(LinearAccessTest, yDirNeighbourhoodOptFalse) { EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); } TEST(LinearAccessTest, xDirNeighbourhoodOptTrue) { @@ -142,6 +148,8 @@ TEST(LinearAccessTest, xDirNeighbourhoodOptTrue) { EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); } TEST(LinearAccessTest, xDirNeighbourhoodOptFalse) { @@ -170,6 +178,8 @@ TEST(LinearAccessTest, xDirNeighbourhoodOptFalse) { EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); } TEST(LinearAccessTest, zDirNeighbourhoodOptTrue) { @@ -198,6 +208,8 @@ TEST(LinearAccessTest, zDirNeighbourhoodOptTrue) { EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); } TEST(LinearAccessTest, zDirNeighbourhoodOptFalse) { @@ -226,6 +238,8 @@ 
TEST(LinearAccessTest, zDirNeighbourhoodOptFalse) { EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); } int main(int argc, char **argv) { From 2cc5bcabd534274477f0a47ab69b50e3453d244f Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 2 Aug 2024 15:33:24 +0200 Subject: [PATCH 42/80] LinearAccessCuda implemented (it is not used yet in CUDA pipeline) --- CMakeLists.txt | 1 + src/algorithm/PullingScheme.hpp | 7 + .../APR/access/LinearAccess.hpp | 39 +- .../APR/access/LinearAccessCuda.cu | 593 ++++++++++++++++++ .../APR/access/LinearAccessCuda.hpp | 17 + src/data_structures/Mesh/PixelData.hpp | 15 +- test/LinearAccessCudaTest.cpp | 227 ++++++- test/PullingSchemeCudaTest.cpp | 33 - test/PullingSchemeTest.cpp | 33 - test/TestTools.hpp | 40 +- 10 files changed, 905 insertions(+), 100 deletions(-) create mode 100644 src/data_structures/APR/access/LinearAccessCuda.cu create mode 100644 src/data_structures/APR/access/LinearAccessCuda.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7daa68a3..56cd98ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -226,6 +226,7 @@ if(APR_USE_CUDA) src/algorithm/LocalIntensityScale.cu src/algorithm/OVPC.cu src/data_structures/APR/access/GPUAccess.cu + src/data_structures/APR/access/LinearAccessCuda.cu src/numerics/miscCuda.cu src/numerics/APRDownsampleGPU.cu src/numerics/PixelNumericsGPU.cu diff --git a/src/algorithm/PullingScheme.hpp b/src/algorithm/PullingScheme.hpp index c6756df6..05b0b723 100644 --- a/src/algorithm/PullingScheme.hpp +++ b/src/algorithm/PullingScheme.hpp @@ -13,14 +13,21 @@ #include "data_structures/Mesh/ImagePatch.hpp" #include +// Main types #define EMPTY 0 #define SEED_TYPE 1 #define BOUNDARY_TYPE 2 #define FILLER_TYPE 3 + +// Type used in linear/random access +#define UPSAMPLING_SEED_TYPE 4 + +// Types specific for this implementation of Pulling Scheme (OVPC is not using them) #define ASCENDANT 8 #define PROPOGATE 15 #define ASCENDANTNEIGHBOUR 16 + #define NEIGHBOURLOOP(jn,in,kn, boundaries) \ for(jn = boundaries[0][0]; jn < boundaries[0][1]; jn++) \ for(in = boundaries[1][0]; in < boundaries[1][1]; in++) \ diff --git a/src/data_structures/APR/access/LinearAccess.hpp b/src/data_structures/APR/access/LinearAccess.hpp index b00a02b0..b92476c2 100644 --- a/src/data_structures/APR/access/LinearAccess.hpp +++ b/src/data_structures/APR/access/LinearAccess.hpp @@ -11,6 +11,7 @@ #include "data_structures/Mesh/PixelData.hpp" #include "algorithm/APRParameters.hpp" +#include "algorithm/PullingScheme.hpp" #include "APRAccessStructures.hpp" @@ -225,6 +226,9 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet initialize_xz_linear(); + // ********************************************************************************************************************* + // FULL RESOLUTION + // ********************************************************************************************************************* //edge case if(level_max()<=2){ // For performance reasons and clarity of the code, it doesn't make sense here to handle these cases. 
Below assumes there is atleast levels <=2; @@ -254,10 +258,11 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet return; } - // ======================================================================== + // ********************************************************************************************************************* + // FIRST STEP + // ********************************************************************************************************************* apr_timer.start_timer("first_step"); - const uint8_t UPSAMPLING_SEED_TYPE = 4; const uint8_t seed_us = UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization for (int level = level_min()+1; level < level_max(); ++level) { const size_t xLen = genInfo->x_num[level]; @@ -288,7 +293,9 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet } apr_timer.stop_timer(); - // ======================================================================== + // ********************************************************************************************************************* + // SECOND STEP + // ********************************************************************************************************************* apr_timer.start_timer("second_step"); @@ -323,14 +330,15 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet } } + +// ********************************************************************************************************************* +// SECOND STEP LAST LEVEL +// +// l_max - 1 is special as it also has the l_max information that then needs to be upsampled. +// ********************************************************************************************************************* std::vector temp_max_xz; temp_max_xz.resize(genInfo->z_num[genInfo->l_max - 1]*genInfo->x_num[genInfo->l_max - 1],0); - /* - * l_max - 1 is special as it also has the l_max information that then needs to be upsampled. - * - */ - size_t l_minus_1 = genInfo->l_max - 1; const size_t xLen = genInfo->x_num[l_minus_1]; const size_t zLen = genInfo->z_num[l_minus_1]; @@ -404,6 +412,11 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet apr_timer.stop_timer(); + + // ********************************************************************************************************************* + // THIRD STEP - Get Y values + // ********************************************************************************************************************* + apr_timer.start_timer("init y"); genInfo->total_number_particles = xz_end_vec.back(); @@ -447,10 +460,11 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet } } - /* - * l_max - 1 is special as it also has the l_max information that then needs to be upsampled. - * - */ + // ********************************************************************************************************************* + // 4th STEP LAST LEVEL + // + // l_max - 1 is special as it also has the l_max information that then needs to be upsampled. 
+ // ********************************************************************************************************************* #ifdef HAVE_OPENMP @@ -540,7 +554,6 @@ inline void LinearAccess::initialize_linear_structure_sparse(APRParameters& apr_ // ======================================================================== apr_timer.start_timer("first_step"); - const uint8_t UPSAMPLING_SEED_TYPE = 4; const uint8_t seed_us = UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization for (int level = level_min()+1; level < level_max(); ++level) { const size_t xLen = genInfo->x_num[level]; diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu new file mode 100644 index 00000000..8ce7e347 --- /dev/null +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -0,0 +1,593 @@ +#include "LinearAccessCuda.hpp" + +#include "misc/CudaTools.cuh" + +// CUDA version of GenInfo structure +typedef struct GenInfoCuda_t { + int l_min; + int l_max; + + int *org_dims; // fixed size: [3] + + uint8_t number_dimensions; + + int *x_num; + int *y_num; + int *z_num; + + // this differs from original GenInfo structure + // since we need to be able to send data back from GPU to CPU + uint64_t *total_number_particles; + + int *level_size; + + uint64_t get_total_number_particles() const { return *total_number_particles; } + + __device__ int level_max() const { return l_max; } + __device__ int level_min() const { return l_min; } + +} GenInfoCuda; + +// ----------------------------- + +/* + * Class for easy transfering to/from GPU of GenInfo structure. + */ +class GenInfoGpuAccess { + GenInfo &gi; + + cudaStream_t iStream; + + ScopedCudaMemHandler org_dims; + ScopedCudaMemHandler x_num; + ScopedCudaMemHandler y_num; + ScopedCudaMemHandler z_num; + ScopedCudaMemHandler total_number_particles; + ScopedCudaMemHandler level_size; + + +public: + GenInfoGpuAccess(GenInfo &genInfo, cudaStream_t cudaStream) : + gi(genInfo), + iStream(cudaStream), + org_dims(gi.org_dims, 3, iStream), + x_num(gi.x_num.data(), gi.x_num.size(), iStream), + y_num(gi.y_num.data(), gi.y_num.size(), iStream), + z_num(gi.z_num.data(), gi.z_num.size(), iStream), + total_number_particles(&gi.total_number_particles, 1, iStream), + level_size(gi.level_size.data(), gi.level_size.size(), iStream) + { + } + + GenInfoCuda getGenInfoCuda() { + GenInfoCuda gic; + + gic.l_min = gi.l_min; + gic.l_max = gi.l_max; + gic.org_dims = org_dims.get(); + gic.number_dimensions = gi.number_dimensions; + gic.x_num = x_num.get(); + gic.y_num = y_num.get(); + gic.z_num = z_num.get(); + gic.total_number_particles = total_number_particles.get(); + gic.level_size = level_size.get(); + + return gic; + } + + ~GenInfoGpuAccess() { + copyDtoH(); + } + + void copyHtoD() { + // The only data that can change between CPU & GPU (the rest values are fixed based on input image dimension) + total_number_particles.copyH2D(); + } + + void copyDtoH() { + // The only data that can change between CPU & GPU (the rest values are fixed based on input image dimension) + total_number_particles.copyD2H(); + } +}; + +typedef ScopedCudaMemHandler, H2D | D2H> ParticleCellTreeLevelCuda; +typedef std::vector ParticleCellTreeCuda; + +// ********************************************************************************************************************* +// FULL RESOLUTION +// ********************************************************************************************************************* +/** + * Handle edge case for #levels <= 2 + * For 
performance reasons and clarity of the code, + * it doesn't make sense here to handle these cases. + * Below assumes there is at least levels <=2; + * @param level_xz + * @param xz_end + * @param y + * @param gic - cuda version of GenInfo + */ +__global__ void fullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, GenInfoCuda gic) { + + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const unsigned levelMax = gic.level_max(); + const uint64_t xMax = gic.x_num[levelMax]; + const uint64_t yMax = gic.y_num[levelMax]; + const uint64_t zMax = gic.z_num[levelMax]; + + + if (x < xMax && z < zMax) { + const uint64_t levelStart = level_xz[levelMax]; + uint64_t offset_pc_data = z * xMax + x; + uint64_t particleCounter = (1 + x + z * xMax) * yMax; + + xz_end[levelStart + offset_pc_data] = particleCounter; + + for (int i = 0; i < yMax; ++i) { + uint64_t idx = (xMax * z + x) * yMax + i; + y[idx] = i; + } + } + + if (x == 0 && z == 0) { + *gic.total_number_particles = xMax * yMax * zMax; + } +} + +void runFullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, GenInfo &gi, GenInfoGpuAccess &giga, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + + dim3 numBlocks( (gi.x_num[gi.l_max] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[gi.l_max] + threadsPerBlock.z - 1)/threadsPerBlock.z); + fullResolution<<>>(level_xz, xz_end, y, giga.getGenInfoCuda()); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runFullResolution failed"); + } +} + + +// ********************************************************************************************************************* +// FIRST STEP +// ********************************************************************************************************************* + +constexpr uint8_t UPSAMPLING_SEED_TYPE = 4; +static constexpr uint8_t seed_us = UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization + + +__global__ void firstStep(uint8_t *prevLevel, uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic) { + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const uint64_t xLen = gic.x_num[level]; + const uint64_t yLen = gic.y_num[level]; + const uint64_t zLen = gic.z_num[level]; + const uint64_t xLenDS = gic.x_num[level - 1]; + const uint64_t yLenDS = gic.y_num[level - 1]; + + if (x < xLen && z < zLen) { + const size_t offset_part_map_ds = (x / 2) * yLenDS + (z / 2) * yLenDS * xLenDS; + const size_t offset_part_map = x * yLen + z * yLen * xLen; + + for (size_t y = 0; y < yLenDS; ++y) { + uint8_t status = prevLevel[offset_part_map_ds + y]; + if (status > 0 && status <= min_type) { + currLevel[offset_part_map + 2 * y] = seed_us; // 2 * y + currLevel[offset_part_map + min(2 * y + 1, yLen - 1)] = seed_us; // 2 * y + 1 + } + } + } +} + +void runFirstStep(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + + for (int level = gi.l_min + 1; level < gi.l_max; ++level) { + dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); + auto &p_mapPrev = p_map[level - 1]; + auto &p_mapCurr = p_map[level]; + firstStep<<>>(p_mapPrev.get(), p_mapCurr.get(), 
level, min_type, giga.getGenInfoCuda()); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runFirstStep failed"); + } +} + + +// ********************************************************************************************************************* +// SECOND STEP +// ********************************************************************************************************************* + + +__global__ void secondStep(const uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end) { + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const uint64_t xLen = gic.x_num[level]; + const uint64_t yLen = gic.y_num[level]; + const uint64_t zLen = gic.z_num[level]; + + const uint64_t level_start = level_xz[level]; + + if (x < xLen && z < zLen) { + const size_t offset_pc_data = z * xLen + x; + const size_t offset_part_map = yLen * offset_pc_data; + + uint64_t counter = 0; + + for (size_t y = 0; y < yLen; ++y) { + uint8_t status = currLevel[offset_part_map + y]; + if (status > min_type && status <= UPSAMPLING_SEED_TYPE) { + counter++; + } + } + + xz_end[level_start + offset_pc_data] = counter; + } +} + +void runSecondStep(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + + for (int level = gi.l_min; level < gi.l_max - 1; ++level) { + dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); + auto &p_mapCurr = p_map[level]; + secondStep<<>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runSecondStep failed"); + } +} + + +// ********************************************************************************************************************* +// SECOND STEP LAST LEVEL +// +// l_max - 1 is special as it also has the l_max information that then needs to be upsampled. 
+// ********************************************************************************************************************* + + +__global__ void secondStepLastLevel(const uint8_t *currLevel, int level_minus_1, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end) { + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const uint64_t xLen = gic.x_num[level_minus_1]; + const uint64_t yLen = gic.y_num[level_minus_1]; + const uint64_t zLen = gic.z_num[level_minus_1]; + + const uint64_t xLen_m = gic.x_num[level_minus_1 + 1]; // level max + const uint64_t yLen_m = gic.y_num[level_minus_1 + 1]; // level max + const uint64_t zLen_m = gic.z_num[level_minus_1 + 1]; // level max + + const uint64_t level_start = level_xz[level_minus_1]; + const uint64_t level_start_m = level_xz[level_minus_1 + 1]; // level max + + + if (x < xLen && z < zLen) { + const size_t offset_pc_data = z * xLen + x; + const size_t offset_part_map = yLen * offset_pc_data; + + uint64_t counter = 0; + uint64_t counter_l = 0; + + for (size_t y = 0; y < yLen; ++y) { + uint8_t status = currLevel[offset_part_map + y]; + if (status > min_type && status <= UPSAMPLING_SEED_TYPE) { + counter++; + } + else if (status > 0 && status <= min_type) { + counter_l++; + + if ((2 * y) < (yLen_m - 1)) { + counter_l++; + } + } + } + + xz_end[level_start + offset_pc_data] = counter; + + // In original CPU code value of counter_l is remembered in temporary buffer and later + // write down to xz_end vector. Here is the solution without need of temp. buffer. + for (size_t dz = 0; dz <= 1; dz++) { + for (size_t dx = 0; dx <= 1; dx++) { + size_t uz = 2 * z + dz; // upsampled z + size_t ux = 2 * x + dx; // upsampled x + if (uz < zLen_m && ux < xLen_m) { + const size_t offset_pc_data_m = uz * xLen_m + ux; + xz_end[level_start_m + offset_pc_data_m] = counter_l; + } + } + } + + } +} + +__global__ void secondStepCountParticles(GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint64_t counter_total) { + // std::partial_sum on one CUDA core naive implementation + size_t sum = xz_end[0]; + for (size_t i = 1; i < counter_total; i++) { + sum += xz_end[i]; + xz_end[i] = sum; + } + + *gic.total_number_particles = xz_end[counter_total -1]; +} + +void runSecondStepLastLevel(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint64_t counter_total, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + dim3 numBlocks( (gi.x_num[gi.l_max - 1] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[gi.l_max - 1] + threadsPerBlock.z - 1)/threadsPerBlock.z); + + int level = gi.l_max - 1; + auto &p_mapCurr = p_map[level]; + secondStepLastLevel<<>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runSecondStepLastLevel #1 failed"); + } + + secondStepCountParticles<<<1, 1, 0, aStream>>>(giga.getGenInfoCuda(), level_xz, xz_end, counter_total); + + err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runSecondStepLastLevel #2 failed"); + } +} + + +// ********************************************************************************************************************* +// 
THIRD STEP - Get Y values +// ********************************************************************************************************************* + + +__global__ void getYvalues(const uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec) { + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const uint64_t xLen = gic.x_num[level]; + const uint64_t yLen = gic.y_num[level]; + const uint64_t zLen = gic.z_num[level]; + + const uint64_t level_start = level_xz[level]; + + if (x < xLen && z < zLen) { + const size_t offset_pc_data = z * xLen + x; + const size_t offset_part_map = yLen * offset_pc_data; + + uint64_t counter = 0; + + uint64_t offset_y = xz_end[level_start + offset_pc_data - 1]; + + for (size_t y = 0; y < yLen; ++y) { + uint8_t status = currLevel[offset_part_map + y]; + if (status > min_type && status <= UPSAMPLING_SEED_TYPE) { + y_vec[counter + offset_y] = y; + counter++; + } + } + } +} + +void runGetYvalues(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + + for (int level = gi.l_min; level < gi.l_max - 1; ++level) { + dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); + auto &p_mapCurr = p_map[level]; + getYvalues<<>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runGetYvalues failed"); + } +} + + +// ********************************************************************************************************************* +// 4th STEP LAST LEVEL +// +// l_max - 1 is special as it also has the l_max information that then needs to be upsampled. 
+// ********************************************************************************************************************* + + +__global__ void fourthStep(const uint8_t *currLevel, int level_minus_1, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec) { + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const uint64_t xLen = gic.x_num[level_minus_1]; + const uint64_t yLen = gic.y_num[level_minus_1]; + const uint64_t zLen = gic.z_num[level_minus_1]; + + const uint64_t xLen_m = gic.x_num[level_minus_1 + 1]; // level max + const uint64_t yLen_m = gic.y_num[level_minus_1 + 1]; // level max + + const uint64_t level_start_minus_1 = level_xz[level_minus_1]; + const uint64_t level_start_m = level_xz[level_minus_1 + 1]; // level max + + + if (x < xLen && z < zLen) { + const size_t offset_pc_data = z * xLen + x; + + const size_t offset_pc_data_m = (z*2) * xLen_m + x * 2; + const size_t offset_part_map = yLen * offset_pc_data; // current level + + uint64_t counter = 0; + uint64_t counter_l = 0; + + uint64_t offset_y = xz_end[level_start_minus_1 + offset_pc_data - 1]; + uint64_t offset_y_m = xz_end[level_start_m + offset_pc_data_m -1]; + + for (size_t y = 0; y < yLen; ++y) { + uint8_t status = currLevel[offset_part_map + y]; + if (status > min_type && status <= UPSAMPLING_SEED_TYPE) { + y_vec[counter + offset_y] = y; + counter++; + } + else if (status > 0 && status <= min_type) { + y_vec[counter_l + offset_y_m] = 2*y; + counter_l++; + + if ((2 * y) < (yLen_m - 1)) { + y_vec[counter_l + offset_y_m] = 2*y + 1; + counter_l++; + } + } + } + } +} + +__global__ void fourthStepLastLevel(GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec) { + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + + int maxLevel = gic.level_max(); + const uint64_t xLen_m = gic.x_num[maxLevel]; // level max + const uint64_t zLen_m = gic.z_num[maxLevel]; // level max + + const uint64_t level_start_m = level_xz[maxLevel]; + + + if (x < xLen_m && z < zLen_m) { + + // first check if it's not already there + if ( ((z % 2) != 0) || ((x % 2) != 0) ) { + const size_t offset_pc_data_m = z * xLen_m + x; + const size_t offset_pc_data_m_f = (z/2) * 2 * xLen_m + (x/2) * 2; + + uint64_t offset_y_b_f = xz_end[level_start_m + offset_pc_data_m_f - 1]; + uint64_t offset_y_e_f = xz_end[level_start_m + offset_pc_data_m_f]; + uint64_t offset_y_b = xz_end[level_start_m + offset_pc_data_m - 1]; + + for (uint64_t idx = offset_y_b_f; idx < offset_y_e_f; ++idx) { + y_vec[offset_y_b++] = y_vec[idx]; + } + } + + } +} + +void runFourthStep(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, uint64_t counter_total, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + dim3 numBlocks( (gi.x_num[gi.l_max] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[gi.l_max] + threadsPerBlock.z - 1)/threadsPerBlock.z); + + int level = gi.l_max - 1; + auto &p_mapCurr = p_map[level]; + fourthStep<<>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runFourthStep #1 failed"); + } + + fourthStepLastLevel<<>>(giga.getGenInfoCuda(), 
level_xz, xz_end, y_vec);
+
+    cudaError_t err2 = cudaGetLastError();
+    if (err2 != cudaSuccess) {
+        printf("----------------------------------Error: %s\n", cudaGetErrorString(err2));
+        throw std::runtime_error("runFourthStep #2 failed");
+    }
+}
+
+
+// *********************************************************************************************************************
+// MAIN FUNC TO CALL - implements the logic of LinearAccess::initialize_linear_structure CPU func.
+// *********************************************************************************************************************
+
+
+/*
+ * This function does everything:
+ * - creates CPU structures
+ * - copies everything to GPU
+ * - runs the computation of all linear structures
+ * - copies the result back to CPU
+ * - returns all the structures
+ *
+ * In its current shape it is a good function for testing the implementation rather than for use in production code.
+ * Production code should use parts of it and work on pre-allocated memory - probably in GpuProcessingTask.
+ */
+LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, APRParameters &apr_parameters, std::vector> &pct) {
+    // Copy input to CUDA mem and prepare the CUDA representation of the particle cell tree, which will be filled
+    // after computing all steps
+    ParticleCellTreeCuda p_map;
+    for (auto &p : pct) {
+        p_map.emplace_back(std::move(ParticleCellTreeLevelCuda(p)));
+    }
+
+    uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2;
+
+    VectorData y_vec(true);
+    VectorData xz_end_vec(true);
+    VectorData level_xz_vec(true);
+
+    // initialize_xz_linear() - CPU impl.
+    uint64_t counter_total = 1; //the buffer val to allow -1 calls without checking.
+    level_xz_vec.resize(gi.l_max + 2, 0); //includes a buffer for -1 calls, and therefore needs to be called with level + 1;
+    level_xz_vec[0] = 1; //allowing for the offset.
+    for (int i = 0; i <= gi.l_max; ++i) {
+        counter_total += gi.x_num[i] * gi.z_num[i];
+        level_xz_vec[i + 1] = counter_total;
+    }
+    xz_end_vec.resize(counter_total, 0);
+
+// auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; };
+// prt(y_vec);
+// prt(xz_end_vec);
+// prt(level_xz_vec);
+
+    // TODO: This is a temporary solution.
+    // In the CPU code the size of y_vec is calculated 'on the fly', while in the CUDA code it is much better to work
+    // on pre-allocated memory - therefore y_vec is currently pre-allocated to its maximum possible size. This is not
+    // optimal, but it always works. If a better idea pops up - it will be changed.
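+    // For reference, how the three vectors fit together once filled (a lookup sketch only - 'level', 'x' and 'z'
+    // below are hypothetical indices; the real accesses are performed inside the kernels above):
+    //
+    //     uint64_t xz    = level_xz_vec[level] + z * gi.x_num[level] + x;  // index of the (x, z) column
+    //     uint64_t begin = xz_end_vec[xz - 1];                             // first particle of that column
+    //     uint64_t end   = xz_end_vec[xz];                                 // one past its last particle
+    //     // y_vec[begin .. end) holds the y coordinates of all particles in that column on that level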
+ size_t maxYvecSize = gi.x_num[gi.l_max] * gi.y_num[gi.l_max] * gi.z_num[gi.l_max]; + y_vec.resize(maxYvecSize); + + cudaStream_t aStream = nullptr; + { + ScopedCudaMemHandler y_vec_cuda(y_vec.data(), y_vec.size()); + ScopedCudaMemHandler xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size()); + ScopedCudaMemHandler level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size()); + GenInfoGpuAccess giga(gi, aStream); + if (gi.l_max <= 2) { + runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), gi, giga, aStream); + } + else { + runFirstStep(gi, giga, p_map, min_type, aStream); + runSecondStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), aStream); + runSecondStepLastLevel(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), counter_total, aStream); + runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), aStream); + runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), counter_total, aStream); + } + } + + // TODO: Resized back to correct size, should it be initialized to this size in the first place or pre-allocation for + // full size is more than enough? (for example in case of computing particles for multiple frames with same resolution + // we can get different size of particles for each frame - with preallocated buffer we can do all of them on it). + y_vec.resize(gi.total_number_particles); + + + LinearAccessCudaStructs lac; + lac.y_vec.swap(y_vec); + lac.xz_end_vec.swap(xz_end_vec); + lac.level_xz_vec.swap(level_xz_vec); + + return lac; +} diff --git a/src/data_structures/APR/access/LinearAccessCuda.hpp b/src/data_structures/APR/access/LinearAccessCuda.hpp new file mode 100644 index 00000000..53dfd001 --- /dev/null +++ b/src/data_structures/APR/access/LinearAccessCuda.hpp @@ -0,0 +1,17 @@ +#ifndef APR_LINEARACCESSCUDA_HPP +#define APR_LINEARACCESSCUDA_HPP + +#include "algorithm/APRParameters.hpp" +#include "data_structures/Mesh/PixelData.hpp" +#include "data_structures/APR/GenInfo.hpp" + +typedef struct { + VectorData y_vec; + VectorData xz_end_vec; + VectorData level_xz_vec; +} LinearAccessCudaStructs; + +LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, APRParameters &apr_parameters, std::vector> &pct); + + +#endif //APR_LINEARACCESSCUDA_HPP diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index e0a037f0..f0127920 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -149,10 +149,6 @@ public : usePinnedMemory = usePinned; } - void setUsePinnedMemory(bool usePinned){ - usePinnedMemory = usePinned; - } - inline uint64_t size() const{ return vec.size(); } @@ -283,8 +279,19 @@ public : std::swap(usePinnedMemory, aObj.usePinnedMemory); std::swap(vecMemory, aObj.vecMemory); vec.swap(aObj.vec); +#ifdef APR_USE_CUDA + std::swap(vecMemoryPinned, aObj.vecMemoryPinned); +#endif } + VectorData(VectorData &&aObj) { + usePinnedMemory = aObj.usePinnedMemory; + vecMemory.swap(aObj.vecMemory); + vec = std::move(aObj.vec); +#ifdef APR_USE_CUDA + vecMemoryPinned =std::move(aObj.vecMemoryPinned); +#endif + } /** * Apply unary operator to each element in parallel, writing the result to VectorData 'output'. 
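For orientation, a minimal usage sketch of the entry point declared above in LinearAccessCuda.hpp, mirroring how the
tests below drive it (illustration only - the particle cell tree is assumed to have been produced beforehand, e.g. by
PullingScheme or computeOvpcCuda, and the dimensions are hypothetical):

    GenInfo gi;
    gi.init(128, 128, 64);                                    // y, x, z dimensions of the input image
    APRParameters par;
    par.neighborhood_optimization = true;
    auto pct = PullingScheme::generateParticleCellTree(gi);   // ...then filled by the pulling scheme (omitted here)

    LinearAccessCudaStructs lac = initializeLinearStructureCuda(gi, par, pct);
    // lac.y_vec, lac.xz_end_vec and lac.level_xz_vec now describe gi.total_number_particles particles.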
diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp index 24e961fd..7d1c4059 100644 --- a/test/LinearAccessCudaTest.cpp +++ b/test/LinearAccessCudaTest.cpp @@ -3,27 +3,98 @@ #include "algorithm/LocalParticleCellSet.hpp" #include "algorithm/PullingScheme.hpp" #include "algorithm/APRConverter.hpp" +#include "data_structures/APR/access/LinearAccessCuda.hpp" #include "TestTools.hpp" - -template -void fillPS(PullingScheme &aPS, PixelData &levels) { - PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); - LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); -} +namespace { + template + void fillPS(PullingScheme &aPS, PixelData &levels) { + PixelData levelsDS(ceil(levels.y_num / 2.0), ceil(levels.x_num / 2.0), ceil(levels.z_num / 2.0)); + LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); + } /** * Prints PCT * @param particleCellTree */ -template -void printParticleCellTree(const std::vector> &particleCellTree) { - for (uint64_t l = 0; l < particleCellTree.size(); ++l) { - auto &tree = particleCellTree[l]; -// std::cout << "-- level = " << l << ", " << tree << std::endl; - tree.printMeshT(3,0); + template + void printParticleCellTree(const std::vector> &particleCellTree) { + for (uint64_t l = 0; l < particleCellTree.size(); ++l) { + auto &tree = particleCellTree[l]; + tree.printMeshT(3, 0); + } + } + + /** + * Create PCT with provided data + * @param aprInfo + * @param levels complete list of values from level min to level max in form { {level, min, values}, ..., {level, max, values} } + * if levels are not provided PCT with EMPTY values is returned + * @return Particle Cell Tree with values (or with EMPTY if levels are not provided) + */ + auto makePCT(const GenInfo &aprInfo, std::initializer_list> levels) { + auto pct = PullingScheme::generateParticleCellTree(aprInfo); + + // Fill particle cell tree only if levels provided - otherwise return tree with EMPTY values + if (levels.size() != 0) { + + int l = aprInfo.l_min; + // PS levels range is [l_max - 1, l_min] + if (((aprInfo.l_max - 1) - aprInfo.l_min + 1) != (int) levels.size()) { + throw std::runtime_error("Wrong number of level data provided!"); + } + for (auto &level: levels) { + if (pct[l].getDimension().size() != level.size()) { + std::cerr << "Provided data for level=" << l << " differs from level size " + << pct[l].getDimension().size() << " vs. " << level.size() << std::endl; + std::cerr << aprInfo << std::endl; + throw std::runtime_error("Not this time..."); + } + std::copy(level.begin(), level.end(), pct[l].mesh.begin()); + l++; + } + } + return pct; + } + + // Copy PCT - copies only existing levels of it. + auto copyPCT(const std::vector> &pct) { + std::vector> copy; + copy.resize(pct.size()); + + for (int l = 0; l < pct.size(); ++l) { + copy[l].initWithResize(pct[l].y_num, pct[l].x_num, pct[l].z_num); + // Copy only existing levels + if (pct[l].z_num > 0) copy[l].copyFromMesh(pct[l]); + } + + return copy; } + + // Create random Particle Cell Tree with dimensions specified in 'gi' with given number of particles. 
+ auto makeRandomPCT(const GenInfo &gi, int numOfParticles = 3) { + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + + // Generate random levels for PS and OVPC + PixelData levels(std::ceil(gi.org_dims[0]/2.0), + std::ceil(gi.org_dims[1]/2.0), + std::ceil(gi.org_dims[2]/2.0), + 0); + int seed = std::time(nullptr); + std::srand(seed); + for (int i = 0; i < numOfParticles; ++i) { + int modulo = (gi.l_max - gi.l_min); + if (modulo == 0) modulo = 1; + levels(std::rand() % levels.y_num, std::rand() % levels.x_num, std::rand() % levels.z_num) = std::rand() % modulo + gi.l_min; + } + fillPS(ps, levels); + ps.pulling_scheme_main(); + + return copyPCT(ps.getParticleCellTree()); + } + } TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_fullAprPipeline) { @@ -113,8 +184,8 @@ TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_PS) { // int values[] = {0,0,0,5, 0,0,0,0}; // int len = sizeof(values)/sizeof(int); - PixelData levels(3, 1, 1, 0); - levels(0,0,0) = 4; + PixelData levels(8, 1, 1, 0); + levels(5,0,0) = 1; // initFromZYXarray(levels, values); std::cout << "---------------\n"; @@ -145,6 +216,7 @@ TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_PS) { ps.pulling_scheme_main(); t.stop_timer(); + // Useful during debugging and can be removed once finished std::cout << "----------PS:\n"; printParticleCellTree(ps.getParticleCellTree()); std::cout << "-------------\n"; @@ -180,6 +252,133 @@ TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_PS) { std::cout << std::endl; } +// ********************************************************************************************************************* +// Tests of CUDA implementation of LinearAccess +// ********************************************************************************************************************* + + +TEST(LinearAccessCudaTest, optimizationForSmallLevels) { + // Tests optimized part of LinearAccess returning full-resolution for levels <= 2 + + // --- Create input data structures and objects + GenInfo gi; + gi.init(4, 3, 2); + auto pct = makePCT(gi, {}); // In that case values of PCT are not important (all dense particle data will be generated anyway) + + APRParameters par; + par.neighborhood_optimization = true; + + // --- Method under test + auto linearAccess = initializeLinearStructureCuda(gi, par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; // all 'y' particles for each xz + std::vector expected_xz_end_vec = {0, 0, 0, 4, 8, 12, 16, 20, 24}; + std::vector expected_level_xz_vec = {1, 1, 3, 9}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + // Useful during debugging and can be removed once finished + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); + EXPECT_EQ(gi.total_number_particles, 4 * 3 * 2); +} + +TEST(LinearAccessCudaTest, optimizationForSmallLevelsVScpu) { + // Tests optimized part of LinearAccess returning full-resolution for levels <= 2 for all possible combination of xyz + // For bigger xyz 'optimized' part of code is not used + + for (int x = 1; x <= 4; ++x) { + for (int y = 1; y <= 4; ++y) { + for (int z = 1; z <= 4; ++z) { + std::cout << "< ============================================= " << x << " " << y << " "<< z << std::endl; + // --- Create input data structures and objects + GenInfo gi; + gi.init(y, x, z); + 
std::cout << gi << std::endl; + auto pct = makePCT(gi, {}); // In that case values of PCT are not important (all dense particle data will be generated anyway) + GenInfo giGpu; + giGpu.init(y, x, z); + auto pctGpu = makePCT(giGpu, {}); // In that case values of PCT are not important (all dense particle data will be generated anyway) + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = true; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pctGpu); + + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(giGpu.total_number_particles, gi.total_number_particles); + EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); + } + } + } + +} + +TEST(LinearAccessCudaTest, testGPUvsCPUforDifferentSizes) { + + for (int x : {1, 2, 4, 100, 255}) { + for (int y : {1, 2, 4, 100, 256}) { + for (int z : {1, 2, 4, 100, 257}) { +// std::cout << "< ============================================= " << y << " " << x << " "<< z << std::endl; + + // ----------- Create input data structures and objects + GenInfo gi; + gi.init(y, x, z); + + auto pct = makeRandomPCT(gi, 133); + + auto pctCpu = copyPCT(pct); + auto pctGpu = copyPCT(pct); + + GenInfo giGpu; + giGpu.init(y, x, z); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = true; + + + // --------- methods under test + APRTimer t(false); + t.start_timer("__________________________ CPU"); + // --- Method under test + linearAccess.initialize_linear_structure(par, pctCpu); + t.stop_timer(); + + t.start_timer("_________________________ GPU"); + auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pctGpu); + t.stop_timer(); + + + // ----------- verify results + + // LinearAccess changes PCT - compare if changes in CPU and GPU side are same + EXPECT_EQ(compareParticleCellTrees(pctCpu, pctGpu), 0); + + // Test if returned structures have same data + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + + EXPECT_EQ(giGpu.total_number_particles, gi.total_number_particles); + EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); + } + } + } + +} + + int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index 53eec162..5c4ebcb3 100644 --- a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -83,39 +83,6 @@ void printParticleCellTree(const std::vector> &particleCellTree) { } } -/** - * Compare - * @param expected - expected levels - * @param tested - levels to verify - * @param maxError - * @param maxNumOfErrPrinted - how many error outputs should be printed - * @return - */ -template -int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, int maxNumOfErrPrinted = 3) { - int cntGlobal = 0; - for (size_t level = 0; level < expected.size(); level++) { - int cnt = 0; - int numOfParticles = 0; - for (size_t i = 0; i < expected[level].mesh.size(); ++i) { 
- if (expected[level].mesh[i] < 8 && tested[level].mesh[i] <= FILLER_TYPE) { - if (std::abs(expected[level].mesh[i] - tested[level].mesh[i]) > 0 || std::isnan(expected[level].mesh[i]) || - std::isnan(tested[level].mesh[i])) { - if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "Level: " << level <<" ERROR expected vs tested mesh: " << (float) expected[level].mesh[i] << " vs " - << (float) tested[level].mesh[i] << " IDX:" << tested[level].getStrIndex(i) << std::endl; - } - cnt++; - } - if (expected[level].mesh[i] > 0) numOfParticles++; - } - } - cntGlobal += cnt; - if (cnt > 0) std::cout << "Level: " << level << ", Number of errors / all points: " << cnt << " / " << expected[level].mesh.size() << " Particles:" << numOfParticles << std::endl; - } - return cntGlobal; -} - template void fillPS(PullingScheme &aPS, PixelData &levels) { PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index be922d90..eeee9718 100644 --- a/test/PullingSchemeTest.cpp +++ b/test/PullingSchemeTest.cpp @@ -91,39 +91,6 @@ namespace { return true; } - /** - * Compare - * @param expected - expected levels - * @param tested - levels to verify - * @param maxError - * @param maxNumOfErrPrinted - how many error outputs should be printed - * @return - */ - template - int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, int maxNumOfErrPrinted = 3) { - int cntGlobal = 0; - for (size_t level = 0; level < expected.size(); level++) { - int cnt = 0; - int numOfParticles = 0; - for (size_t i = 0; i < expected[level].mesh.size(); ++i) { - if (expected[level].mesh[i] < 8 && tested[level].mesh[i] <= FILLER_TYPE) { - if (std::abs(expected[level].mesh[i] - tested[level].mesh[i]) > 0 || std::isnan(expected[level].mesh[i]) || - std::isnan(tested[level].mesh[i])) { - if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "Level: " << level <<" ERROR expected vs tested mesh: " << (float) expected[level].mesh[i] << " vs " - << (float) tested[level].mesh[i] << " IDX:" << tested[level].getStrIndex(i) << std::endl; - } - cnt++; - } - if (expected[level].mesh[i] > 0) numOfParticles++; - } - } - cntGlobal += cnt; - if (cnt > 0) std::cout << "Level: " << level << ", Number of errors / all points: " << cnt << " / " << expected[level].mesh.size() << " Particles:" << numOfParticles << std::endl; - } - return cntGlobal; - } - template void fillPS(PullingScheme &aPS, PixelData &levels) { PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 2baa2369..c6accd9a 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -120,9 +120,9 @@ inline int compareMeshes(const PixelData &expected, const PixelData &teste template inline int64_t compareParticles(const ParticleTypeA &expected, const ParticleTypeB &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 10) { int64_t cnt = 0; - if(expected.size() != tested.size()) { - std::cerr << "ERROR compareParticles: sizes differ!" << std::endl; - cnt++; + if (expected.size() != tested.size()) { + std::cerr << "ERROR compareParticles: sizes differs! " << expected.size() << " vs. 
" << tested.size() << std::endl; + return 1; // Return any number > 0 to indicate an error } for (size_t i = 0; i < expected.size(); ++i) { @@ -139,6 +139,40 @@ inline int64_t compareParticles(const ParticleTypeA &expected, const ParticleTyp return cnt; } +/** + * Compares two Particle Cell Trees + * @param expected - expected levels + * @param tested - levels to verify + * @param maxError + * @param maxNumOfErrPrinted - how many error outputs should be printed + * @param maxTypeCompared - maximum type to be compared + * @return + */ +template +int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, bool printErrors = true, int maxNumOfErrPrinted = 3, uint8_t maxTypeCompared = FILLER_TYPE) { + int cntGlobal = 0; + for (size_t level = 0; level < expected.size(); level++) { + int cnt = 0; + int numOfParticles = 0; + for (size_t i = 0; i < expected[level].mesh.size(); ++i) { + if (expected[level].mesh[i] < 8 && tested[level].mesh[i] <= maxTypeCompared) { + if (std::abs(expected[level].mesh[i] - tested[level].mesh[i]) > 0 || std::isnan(expected[level].mesh[i]) || + std::isnan(tested[level].mesh[i])) { + if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { + std::cout << "Level: " << level <<" ERROR expected vs tested mesh: " << (float) expected[level].mesh[i] << " vs " + << (float) tested[level].mesh[i] << " IDX:" << tested[level].getStrIndex(i) << std::endl; + } + cnt++; + } + if (expected[level].mesh[i] > 0) numOfParticles++; + } + } + cntGlobal += cnt; + if (cnt > 0 && printErrors) std::cout << "Level: " << level << ", Number of errors / all points: " << cnt << " / " << expected[level].mesh.size() << " Particles:" << numOfParticles << std::endl; + } + return cntGlobal; +} + /** * Generates mesh with provided dims with random values in range [0, 1] * multiplier + offset * @param y From e1b63d7a6d95b1f52d7d9a3d366cfbaa064a537b Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 2 Aug 2024 16:17:35 +0200 Subject: [PATCH 43/80] Compiler warnings fixed --- src/data_structures/APR/access/RandomAccess.hpp | 4 ++-- test/APRTest.cpp | 2 +- test/LinearAccessCudaTest.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/data_structures/APR/access/RandomAccess.hpp b/src/data_structures/APR/access/RandomAccess.hpp index aa8f67bc..18366d99 100644 --- a/src/data_structures/APR/access/RandomAccess.hpp +++ b/src/data_structures/APR/access/RandomAccess.hpp @@ -1513,7 +1513,7 @@ inline void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(AP gap.global_index_begin_offset = 0; uint64_t counter = 0; - uint16_t prev_y = -2; //init + uint16_t prev_y = 65534; // Originally = -2 which is 65534 when assigned to uint16 - removing compiler error //init auto& mesh = p_map.data[i][offset_pc_data][0].mesh; @@ -1577,7 +1577,7 @@ inline void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(AP auto& mesh = p_map.data[i][offset_pc_data1][0].mesh; - uint16_t prev_y = -2; //init + uint16_t prev_y = 65534; // Originally = -2 which is 65534 when assigned to uint16 - removing compiler error //init //SPARSE iteration for (auto it=mesh.begin(); it!=mesh.end(); ++it) { diff --git a/test/APRTest.cpp b/test/APRTest.cpp index 33ea37d6..83071a7f 100644 --- a/test/APRTest.cpp +++ b/test/APRTest.cpp @@ -134,7 +134,7 @@ bool compare_two_iterators(Iterator1& it1, Iterator2& it2, int maxNumOfErrPrinte uint64_t counter_1 = 0; uint64_t counter_2 = 0; - uint64_t errors = 0; + int64_t errors = 0; for (int level = it1.level_min(); level <= 
it1.level_max(); ++level) { for (int z = 0; z < it1.z_num(level); z++) { diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp index 7d1c4059..1b7dee46 100644 --- a/test/LinearAccessCudaTest.cpp +++ b/test/LinearAccessCudaTest.cpp @@ -63,7 +63,7 @@ namespace { std::vector> copy; copy.resize(pct.size()); - for (int l = 0; l < pct.size(); ++l) { + for (size_t l = 0; l < pct.size(); ++l) { copy[l].initWithResize(pct[l].y_num, pct[l].x_num, pct[l].z_num); // Copy only existing levels if (pct[l].z_num > 0) copy[l].copyFromMesh(pct[l]); From 4c88fae902b2469940a13411d2102d728f72654f Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 6 Aug 2024 12:23:10 +0200 Subject: [PATCH 44/80] Removed debug outputs from LinearAccessCuda test. --- test/LinearAccessCudaTest.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp index 1b7dee46..84cf8730 100644 --- a/test/LinearAccessCudaTest.cpp +++ b/test/LinearAccessCudaTest.cpp @@ -292,11 +292,11 @@ TEST(LinearAccessCudaTest, optimizationForSmallLevelsVScpu) { for (int x = 1; x <= 4; ++x) { for (int y = 1; y <= 4; ++y) { for (int z = 1; z <= 4; ++z) { - std::cout << "< ============================================= " << x << " " << y << " "<< z << std::endl; +// std::cout << "< ============================================= " << x << " " << y << " "<< z << std::endl; // --- Create input data structures and objects GenInfo gi; gi.init(y, x, z); - std::cout << gi << std::endl; + auto pct = makePCT(gi, {}); // In that case values of PCT are not important (all dense particle data will be generated anyway) GenInfo giGpu; giGpu.init(y, x, z); From 169cd9dc4ce043cd536ce4d097350f8130cc5662 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 6 Aug 2024 17:28:36 +0200 Subject: [PATCH 45/80] Added two more test for full pipeline (including PS, and LinearAccess) --- test/FullPipelineCudaTest.cpp | 182 ++++++++++++++++++++++++++++++---- 1 file changed, 163 insertions(+), 19 deletions(-) diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 6528227a..eb88b850 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -5,6 +5,8 @@ #include "algorithm/LocalIntensityScale.hpp" #include "algorithm/ComputeGradient.hpp" #include "algorithm/ComputeGradientCuda.hpp" +#include "algorithm/PullingSchemeCuda.hpp" +#include "data_structures/APR/access/LinearAccessCuda.hpp" #include "TestTools.hpp" #include "data_structures/Mesh/PixelDataCuda.h" #include "algorithm/APRConverter.hpp" @@ -18,22 +20,24 @@ namespace { // Generate random mesh - keep it large enough to catch all possible computation errors using ImageType = float; - PixelData input_image = getRandInitializedMesh(100, 100, 100, 13); + constexpr PixelDataDim dim{333, 1000, 333}; + PixelData input_image = getRandInitializedMesh(dim, 13); int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); - PixelData grad_temp; // should be a down-sampled image - grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); - PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors - local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData 
local_scale_temp; + local_scale_temp.initDownsampled(dim, false); PixelData local_scale_temp2; - local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + local_scale_temp2.initDownsampled(dim, false); - PixelData grad_temp_GPU; // should be a down-sampled image - grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); - PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors - local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); - PixelData local_scale_temp2_GPU; - local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData grad_temp_GPU (grad_temp, true); + PixelData local_scale_temp_GPU(local_scale_temp, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true); // Prepare parameters APRParameters par; @@ -45,16 +49,14 @@ namespace { par.dy = 1; par.dz = 1; - // Calculate bspline on CPU - PixelData mCpuImage(input_image, true); + // Calculate pipeline on CPU timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); timer.stop_timer(); - // Calculate bspline on GPU - PixelData mGpuImage(input_image, true); + // Calculate pipeline on GPU timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); @@ -65,7 +67,149 @@ namespace { EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); } - TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_GPT) { + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_PS) { + APRTimer timer(true); + + // Generate random mesh - keep it large enough to catch all possible computation errors + using ImageType = float; + constexpr PixelDataDim dim{333, 1000, 333}; + PixelData input_image = getRandInitializedMesh(dim, 13); + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData grad_temp_GPU (grad_temp, true); + PixelData local_scale_temp_GPU(local_scale_temp, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true); + + // Prepare parameters and APR info structures + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + GenInfo aprInfo; + aprInfo.init(input_image.getDimension()); + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + 
LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet lpcs = LocalParticleCellSet(); + lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + PullingScheme ps; + ps.initialize_particle_cell_tree(aprInfo); + lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); + ps.pulling_scheme_main(); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + int levelMax = aprInfo.l_max - 1; + int levelMin = aprInfo.l_min; + std::vector> pct = PullingScheme::generateParticleCellTree(aprInfo); + computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), pct), 0); + } + + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_PS_LINEARACCESS) { + APRTimer timer(true); + + // Generate random mesh - keep it large enough to catch all possible computation errors + using ImageType = float; + constexpr PixelDataDim dim{333, 1000, 333}; + PixelData input_image = getRandInitializedMesh(dim, 13); + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData grad_temp_GPU (grad_temp, true); + PixelData local_scale_temp_GPU(local_scale_temp, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true); + + // Prepare parameters and APR info structures + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + par.neighborhood_optimization = true; + + GenInfo aprInfo; + aprInfo.init(input_image.getDimension()); + GenInfo giGpu; + giGpu.init(input_image.getDimension()); + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet lpcs = LocalParticleCellSet(); + lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + PullingScheme ps; + ps.initialize_particle_cell_tree(aprInfo); + lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); + ps.pulling_scheme_main(); + LinearAccess linearAccess; + linearAccess.genInfo = &aprInfo; + + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + computeLevelsCuda(grad_temp_GPU, 
local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + int levelMax = giGpu.l_max - 1; + int levelMin = giGpu.l_min; + std::vector> pct = PullingScheme::generateParticleCellTree(giGpu); + computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); + auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pct); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + // Test if returned structures have same data + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + + EXPECT_EQ(aprInfo.total_number_particles, giGpu.total_number_particles); + EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); + } + + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_GpuProcessingTask) { APRTimer timer(true); // Generate random mesh - keep it large enough to catch all possible computation errors @@ -98,7 +242,7 @@ namespace { par.dy = 1; par.dz = 1; - // Calculate bspline on CPU + // Calculate pipeline on CPU PixelData mCpuImage(input_image, true); timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); @@ -107,7 +251,7 @@ namespace { timer.stop_timer(); - // Calculate bspline on GPU + // Calculate pipeline on GPU PixelData mGpuImage(input_image, true); timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); From dadf92f1a813355371ee415e4a43831a3a19acf6 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 8 Aug 2024 13:00:33 +0200 Subject: [PATCH 46/80] -ffast-math must be removed - some optimizations still make GPU and CPU computations different --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 56cd98ec..9cf047e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -174,14 +174,14 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 ") if(CMAKE_COMPILER_IS_GNUCC) - set(CMAKE_CXX_FLAGS_RELEASE "-O4 -ffast-math -fno-unsafe-math-optimizations") + set(CMAKE_CXX_FLAGS_RELEASE "-O4") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -pedantic") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Bdynamic") if(NOT WIN32) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ldl -lz") endif() elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -ffast-math -fno-unsafe-math-optimizations") + set(CMAKE_CXX_FLAGS_RELEASE "-O3") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -pedantic") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lz") endif() From 27a8dc3a552f8a8e9de9d45d1ae2d59599fcefc6 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 8 Aug 2024 13:02:24 +0200 Subject: [PATCH 47/80] (nasty) fix for computeLevels in CUDA - added TODO to make it more reliable in future --- src/algorithm/ComputePullingScheme.cuh | 9 +++++++-- test/TestTools.hpp | 26 ++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/algorithm/ComputePullingScheme.cuh b/src/algorithm/ComputePullingScheme.cuh index 28450f30..51b88143 100644 --- a/src/algorithm/ComputePullingScheme.cuh +++ b/src/algorithm/ComputePullingScheme.cuh @@ -9,8 +9,13 @@ template __global__ void computeLevels(const T *grad, float *lis, size_t len, float mult_const) { size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { - //divide gradient magnitude by 
Local Intensity Scale (first step in calculating the Local Resolution Estimate L(y), minus constants) - uint32_t d = (grad[idx] / lis[idx]) * mult_const; + // divide gradient magnitude by Local Intensity Scale (first step in calculating the Local Resolution Estimate L(y), minus constants) + // TODO: This part is using a "trick" to convert first to int and then to uint32_t + // Without that some numbers on CPU and GPU are converted to different values... + // For example -6507.28 without conversion to int is converted to 0 but in CPU we got huge value. + // Anyway - both CPU & GPU sides should be checked and maybe some better way of it should be + // used - currently we've got undefined result of such operation. + uint32_t d = (int)((grad[idx] / lis[idx]) * mult_const); //incorporate other factors and compute the level of the Particle Cell, effectively construct LPC L_n lis[idx] = (d == 0) ? 0 : 31 - __clz(d); // fast log2 } diff --git a/test/TestTools.hpp b/test/TestTools.hpp index c6accd9a..53d6ff55 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -67,7 +67,7 @@ inline bool initFromZYXarray(PixelData &mesh, const T *data) { * @return number of errors detected */ template -inline int compareMeshes(const PixelData &expected, const PixelData &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { +inline int compareMeshes(const PixelData &expected, const PixelData &tested, double maxError = 0, int maxNumOfErrPrinted = 3) { if (expected.getDimension() != tested.getDimension()) { std::stringstream errMsg; errMsg << "Dimensions of expected and tested meshes differ! " << expected.getDimension() << " vs " << tested.getDimension(); @@ -86,7 +86,7 @@ inline int compareMeshes(const PixelData &expected, const PixelData &teste if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] - << " error = " << (float)expected.mesh[i] - (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; + << " error = " << (float)expected.mesh[i] - (float)tested.mesh[i] << " IDX:" << i << "=" << tested.getStrIndex(i) << std::endl; } cnt++; } @@ -213,6 +213,28 @@ inline PixelData getRandInitializedMesh(PixelDataDim dim, float multiplier = return getRandInitializedMesh(dim.y, dim.x, dim.z, multiplier, offset, useIdxNumbers); } +template +inline PixelData getMeshWithBlobInMiddle(int y, int x, int z) { + PixelData m(y, x, z, 0); + + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_real_distribution dist(0.0, 1.0); + + int count = 0; + for (int yi = (1.0/3 * y); yi < (2.0/3 * y); yi++) { + for (int xi = (1.0/3 * x); xi < (2.0/3 * x); xi++) { + for (int zi = (1.0/3 * z); zi < (2.0/3 * z); zi++) { + m(yi, xi, zi) = 30 ;//+ dist(mt) * 10; + count++; + } + } + } + std::cout << "COUNT: " << count << std::endl; + + return m; +} + struct TestBenchStats{ double inf_norm=0; From bb3b3f4e869a064f361b5535dcdbbd54c93668c5 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 9 Aug 2024 18:34:09 +0200 Subject: [PATCH 48/80] Fix for bsplineYdir for very small input images + test for full pipeline updated --- src/algorithm/bsplineYdir.cuh | 4 +- src/misc/CudaMemory.cuh | 18 +- src/misc/CudaTools.cuh | 9 +- test/FullPipelineCudaTest.cpp | 534 ++++++++++++++++++++-------------- test/TestTools.hpp | 24 +- 5 files changed, 345 insertions(+), 244 deletions(-) diff --git a/src/algorithm/bsplineYdir.cuh b/src/algorithm/bsplineYdir.cuh index 
b487cb63..e9905b64 100644 --- a/src/algorithm/bsplineYdir.cuh +++ b/src/algorithm/bsplineYdir.cuh @@ -86,7 +86,7 @@ __global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, BsplineParamsCud } int offs = i % p.k0; int work = i / p.k0; - if (work + xzIndexOfBlock < maxXZoffset) { + if (work + xzIndexOfBlock < maxXZoffset && offs < dirLen) { cache[work * p.k0 + offs] = image[workersOffset + dim.y * work + offs]; } } @@ -114,7 +114,7 @@ __global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, BsplineParamsCud } int offs = i % p.k0; int work = i / p.k0; - if (work + xzIndexOfBlock < maxXZoffset) { + if (work + xzIndexOfBlock < maxXZoffset && offs < dirLen) { cache[work * p.k0 + offs] = image[workersOffset + dim.y * work + dim.y - 1 - offs]; } } diff --git a/src/misc/CudaMemory.cuh b/src/misc/CudaMemory.cuh index e237779f..fbe125e9 100644 --- a/src/misc/CudaMemory.cuh +++ b/src/misc/CudaMemory.cuh @@ -11,14 +11,20 @@ #include -inline cudaError_t checkCuda(cudaError_t result) { -#if defined(DEBUG) || defined(_DEBUG) - if (result != cudaSuccess) { - fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); - assert(result == cudaSuccess); + +// TODO: this method is duplicated in CudaTools.cuh +// Somehow including it here break compilation - fix it please. +#define checkCuda(ans) { cudaAssert2((ans), __FILE__, __LINE__); } +inline void cudaAssert2(cudaError_t code, const char *file, int line, bool abort=true) +{ +#if defined(DEBUG) || defined(_DEBUG) || !defined(NDEBUG) + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: (%d) %s %s %d\n", code, cudaGetErrorString(code), file, line); + assert(code == cudaSuccess); // If debugging it helps to see call tree somehow + if (abort) exit(code); } #endif - return result; } inline void* getPinnedMemory(size_t aNumOfBytes) { diff --git a/src/misc/CudaTools.cuh b/src/misc/CudaTools.cuh index 155ce317..10e4cb73 100644 --- a/src/misc/CudaTools.cuh +++ b/src/misc/CudaTools.cuh @@ -18,10 +18,11 @@ #define checkCuda(ans) { cudaAssert((ans), __FILE__, __LINE__); } inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { -#if defined(DEBUG) || defined(_DEBUG) +#if defined(DEBUG) || defined(_DEBUG) || !defined(NDEBUG) if (code != cudaSuccess) { fprintf(stderr,"GPUassert: (%d) %s %s %d\n", code, cudaGetErrorString(code), file, line); + assert(code == cudaSuccess); // If debugging it helps to see call tree somehow if (abort) exit(code); } #endif @@ -38,12 +39,6 @@ inline void printCudaDims(const dim3 &threadsPerBlock, const dim3 &numBlocks) { std::cout << "Number of threads (x/y/z): " << threadsPerBlock.x << "/" << threadsPerBlock.y << "/" << threadsPerBlock.z << std::endl; } -template -inline void getDataFromKernel(PixelData &input, size_t inputSize, ImgType *cudaInput) { - cudaMemcpy(input.mesh.get(), cudaInput, inputSize, cudaMemcpyDeviceToHost); - cudaFree(cudaInput); -} - class CudaTimer { std::vector iStartTimes; std::vector names; diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index eb88b850..41e865d3 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -15,254 +15,338 @@ namespace { #ifdef APR_USE_CUDA + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS) { + APRTimer timer(true); + + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors + using ImageType = float; + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{163, 123, 555}; + for (int d = 0; d <= 3; d++) { + 
auto &dim = (d % 2 == 0) ? dim1 : dim2; + PixelData input_image = (d/2 == 0) ? getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true, true); + PixelData grad_temp_GPU(grad_temp, true, true); + PixelData local_scale_temp_GPU(local_scale_temp, true, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true, true); + + // Prepare parameters + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0), 0); + } + } + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS) { APRTimer timer(true); - // Generate random mesh - keep it large enough to catch all possible computation errors + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors using ImageType = float; - constexpr PixelDataDim dim{333, 1000, 333}; - PixelData input_image = getRandInitializedMesh(dim, 13); - int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); - - // Initialize CPU data structures - PixelData mCpuImage(input_image, true); - PixelData grad_temp; - grad_temp.initDownsampled(dim, 0, false); - PixelData local_scale_temp; - local_scale_temp.initDownsampled(dim, false); - PixelData local_scale_temp2; - local_scale_temp2.initDownsampled(dim, false); - - // Initialize GPU data structures to same values as CPU - PixelData mGpuImage(input_image, true); - PixelData grad_temp_GPU (grad_temp, true); - PixelData local_scale_temp_GPU(local_scale_temp, true); - PixelData local_scale_temp2_GPU(local_scale_temp2, true); - - // Prepare parameters - APRParameters par; - par.lambda = 3; - par.Ip_th = 10; - par.sigma_th = 0; - par.sigma_th_max = 0; - par.dx = 1; - par.dy = 1; - par.dz = 1; - - // Calculate pipeline on CPU - timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); - ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); - LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); - LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - timer.stop_timer(); - - // Calculate pipeline on GPU - timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); - 
getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); - computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - timer.stop_timer(); - - // Compare GPU vs CPU - expect exactly same result - EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{163, 123, 555}; + for (int d = 0; d <= 3; d++) { + auto &dim = (d%2 == 0) ? dim1 : dim2; + PixelData input_image = (d/2 == 0) ? getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true, false); + PixelData grad_temp_GPU(grad_temp, true, false); + PixelData local_scale_temp_GPU(local_scale_temp, true, false); + PixelData local_scale_temp2_GPU(local_scale_temp2, true, false); + + // Prepare parameters + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0), 0); + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + } } TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_PS) { APRTimer timer(true); - // Generate random mesh - keep it large enough to catch all possible computation errors + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors using ImageType = float; - constexpr PixelDataDim dim{333, 1000, 333}; - PixelData input_image = getRandInitializedMesh(dim, 13); - int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); - - // Initialize CPU data structures - PixelData mCpuImage(input_image, true); - PixelData grad_temp; - grad_temp.initDownsampled(dim, 0, false); - PixelData local_scale_temp; - local_scale_temp.initDownsampled(dim, false); - PixelData local_scale_temp2; - local_scale_temp2.initDownsampled(dim, false); - - // Initialize GPU data structures to same values as CPU - PixelData mGpuImage(input_image, true); - PixelData grad_temp_GPU (grad_temp, true); - PixelData local_scale_temp_GPU(local_scale_temp, true); - PixelData local_scale_temp2_GPU(local_scale_temp2, true); - - // Prepare parameters and APR info structures - APRParameters par; - 
par.lambda = 3; - par.Ip_th = 10; - par.sigma_th = 0; - par.sigma_th_max = 0; - par.dx = 1; - par.dy = 1; - par.dz = 1; - - GenInfo aprInfo; - aprInfo.init(input_image.getDimension()); - - // Calculate pipeline on CPU - timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); - ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); - LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); - LocalParticleCellSet lpcs = LocalParticleCellSet(); - lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - PullingScheme ps; - ps.initialize_particle_cell_tree(aprInfo); - lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); - ps.pulling_scheme_main(); - timer.stop_timer(); - - // Calculate pipeline on GPU - timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); - getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); - computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - int levelMax = aprInfo.l_max - 1; - int levelMin = aprInfo.l_min; - std::vector> pct = PullingScheme::generateParticleCellTree(aprInfo); - computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); - timer.stop_timer(); - - // Compare GPU vs CPU - expect exactly same result - ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), pct), 0); + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{163, 123, 555}; + for (int d = 0; d <= 3; d++) { + auto &dim = (d % 2 == 0) ? dim1 : dim2; + PixelData input_image = (d / 2 == 0) ? getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData grad_temp_GPU(grad_temp, true); + PixelData local_scale_temp_GPU(local_scale_temp, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true); + + // Prepare parameters and APR info structures + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + GenInfo aprInfo; + aprInfo.init(input_image.getDimension()); + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet lpcs = LocalParticleCellSet(); + lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + PullingScheme ps; + ps.initialize_particle_cell_tree(aprInfo); + lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); + ps.pulling_scheme_main(); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); 
+ computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + int levelMax = aprInfo.l_max - 1; + int levelMin = aprInfo.l_min; + std::vector> pct = PullingScheme::generateParticleCellTree(aprInfo); + computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), pct), 0); + } } TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_PS_LINEARACCESS) { APRTimer timer(true); - // Generate random mesh - keep it large enough to catch all possible computation errors + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors using ImageType = float; - constexpr PixelDataDim dim{333, 1000, 333}; - PixelData input_image = getRandInitializedMesh(dim, 13); - int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); - - // Initialize CPU data structures - PixelData mCpuImage(input_image, true); - PixelData grad_temp; - grad_temp.initDownsampled(dim, 0, false); - PixelData local_scale_temp; - local_scale_temp.initDownsampled(dim, false); - PixelData local_scale_temp2; - local_scale_temp2.initDownsampled(dim, false); - - // Initialize GPU data structures to same values as CPU - PixelData mGpuImage(input_image, true); - PixelData grad_temp_GPU (grad_temp, true); - PixelData local_scale_temp_GPU(local_scale_temp, true); - PixelData local_scale_temp2_GPU(local_scale_temp2, true); - - // Prepare parameters and APR info structures - APRParameters par; - par.lambda = 3; - par.Ip_th = 10; - par.sigma_th = 0; - par.sigma_th_max = 0; - par.dx = 1; - par.dy = 1; - par.dz = 1; - par.neighborhood_optimization = true; - - GenInfo aprInfo; - aprInfo.init(input_image.getDimension()); - GenInfo giGpu; - giGpu.init(input_image.getDimension()); - - // Calculate pipeline on CPU - timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); - ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); - LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); - LocalParticleCellSet lpcs = LocalParticleCellSet(); - lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - PullingScheme ps; - ps.initialize_particle_cell_tree(aprInfo); - lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); - ps.pulling_scheme_main(); - LinearAccess linearAccess; - linearAccess.genInfo = &aprInfo; - - linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); - timer.stop_timer(); - - // Calculate pipeline on GPU - timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); - getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); - computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - int levelMax = giGpu.l_max - 1; - int levelMin = giGpu.l_min; - std::vector> pct = PullingScheme::generateParticleCellTree(giGpu); - computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); - auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pct); - timer.stop_timer(); - - // Compare GPU vs CPU - expect exactly same result - // Test if returned structures have same data - EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); - EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, 
linearAccess.level_xz_vec), 0); - EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); - - EXPECT_EQ(aprInfo.total_number_particles, giGpu.total_number_particles); - EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{163, 123, 555}; + for (int d = 0; d <= 3; d++) { + auto &dim = (d % 2 == 0) ? dim1 : dim2; + PixelData input_image = (d / 2 == 0) ? getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData grad_temp_GPU(grad_temp, true); + PixelData local_scale_temp_GPU(local_scale_temp, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true); + + // Prepare parameters and APR info structures + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + par.neighborhood_optimization = true; + + GenInfo aprInfo; + aprInfo.init(input_image.getDimension()); + GenInfo giGpu; + giGpu.init(input_image.getDimension()); + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet lpcs = LocalParticleCellSet(); + lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + PullingScheme ps; + ps.initialize_particle_cell_tree(aprInfo); + lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); + ps.pulling_scheme_main(); + LinearAccess linearAccess; + linearAccess.genInfo = &aprInfo; + + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + int levelMax = giGpu.l_max - 1; + int levelMin = giGpu.l_min; + std::vector> pct = PullingScheme::generateParticleCellTree(giGpu); + computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); + auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pct); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + // Test if returned structures have same data + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + + EXPECT_EQ(aprInfo.total_number_particles, giGpu.total_number_particles); + EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); + } } TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_GpuProcessingTask) { APRTimer timer(true); - // Generate 
random mesh - keep it large enough to catch all possible computation errors + // TODO: This tets fails if dim of input image is smaller than ~8 (not sure in which direction yet) + // It fails for {4,4,3} for sure and surprisingly only for mesh with blob inside... + // Investigate why it fails while it works nicely in tests above (difference must be somewhere in GpuProcessingTask). + + + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors using ImageType = float; - constexpr PixelDataDim dim{333, 1000, 333}; - PixelData input_image = getRandInitializedMesh(dim, 99, 0, false); - int maxLevel = ceil(std::log2(dim.maxDimSize())); - - PixelData grad_temp; // should be a down-sampled image - grad_temp.initDownsampled(dim, 0, false); - PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors - local_scale_temp.initDownsampled(dim,false); - PixelData local_scale_temp2; - local_scale_temp2.initDownsampled(dim, false); - - PixelData grad_temp_GPU; // should be a down-sampled image - grad_temp_GPU.initDownsampled(dim, 0, false); - PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors - local_scale_temp_GPU.initDownsampled(dim, false); - PixelData local_scale_temp2_GPU; - local_scale_temp2_GPU.initDownsampled(dim, false); - - // Prepare parameters - APRParameters par; - par.lambda = 3; - par.Ip_th = 10; - par.sigma_th = 0; - par.sigma_th_max = 0; - par.dx = 1; - par.dy = 1; - par.dz = 1; - - // Calculate pipeline on CPU - PixelData mCpuImage(input_image, true); - timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); - ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); - LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); - LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - timer.stop_timer(); - - - // Calculate pipeline on GPU - PixelData mGpuImage(input_image, true); - timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - - { - GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); - gpt.doAll(); - } - timer.stop_timer(); + constexpr PixelDataDim dim1{3, 8, 8}; + constexpr PixelDataDim dim2{4, 4 ,3}; + for (int d = 0; d <= 3; d++) { + auto &dim = (d % 2 == 0) ? dim1 : dim2; + PixelData input_image = (d / 2 == 0) ? 
getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + int maxLevel = ceil(std::log2(dim.maxDimSize())); + + std::cout << "--------------------------> " << dim << " " << (bool)(d/2 == 0) << std::endl; + + PixelData grad_temp; // should be a down-sampled image + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Prepare parameters + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate pipeline on CPU + PixelData mCpuImage(input_image, true); + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); +// LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + timer.stop_timer(); + + + // Calculate pipeline on GPU + PixelData mGpuImage(input_image, true); + PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp_GPU.initDownsampled(dim, false); + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + + { + GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); + gpt.doAll(); + } + timer.stop_timer(); + if (dim.y < 5 ) { + local_scale_temp.printMesh(3, 2); + local_scale_temp_GPU.printMesh(3, 2); + } + // Compare GPU vs CPU - expect exactly same result + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); - // Compare GPU vs CPU - expect exactly same result - EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + } } #endif // APR_USE_CUDA } diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 53d6ff55..158bf2ea 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -213,6 +213,14 @@ inline PixelData getRandInitializedMesh(PixelDataDim dim, float multiplier = return getRandInitializedMesh(dim.y, dim.x, dim.z, multiplier, offset, useIdxNumbers); } +/** + * Generate mesh with square blob in the center of it with values randomly chosen from [20,40] range. Zero values outside. + * @tparam T + * @param y + * @param x + * @param z + * @return + */ template inline PixelData getMeshWithBlobInMiddle(int y, int x, int z) { PixelData m(y, x, z, 0); @@ -221,20 +229,28 @@ inline PixelData getMeshWithBlobInMiddle(int y, int x, int z) { std::mt19937 mt(rd()); std::uniform_real_distribution dist(0.0, 1.0); - int count = 0; for (int yi = (1.0/3 * y); yi < (2.0/3 * y); yi++) { for (int xi = (1.0/3 * x); xi < (2.0/3 * x); xi++) { for (int zi = (1.0/3 * z); zi < (2.0/3 * z); zi++) { - m(yi, xi, zi) = 30 ;//+ dist(mt) * 10; - count++; + m(yi, xi, zi) = 30 + dist(mt) * 10; } } } - std::cout << "COUNT: " << count << std::endl; return m; } +/** + * Generate mesh with square blob in the center of it with values randomly chosen from [20,40] range. Zero values outside. 
+ * @tparam T + * @param dim + * @return + */ +template +inline PixelData getMeshWithBlobInMiddle(const PixelDataDim &dim) { + return getMeshWithBlobInMiddle(dim.y, dim.x, dim.z); +} + struct TestBenchStats{ double inf_norm=0; From a8c4d77c33c925f518d4f441ab4ae0858e090e1d Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 14 Aug 2024 15:30:18 +0200 Subject: [PATCH 49/80] Fixed Local Intensity Scale (LIS) for super small inputs --- src/algorithm/LocalIntensityScale.cu | 6 ++--- test/FullPipelineCudaTest.cpp | 33 ++++++++++++------------- test/LocalIntensityScaleCudaTest.cpp | 37 ++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 20 deletions(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 2b5c186d..1593b5ab 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -480,9 +480,9 @@ __global__ void constantScale(S *image, size_t len) { } template -void runConstantScale(S *image, PixelDataDim &dim) { +void runConstantScale(S *image, PixelDataDim &dim, cudaStream_t aStream) { // Check kernel description for further info! - constantScale<<<1, 1>>>(image, dim.size()); + constantScale<<<1, 1, 0, aStream>>>(image, dim.size()); } template @@ -551,7 +551,7 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete } } else { - runConstantScale(cudaImage, imageSize); + runConstantScale(cudaImage, imageSize, aStream); } } diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 41e865d3..a92abc08 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -26,7 +26,6 @@ namespace { auto &dim = (d % 2 == 0) ? dim1 : dim2; PixelData input_image = (d/2 == 0) ? getRandInitializedMesh(dim, 13) : getMeshWithBlobInMiddle(dim); - int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); // Initialize CPU data structures PixelData mCpuImage(input_image, true); @@ -292,23 +291,29 @@ namespace { // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors using ImageType = float; - constexpr PixelDataDim dim1{3, 8, 8}; - constexpr PixelDataDim dim2{4, 4 ,3}; + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{163, 123, 555}; for (int d = 0; d <= 3; d++) { auto &dim = (d % 2 == 0) ? dim1 : dim2; PixelData input_image = (d / 2 == 0) ? 
getRandInitializedMesh(dim, 13) : getMeshWithBlobInMiddle(dim); int maxLevel = ceil(std::log2(dim.maxDimSize())); - std::cout << "--------------------------> " << dim << " " << (bool)(d/2 == 0) << std::endl; - - PixelData grad_temp; // should be a down-sampled image + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; grad_temp.initDownsampled(dim, 0, false); - PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + PixelData local_scale_temp; local_scale_temp.initDownsampled(dim, false); PixelData local_scale_temp2; local_scale_temp2.initDownsampled(dim, false); + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData grad_temp_GPU(grad_temp, true); + PixelData local_scale_temp_GPU(local_scale_temp, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true); + // Prepare parameters APRParameters par; par.lambda = 3; @@ -318,31 +323,25 @@ namespace { par.dx = 1; par.dy = 1; par.dz = 1; + par.neighborhood_optimization = true; // Calculate pipeline on CPU - PixelData mCpuImage(input_image, true); timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); -// LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + LocalParticleCellSet lpcs = LocalParticleCellSet(); + lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); timer.stop_timer(); // Calculate pipeline on GPU - PixelData mGpuImage(input_image, true); - PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors - local_scale_temp_GPU.initDownsampled(dim, false); timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - { GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); gpt.doAll(); } timer.stop_timer(); - if (dim.y < 5 ) { - local_scale_temp.printMesh(3, 2); - local_scale_temp_GPU.printMesh(3, 2); - } + // Compare GPU vs CPU - expect exactly same result EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index ce6ff111..39f8ff22 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -558,6 +558,43 @@ namespace { } } + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE_SUPER_SMALL) { + // In case of very small input image like 2x2x2 constant scale is being used + APRTimer timer(false); + + for (int boundary = 0; boundary <= 1; ++boundary) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = (boundary > 0); + bool useRandomNumbers = (r > 0); + + PixelData m = getRandInitializedMesh(2,2,2, 25, 10, !useRandomNumbers); + + APRParameters params; + params.sigma_th = 1; + params.sigma_th_max = 2; + params.reflect_bc_lis = hasBoundary; + + // Run on CPU + PixelData mCpu(m, true); + PixelData mCpuTemp(m, false); + timer.start_timer("CPU LIS FULL"); + LocalIntensityScale().get_local_intensity_scale(mCpu, mCpuTemp, params); + mCpu.printMesh(3,2); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + PixelData mGpuTemp(m, false); + timer.start_timer("GPU LIS FULL"); + 
getLocalIntensityScale(mGpu, mGpuTemp, params); + timer.stop_timer(); + + // Compare results - only mGPU mattters since mGpuTemp in case of constant scale is not modified + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE_CONSTANT_SCALE) { APRTimer timer(false); From e6e43274859466c9b574026f9dd2d49914f5170a Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 19 Aug 2024 17:09:16 +0200 Subject: [PATCH 50/80] ParticleCellTreeCuda is now main stuff for CUDA --- src/algorithm/OVPC.cu | 125 ++++++++++++---------------- src/algorithm/PullingSchemeCuda.hpp | 7 +- test/FullPipelineCudaTest.cpp | 14 ++-- test/PullingSchemeCudaTest.cpp | 30 ++----- 4 files changed, 67 insertions(+), 109 deletions(-) diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index 070c4d81..c68fd63f 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -107,79 +107,53 @@ void runSecondStep(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, siz secondStep<<>>(data, child, xLen, yLen, zLen, xLenc, yLenc, zLenc, isLevelMax); }; -// explicit instantiation of handled types -template void computeOVPC(const PixelData&, PixelData&, int, int); - -template -void computeOVPC(const PixelData &input, PixelData &output, int levelMin, int levelMax) { - - // TODO: Depending on implementation of computing particles (next step after OVPC) some port of this method - // might be useful. Leaving it here rigtht now just in case. If not needed in next steps DELETE IT. - - ScopedCudaMemHandler, H2D> in(input); - ScopedCudaMemHandler, D2H> mem(output); - - - CudaTimer t(true, "OVPCCUDA"); - - t.start_timer("wait"); - waitForCuda(); - t.stop_timer(); - - t.start_timer("ALL"); - // TODO: This is not needed later - just for having clear debug - //cudaMemset(mem.get(), 0, mem.getNumOfBytes()); - - // =============== Create pyramid - std::vector levels(levelMax + 1, nullptr); - std::vector xSize(levelMax + 1); - std::vector ySize(levelMax + 1); - std::vector zSize(levelMax + 1); - - int xDS = input.x_num; - int yDS = input.y_num; - int zDS = input.z_num; - - size_t offset = 0; - for (int l = levelMax; l >= levelMin; --l) { - levels[l] = reinterpret_cast(mem.get()) + offset; - xSize[l] = xDS; - ySize[l] = yDS; - zSize[l] = zDS; - - offset += xDS * yDS * zDS * sizeof(TreeElementType); - // round up to 16-bytes - const size_t alignemet = 16; - offset = ((offset + alignemet - 1) / alignemet ) * alignemet; +class ParticleCellTreeCuda { + ScopedCudaMemHandler mem; + std::vector startOffsets; + GenInfo gi; + size_t numOfElements = 0; + cudaStream_t stream = nullptr; + +public: + + ParticleCellTreeCuda(const GenInfo &aprInfo, const cudaStream_t aStream) : gi(aprInfo), stream(aStream) { + // Calculate size of needed memory for PCT and offsets for particular levels + int l_max = aprInfo.l_max - 1; + int l_min = aprInfo.l_min; + + startOffsets.resize(l_max + 1, 0); + + for (int l = l_min; l <= l_max; ++l) { + auto yLen = ceil(aprInfo.org_dims[0] / PullingScheme::powr(2.0, l_max - l + 1)); + auto xLen = ceil(aprInfo.org_dims[1] / PullingScheme::powr(2.0, l_max - l + 1)); + auto zLen = ceil(aprInfo.org_dims[2] / PullingScheme::powr(2.0, l_max - l + 1)); + size_t levelSize = yLen * xLen * zLen; + startOffsets[l] = numOfElements; + numOfElements += levelSize; + } - xDS = ceil(xDS/2.0); - yDS = ceil(yDS/2.0); - zDS = ceil(zDS/2.0); + // Initialize memory, it is not binded to any CPU memory so we provide nullptr + mem.initialize(nullptr, numOfElements, stream); + 
cudaMemsetAsync(mem.get(), EMPTY, numOfElements, stream); } + inline uint8_t* operator[](size_t level) { return mem.get() + startOffsets[level]; } - runCopyAndClampLevels(in.get(), levels[levelMax], in.getSize(), levelMin, levelMax, 0); + auto getPCTcpu() { + std::vector> pct = PullingScheme::generateParticleCellTree(gi); + for (int i = gi.l_min; i < gi.l_max; ++i) { + checkCuda(cudaMemcpyAsync(pct[i].mesh.get(), (*this)[i], pct[i].mesh.size(), cudaMemcpyDeviceToHost, stream)); + } + checkCuda(cudaStreamSynchronize(stream)); - for (int l = levelMax - 1; l >= levelMin; --l) { - runDownsampleMax(levels[l + 1], levels[l], xSize[l + 1], ySize[l + 1], zSize[l + 1], 0); + return pct; } +}; - // ================== Phase 1 - top to down - for (int l = levelMin; l <= levelMax; ++l) { - runFirstStep(levels[l], xSize[l], ySize[l], zSize[l], l, 0); - } - // ================== Phase 1 - down to top - for (int l = levelMax - 1; l >= levelMin; --l) { - runSecondStep(levels[l], levels[l+1], xSize[l], ySize[l], zSize[l], xSize[l+1], ySize[l+1], zSize[l+1], l == levelMin, 0); - } - waitForCuda(); - t.stop_timer(); -} - // explicit instantiation of handled types -template void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax); -template void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax); +template std::vector> computeOvpcCuda(const PixelData&, const GenInfo&); +template std::vector> computeOvpcCuda(const PixelData&, const GenInfo&); /** * CUDA implementation of Pullin Scheme (OVPC - Optimal Valid Particle Cell set). @@ -191,30 +165,33 @@ template void computeOvpcCuda(const PixelData &input, std::vector -void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax) { +template +std::vector> computeOvpcCuda(const PixelData &input, const GenInfo &gi) { // Copy input to CUDA mem and prepare CUDA representation of particle cell tree which will be filled after computing // all steps + + ParticleCellTreeCuda pct(gi, 0 /*stream*/); + int levelMin = gi.l_min; + int levelMax = gi.l_max - 1; + ScopedCudaMemHandler, H2D> in(input); - std::vector, D2H>> w; - for (int l = 0; l <= levelMax; ++l) { - w.push_back(std::move(ScopedCudaMemHandler, D2H>(pct[l]))); - } // feel the highes level of PCT with provided levels and clamp values to be within [levelMin, levelMax] range - runCopyAndClampLevels(in.get(), w[levelMax].get(), in.getSize(), levelMin, levelMax, 0); + runCopyAndClampLevels(in.get(), pct[levelMax], in.getSize(), levelMin, levelMax, 0); // Downsample with max reduction to levelMin to fill the rest of the tree for (int l = levelMax - 1; l >= levelMin; --l) { - runDownsampleMax(w[l + 1].get(), w[l].get(), pct[l + 1].x_num, pct[l + 1].y_num, pct[l + 1].z_num, 0); + runDownsampleMax(pct[l + 1], pct[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], 0); } // ================== Phase 1 - top to down for (int l = levelMin; l <= levelMax; ++l) { - runFirstStep(w[l].get(), pct[l].x_num, pct[l].y_num, pct[l].z_num, l, 0); + runFirstStep(pct[l], gi.x_num[l], gi.y_num[l], gi.z_num[l], l, 0); } // ================== Phase 1 - down to top for (int l = levelMax - 1; l >= levelMin; --l) { - runSecondStep(w[l].get(), w[l+1].get(), pct[l].x_num, pct[l].y_num, pct[l].z_num, pct[l + 1].x_num, pct[l + 1].y_num, pct[l + 1].z_num, l == levelMin, 0); + runSecondStep(pct[l], pct[l+1], gi.x_num[l], gi.y_num[l], gi.z_num[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], l == levelMin, 0); } + + return pct.getPCTcpu(); } diff 
--git a/src/algorithm/PullingSchemeCuda.hpp b/src/algorithm/PullingSchemeCuda.hpp index f98c0883..236c260f 100644 --- a/src/algorithm/PullingSchemeCuda.hpp +++ b/src/algorithm/PullingSchemeCuda.hpp @@ -7,12 +7,11 @@ #include "data_structures/Mesh/PixelData.hpp" +#include "data_structures/APR/GenInfo.hpp" using TreeElementType = uint8_t; -template -void computeOVPC(const PixelData &input, PixelData &output, int levelMin, int levelMax); -template -void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax); +template +std::vector> computeOvpcCuda(const PixelData &input, const GenInfo &gi); #endif //LIBAPR_PULLINGSCHEMECUDA_HPP diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index a92abc08..95c2b07c 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -10,6 +10,7 @@ #include "TestTools.hpp" #include "data_structures/Mesh/PixelDataCuda.h" #include "algorithm/APRConverter.hpp" +#include "misc/CudaTools.cuh" namespace { @@ -186,10 +187,7 @@ namespace { getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - int levelMax = aprInfo.l_max - 1; - int levelMin = aprInfo.l_min; - std::vector> pct = PullingScheme::generateParticleCellTree(aprInfo); - computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); + auto pct = computeOvpcCuda(local_scale_temp_GPU, aprInfo); timer.stop_timer(); // Compare GPU vs CPU - expect exactly same result @@ -197,6 +195,9 @@ namespace { } } + + + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_PS_LINEARACCESS) { APRTimer timer(true); @@ -263,10 +264,7 @@ namespace { getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - int levelMax = giGpu.l_max - 1; - int levelMin = giGpu.l_min; - std::vector> pct = PullingScheme::generateParticleCellTree(giGpu); - computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); + auto pct = computeOvpcCuda(local_scale_temp_GPU, giGpu); auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pct); timer.stop_timer(); diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index 5c4ebcb3..bd24156e 100644 --- a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -95,7 +95,7 @@ void fillPS(PullingScheme &aPS, PixelData &levels) { TEST(PullingSchemeTest, PSvsOVPCCUDA) { // Generates random levels in a 3D cube and then compares generated output levels in PS and OVPC GenInfo gi; - gi.init(255, 257, 199); + gi.init(255, 157, 257); // Generate random levels for PS and OVPC PixelData levels(std::ceil(gi.org_dims[0]/2.0), @@ -113,7 +113,7 @@ TEST(PullingSchemeTest, PSvsOVPCCUDA) { PixelData levelsPS(levels, true); // Initialize all needed objects - APRTimer t(false); + APRTimer t(true); t.start_timer("PS - init"); PullingScheme ps; @@ -125,19 +125,15 @@ TEST(PullingSchemeTest, PSvsOVPCCUDA) { t.stop_timer(); // Run test methods and compare results - t.start_timer("OVPCCUDA - init"); - int levelMax = gi.l_max - 1; - int levelMin = gi.l_min; - std::vector> pct = PullingScheme::generateParticleCellTree(gi); - t.stop_timer(); t.start_timer("OVPCCUDA - compute"); - 
computeOvpcCuda(levelsOVPC, pct, levelMin, levelMax); + auto pct = computeOvpcCuda(levelsOVPC, gi); t.stop_timer(); // -------------- Verify result ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), pct), 0); } + TEST(PullingSchemeTest, OVPCCUDA_Ydir) { // Prepare input data for PS float values[] = {9,0,0,0, 0,0,0,0}; @@ -157,12 +153,8 @@ TEST(PullingSchemeTest, OVPCCUDA_Ydir) { // Initialize all needed objects APRTimer t(false); - t.start_timer("OVPCCUDA - initialize"); - std::vector> pct = PullingScheme::generateParticleCellTree(gi); - t.stop_timer(); - t.start_timer("OVPCCUDA - compute"); - computeOvpcCuda(levels, pct, levelMin, levelMax); + auto pct = computeOvpcCuda(levels, gi); t.stop_timer(); // List of expected types @@ -199,12 +191,8 @@ TEST(PullingSchemeTest, OVPCCUDA_Xdir) { // Initialize all needed objects APRTimer t(false); - t.start_timer("OVPCCUDA - initialize"); - std::vector> pct = PullingScheme::generateParticleCellTree(gi); - t.stop_timer(); - t.start_timer("OVPCCUDA - compute"); - computeOvpcCuda(levels, pct, levelMin, levelMax); + auto pct = computeOvpcCuda(levels, gi); t.stop_timer(); // List of expected types @@ -241,12 +229,8 @@ TEST(PullingSchemeTest, OVPCCUDA_Zdir) { // Initialize all needed objects APRTimer t(false); - t.start_timer("OVPCCUDA - initialize"); - std::vector> pct = PullingScheme::generateParticleCellTree(gi); - t.stop_timer(); - t.start_timer("OVPCCUDA - compute"); - computeOvpcCuda(levels, pct, levelMin, levelMax); + auto pct = computeOvpcCuda(levels, gi); t.stop_timer(); // List of expected types From 00aac97431bb8feef27f0e0c9eaebdcf1850c86c Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 20 Aug 2024 09:30:46 +0200 Subject: [PATCH 51/80] computeOvpcCuda now using 'stream' instead of hardcoded values --- src/algorithm/OVPC.cu | 25 +++++++++++++------------ src/algorithm/PullingSchemeCuda.hpp | 2 ++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index c68fd63f..a840b7fe 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -158,39 +158,40 @@ template std::vector> computeOvpcCuda(const PixelData&, /** * CUDA implementation of Pullin Scheme (OVPC - Optimal Valid Particle Cell set). 
* @tparam T - type of input levels - * @tparam S - type of output Particle Cell Tree * @param input - input levels computed in earlier stages - * @param pct - Particle Cell Tree - as input is used for dimensions of each level, will be filled with computed - * Pulling Scheme as a output - * @param levelMin - min level of APR - * @param levelMax - max level of APR + * @param gi - GenInfo for given APR + * + * @return - PCT for CPU (copied from GPU) */ template std::vector> computeOvpcCuda(const PixelData &input, const GenInfo &gi) { // Copy input to CUDA mem and prepare CUDA representation of particle cell tree which will be filled after computing // all steps - ParticleCellTreeCuda pct(gi, 0 /*stream*/); + cudaStream_t stream = nullptr; + + ScopedCudaMemHandler, H2D> in(input, stream); + + ParticleCellTreeCuda pct(gi, stream); int levelMin = gi.l_min; int levelMax = gi.l_max - 1; - ScopedCudaMemHandler, H2D> in(input); // feel the highes level of PCT with provided levels and clamp values to be within [levelMin, levelMax] range - runCopyAndClampLevels(in.get(), pct[levelMax], in.getSize(), levelMin, levelMax, 0); + runCopyAndClampLevels(in.get(), pct[levelMax], in.getSize(), levelMin, levelMax, stream); - // Downsample with max reduction to levelMin to fill the rest of the tree + // Downsample with max reduction to levelMin to fill rest of the tree for (int l = levelMax - 1; l >= levelMin; --l) { - runDownsampleMax(pct[l + 1], pct[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], 0); + runDownsampleMax(pct[l + 1], pct[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], stream); } // ================== Phase 1 - top to down for (int l = levelMin; l <= levelMax; ++l) { - runFirstStep(pct[l], gi.x_num[l], gi.y_num[l], gi.z_num[l], l, 0); + runFirstStep(pct[l], gi.x_num[l], gi.y_num[l], gi.z_num[l], l, stream); } // ================== Phase 1 - down to top for (int l = levelMax - 1; l >= levelMin; --l) { - runSecondStep(pct[l], pct[l+1], gi.x_num[l], gi.y_num[l], gi.z_num[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], l == levelMin, 0); + runSecondStep(pct[l], pct[l+1], gi.x_num[l], gi.y_num[l], gi.z_num[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], l == levelMin, stream); } return pct.getPCTcpu(); diff --git a/src/algorithm/PullingSchemeCuda.hpp b/src/algorithm/PullingSchemeCuda.hpp index 236c260f..953903db 100644 --- a/src/algorithm/PullingSchemeCuda.hpp +++ b/src/algorithm/PullingSchemeCuda.hpp @@ -9,9 +9,11 @@ #include "data_structures/Mesh/PixelData.hpp" #include "data_structures/APR/GenInfo.hpp" + using TreeElementType = uint8_t; template std::vector> computeOvpcCuda(const PixelData &input, const GenInfo &gi); + #endif //LIBAPR_PULLINGSCHEMECUDA_HPP From 1fba1bcdb14044fd517b3f3104ef0041831179c5 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 20 Aug 2024 09:53:09 +0200 Subject: [PATCH 52/80] ParticleCellTreeCuda moved and handle now cpu2gpu transfer --- src/algorithm/OVPC.cu | 44 +--------------- src/algorithm/ParticleCellTreeCuda.cuh | 70 ++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 43 deletions(-) create mode 100644 src/algorithm/ParticleCellTreeCuda.cuh diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index a840b7fe..9c6e0bd6 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -5,6 +5,7 @@ #include "misc/CudaTools.cuh" #include "data_structures/Mesh/downsample.cuh" #include "algorithm/OVPC.h" +#include "algorithm/ParticleCellTreeCuda.cuh" template @@ -107,49 +108,6 @@ void runSecondStep(T *data, T *child, 
size_t xLen, size_t yLen, size_t zLen, siz secondStep<<>>(data, child, xLen, yLen, zLen, xLenc, yLenc, zLenc, isLevelMax); }; -class ParticleCellTreeCuda { - ScopedCudaMemHandler mem; - std::vector startOffsets; - GenInfo gi; - size_t numOfElements = 0; - cudaStream_t stream = nullptr; - -public: - - ParticleCellTreeCuda(const GenInfo &aprInfo, const cudaStream_t aStream) : gi(aprInfo), stream(aStream) { - // Calculate size of needed memory for PCT and offsets for particular levels - int l_max = aprInfo.l_max - 1; - int l_min = aprInfo.l_min; - - startOffsets.resize(l_max + 1, 0); - - for (int l = l_min; l <= l_max; ++l) { - auto yLen = ceil(aprInfo.org_dims[0] / PullingScheme::powr(2.0, l_max - l + 1)); - auto xLen = ceil(aprInfo.org_dims[1] / PullingScheme::powr(2.0, l_max - l + 1)); - auto zLen = ceil(aprInfo.org_dims[2] / PullingScheme::powr(2.0, l_max - l + 1)); - size_t levelSize = yLen * xLen * zLen; - startOffsets[l] = numOfElements; - numOfElements += levelSize; - } - - // Initialize memory, it is not binded to any CPU memory so we provide nullptr - mem.initialize(nullptr, numOfElements, stream); - cudaMemsetAsync(mem.get(), EMPTY, numOfElements, stream); - } - - inline uint8_t* operator[](size_t level) { return mem.get() + startOffsets[level]; } - - auto getPCTcpu() { - std::vector> pct = PullingScheme::generateParticleCellTree(gi); - for (int i = gi.l_min; i < gi.l_max; ++i) { - checkCuda(cudaMemcpyAsync(pct[i].mesh.get(), (*this)[i], pct[i].mesh.size(), cudaMemcpyDeviceToHost, stream)); - } - checkCuda(cudaStreamSynchronize(stream)); - - return pct; - } -}; - // explicit instantiation of handled types template std::vector> computeOvpcCuda(const PixelData&, const GenInfo&); diff --git a/src/algorithm/ParticleCellTreeCuda.cuh b/src/algorithm/ParticleCellTreeCuda.cuh new file mode 100644 index 00000000..9fe38273 --- /dev/null +++ b/src/algorithm/ParticleCellTreeCuda.cuh @@ -0,0 +1,70 @@ +#ifndef PARTICLE_CELL_TREE_CUDA_CUH +#define PARTICLE_CELL_TREE_CUDA_CUH + + +#include "data_structures/APR/GenInfo.hpp" +#include "algorithm/PullingScheme.hpp" + + +/* + * CUDA representation of PCT (Particle Cell Tree) + * Allocates memory and initialize it to EMPTY + * + * Allows acces to each level via subscription operator: + * ParticleCellTreeCuda pct(aprInfo); + * pct[level] + * + * getPCTcpu and uploadPCT2GPU handle interaction with CPU code (mainly for test/debug purposes). 
+ */ +class ParticleCellTreeCuda { + ScopedCudaMemHandler mem; + std::vector startOffsets; + GenInfo gi; + size_t numOfElements = 0; + cudaStream_t stream = nullptr; + +public: + + ParticleCellTreeCuda(const GenInfo &aprInfo, const cudaStream_t aStream) : gi(aprInfo), stream(aStream) { + // Calculate size of needed memory for PCT and offsets for particular levels + int l_max = aprInfo.l_max - 1; + int l_min = aprInfo.l_min; + + startOffsets.resize(l_max + 1, 0); + + for (int l = l_min; l <= l_max; ++l) { + auto yLen = ceil(aprInfo.org_dims[0] / PullingScheme::powr(2.0, l_max - l + 1)); + auto xLen = ceil(aprInfo.org_dims[1] / PullingScheme::powr(2.0, l_max - l + 1)); + auto zLen = ceil(aprInfo.org_dims[2] / PullingScheme::powr(2.0, l_max - l + 1)); + size_t levelSize = yLen * xLen * zLen; + startOffsets[l] = numOfElements; + numOfElements += levelSize; + } + + // Initialize memory, it is not binded to any CPU memory so we provide nullptr + mem.initialize(nullptr, numOfElements, stream); + cudaMemsetAsync(mem.get(), EMPTY, numOfElements, stream); + } + + inline uint8_t* operator[](size_t level) { return mem.get() + startOffsets[level]; } + + auto getPCTcpu() { + std::vector> pct = PullingScheme::generateParticleCellTree(gi); + for (int i = gi.l_min; i < gi.l_max; ++i) { + checkCuda(cudaMemcpyAsync(pct[i].mesh.get(), (*this)[i], pct[i].mesh.size(), cudaMemcpyDeviceToHost, stream)); + } + checkCuda(cudaStreamSynchronize(stream)); + + return pct; + } + + void uploadPCT2GPU(std::vector> pct) { + for (int i = gi.l_min; i < gi.l_max; ++i) { + checkCuda(cudaMemcpyAsync((*this)[i], pct[i].mesh.get(), pct[i].mesh.size(), cudaMemcpyHostToDevice, stream)); + } + checkCuda(cudaStreamSynchronize(stream)); + } +}; + + +#endif From 34742506ed6b87c5e629d63f11ec13674d6d45e6 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 20 Aug 2024 15:02:21 +0200 Subject: [PATCH 53/80] LinearAccessCuda is now using ParticleCellTreeCuda --- src/algorithm/ParticleCellTreeCuda.cuh | 9 ++- .../APR/access/LinearAccessCuda.cu | 56 +++++++++---------- .../APR/access/LinearAccessCuda.hpp | 2 +- 3 files changed, 37 insertions(+), 30 deletions(-) diff --git a/src/algorithm/ParticleCellTreeCuda.cuh b/src/algorithm/ParticleCellTreeCuda.cuh index 9fe38273..4f520d54 100644 --- a/src/algorithm/ParticleCellTreeCuda.cuh +++ b/src/algorithm/ParticleCellTreeCuda.cuh @@ -58,7 +58,14 @@ public: return pct; } - void uploadPCT2GPU(std::vector> pct) { + void downloadPCTfromGPU(std::vector> &pct) { + for (int i = gi.l_min; i < gi.l_max; ++i) { + checkCuda(cudaMemcpyAsync(pct[i].mesh.get(), (*this)[i], pct[i].mesh.size(), cudaMemcpyDeviceToHost, stream)); + } + checkCuda(cudaStreamSynchronize(stream)); + } + + void uploadPCT2GPU(const std::vector> &pct) { for (int i = gi.l_min; i < gi.l_max; ++i) { checkCuda(cudaMemcpyAsync((*this)[i], pct[i].mesh.get(), pct[i].mesh.size(), cudaMemcpyHostToDevice, stream)); } diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu index 8ce7e347..aeffa2c0 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.cu +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -1,6 +1,7 @@ #include "LinearAccessCuda.hpp" #include "misc/CudaTools.cuh" +#include "algorithm/ParticleCellTreeCuda.cuh" // CUDA version of GenInfo structure typedef struct GenInfoCuda_t { @@ -90,9 +91,6 @@ public: } }; -typedef ScopedCudaMemHandler, H2D | D2H> ParticleCellTreeLevelCuda; -typedef std::vector ParticleCellTreeCuda; - // 
********************************************************************************************************************* // FULL RESOLUTION // ********************************************************************************************************************* @@ -134,7 +132,7 @@ __global__ void fullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint1 } } -void runFullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, GenInfo &gi, GenInfoGpuAccess &giga, cudaStream_t aStream) { +void runFullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, const GenInfo &gi, GenInfoGpuAccess &giga, cudaStream_t aStream) { dim3 threadsPerBlock(32, 1, 1); dim3 numBlocks( (gi.x_num[gi.l_max] + threadsPerBlock.x - 1)/threadsPerBlock.x, @@ -154,11 +152,10 @@ void runFullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, // FIRST STEP // ********************************************************************************************************************* -constexpr uint8_t UPSAMPLING_SEED_TYPE = 4; static constexpr uint8_t seed_us = UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization -__global__ void firstStep(uint8_t *prevLevel, uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic) { +__global__ void firstStep(const uint8_t *prevLevel, uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic) { const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; const uint64_t xLen = gic.x_num[level]; @@ -181,16 +178,16 @@ __global__ void firstStep(uint8_t *prevLevel, uint8_t *currLevel, int level, uin } } -void runFirstStep(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, cudaStream_t aStream) { +void runFirstStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, cudaStream_t aStream) { dim3 threadsPerBlock(32, 1, 1); for (int level = gi.l_min + 1; level < gi.l_max; ++level) { dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); - auto &p_mapPrev = p_map[level - 1]; - auto &p_mapCurr = p_map[level]; - firstStep<<>>(p_mapPrev.get(), p_mapCurr.get(), level, min_type, giga.getGenInfoCuda()); + auto *p_mapPrev = p_map[level - 1]; + auto *p_mapCurr = p_map[level]; + firstStep<<>>(p_mapPrev, p_mapCurr, level, min_type, giga.getGenInfoCuda()); } cudaError_t err = cudaGetLastError(); @@ -232,15 +229,15 @@ __global__ void secondStep(const uint8_t *currLevel, int level, uint8_t min_type } } -void runSecondStep(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, cudaStream_t aStream) { +void runSecondStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, cudaStream_t aStream) { dim3 threadsPerBlock(32, 1, 1); for (int level = gi.l_min; level < gi.l_max - 1; ++level) { dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); - auto &p_mapCurr = p_map[level]; - secondStep<<>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end); + auto *p_mapCurr = p_map[level]; + secondStep<<>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end); } cudaError_t err = cudaGetLastError(); @@ -323,15 +320,15 @@ __global__ void secondStepCountParticles(GenInfoCuda 
gic, const uint64_t *level_ *gic.total_number_particles = xz_end[counter_total -1]; } -void runSecondStepLastLevel(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint64_t counter_total, cudaStream_t aStream) { +void runSecondStepLastLevel(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint64_t counter_total, cudaStream_t aStream) { dim3 threadsPerBlock(32, 1, 1); dim3 numBlocks( (gi.x_num[gi.l_max - 1] + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (gi.z_num[gi.l_max - 1] + threadsPerBlock.z - 1)/threadsPerBlock.z); int level = gi.l_max - 1; - auto &p_mapCurr = p_map[level]; - secondStepLastLevel<<>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end); + auto *p_mapCurr = p_map[level]; + secondStepLastLevel<<>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { @@ -381,15 +378,15 @@ __global__ void getYvalues(const uint8_t *currLevel, int level, uint8_t min_type } } -void runGetYvalues(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, cudaStream_t aStream) { +void runGetYvalues(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, cudaStream_t aStream) { dim3 threadsPerBlock(32, 1, 1); for (int level = gi.l_min; level < gi.l_max - 1; ++level) { dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); - auto &p_mapCurr = p_map[level]; - getYvalues<<>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec); + auto *p_mapCurr = p_map[level]; + getYvalues<<>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec); } cudaError_t err = cudaGetLastError(); @@ -482,15 +479,15 @@ __global__ void fourthStepLastLevel(GenInfoCuda gic, const uint64_t *level_xz, u } } -void runFourthStep(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, uint64_t counter_total, cudaStream_t aStream) { +void runFourthStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, uint64_t counter_total, cudaStream_t aStream) { dim3 threadsPerBlock(32, 1, 1); dim3 numBlocks( (gi.x_num[gi.l_max] + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (gi.z_num[gi.l_max] + threadsPerBlock.z - 1)/threadsPerBlock.z); int level = gi.l_max - 1; - auto &p_mapCurr = p_map[level]; - fourthStep<<>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec); + auto *p_mapCurr = p_map[level]; + fourthStep<<>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { @@ -524,13 +521,14 @@ void runFourthStep(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_ * In current shape it is a good function for testing implementation rather than using it in production code. * Production code should use parts of it and work on pre-allocated memory - probably in GpuProcessingTask. 
*/ -LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, APRParameters &apr_parameters, std::vector> &pct) { +LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct) { + + cudaStream_t aStream = nullptr; + // Copy input to CUDA mem and prepare CUDA representation of particle cell tree which will be filled after computing // all steps - ParticleCellTreeCuda p_map; - for (auto &p : pct) { - p_map.emplace_back(std::move(ParticleCellTreeLevelCuda(p))); - } + ParticleCellTreeCuda p_map (gi, aStream); + p_map.uploadPCT2GPU(pct); uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2; @@ -560,7 +558,7 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, APRParameters size_t maxYvecSize = gi.x_num[gi.l_max] * gi.y_num[gi.l_max] * gi.z_num[gi.l_max]; y_vec.resize(maxYvecSize); - cudaStream_t aStream = nullptr; + { ScopedCudaMemHandler y_vec_cuda(y_vec.data(), y_vec.size()); ScopedCudaMemHandler xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size()); @@ -583,6 +581,8 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, APRParameters // we can get different size of particles for each frame - with preallocated buffer we can do all of them on it). y_vec.resize(gi.total_number_particles); + p_map.downloadPCTfromGPU(pct); + LinearAccessCudaStructs lac; lac.y_vec.swap(y_vec); diff --git a/src/data_structures/APR/access/LinearAccessCuda.hpp b/src/data_structures/APR/access/LinearAccessCuda.hpp index 53dfd001..51148d9e 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.hpp +++ b/src/data_structures/APR/access/LinearAccessCuda.hpp @@ -11,7 +11,7 @@ typedef struct { VectorData level_xz_vec; } LinearAccessCudaStructs; -LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, APRParameters &apr_parameters, std::vector> &pct); +LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct); #endif //APR_LINEARACCESSCUDA_HPP From 1d4e54940df6f4f149388d397ddbbcea991979eb Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 21 Aug 2024 10:27:34 +0200 Subject: [PATCH 54/80] OVPC added to GpuTask --- src/algorithm/ComputeGradientCuda.cu | 11 ++++++- src/algorithm/OVPC.cu | 31 +++++++++++++++++++ src/algorithm/ParticleCellTreeCuda.cuh | 2 +- src/algorithm/PullingSchemeCuda.hpp | 4 ++- src/data_structures/APR/GenInfo.hpp | 3 ++ .../APR/access/LinearAccessCuda.cu | 3 +- test/FullPipelineCudaTest.cpp | 6 ++-- 7 files changed, 52 insertions(+), 8 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 4db49d4d..d092a1b7 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -13,6 +13,8 @@ #include "algorithm/LocalIntensityScale.cuh" #include "misc/CudaTools.cuh" #include "misc/CudaMemory.cuh" +#include "algorithm/ParticleCellTreeCuda.cuh" +#include "algorithm/PullingSchemeCuda.hpp" #include "dsGradient.cuh" #include "invBspline.cuh" @@ -207,6 +209,7 @@ class GpuProcessingTask::GpuProcessingTaskImpl { const PixelData &iCpuImage; PixelData &iCpuLevels; const APRParameters &iParameters; + GenInfo iAprInfo; float iBsplineOffset; int iMaxLevel; @@ -227,6 +230,8 @@ class GpuProcessingTask::GpuProcessingTaskImpl { const size_t boundaryLen; ScopedCudaMemHandler boundary; + ParticleCellTreeCuda pctc; + /** * @return newly created stream */ @@ -247,6 +252,7 @@ public: local_scale_temp (levels, iStream), local_scale_temp2 (levels, 
iStream), iParameters(parameters), + iAprInfo(iCpuImage.getDimension()), iBsplineOffset(bspline_offset), iMaxLevel(maxLevel), // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. @@ -257,7 +263,8 @@ public: bc3(params.bc3.get(), params.k0, iStream), bc4(params.bc4.get(), params.k0, iStream), boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num}, - boundary{nullptr, boundaryLen, iStream} + boundary{nullptr, boundaryLen, iStream}, + pctc(iAprInfo, iStream) { // std::cout << "\n=============== GpuProcessingTaskImpl ===================\n\n"; std::cout << iCpuImage << std::endl; @@ -308,6 +315,8 @@ public: const float mult_const = level_factor/iParameters.rel_error; runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream); std::cout << "3: " << ct.microseconds() - start << std::endl; + + computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream); } ~GpuProcessingTaskImpl() { diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index 9c6e0bd6..55656674 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -154,3 +154,34 @@ std::vector> computeOvpcCuda(const PixelData &input, const return pct.getPCTcpu(); } + +// explicit instantiation of handled types +template void computeOvpcCuda(float *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream); +template void computeOvpcCuda(int *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream); + + +template +void computeOvpcCuda(ImgType *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream) { + int levelMin = gi.l_min; + int levelMax = gi.l_max - 1; + + + // feel the highes level of PCT with provided levels and clamp values to be within [levelMin, levelMax] range + runCopyAndClampLevels(in, pct[levelMax], gi.y_num[levelMax]*gi.x_num[levelMax]*gi.z_num[levelMax], levelMin, levelMax, stream); + + // Downsample with max reduction to levelMin to fill rest of the tree + for (int l = levelMax - 1; l >= levelMin; --l) { + runDownsampleMax(pct[l + 1], pct[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], stream); + } + + // ================== Phase 1 - top to down + for (int l = levelMin; l <= levelMax; ++l) { + runFirstStep(pct[l], gi.x_num[l], gi.y_num[l], gi.z_num[l], l, stream); + } + // ================== Phase 1 - down to top + for (int l = levelMax - 1; l >= levelMin; --l) { + runSecondStep(pct[l], pct[l+1], gi.x_num[l], gi.y_num[l], gi.z_num[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], l == levelMin, stream); + } + + std::cout << "------- RUN --------------\n"; +} \ No newline at end of file diff --git a/src/algorithm/ParticleCellTreeCuda.cuh b/src/algorithm/ParticleCellTreeCuda.cuh index 4f520d54..d3bc6160 100644 --- a/src/algorithm/ParticleCellTreeCuda.cuh +++ b/src/algorithm/ParticleCellTreeCuda.cuh @@ -4,7 +4,7 @@ #include "data_structures/APR/GenInfo.hpp" #include "algorithm/PullingScheme.hpp" - +#include "misc/CudaTools.cuh" /* * CUDA representation of PCT (Particle Cell Tree) diff --git a/src/algorithm/PullingSchemeCuda.hpp b/src/algorithm/PullingSchemeCuda.hpp index 953903db..12aa81d3 100644 --- a/src/algorithm/PullingSchemeCuda.hpp +++ b/src/algorithm/PullingSchemeCuda.hpp @@ -8,12 +8,14 @@ #include "data_structures/Mesh/PixelData.hpp" #include "data_structures/APR/GenInfo.hpp" - +#include "algorithm/ParticleCellTreeCuda.cuh" using TreeElementType = uint8_t; template std::vector> 
computeOvpcCuda(const PixelData &input, const GenInfo &gi); +template +void computeOvpcCuda(ImgType *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream); #endif //LIBAPR_PULLINGSCHEMECUDA_HPP diff --git a/src/data_structures/APR/GenInfo.hpp b/src/data_structures/APR/GenInfo.hpp index e506100a..7898fc97 100644 --- a/src/data_structures/APR/GenInfo.hpp +++ b/src/data_structures/APR/GenInfo.hpp @@ -34,6 +34,9 @@ class GenInfo { std::vector level_size; // precomputation of the size of each level, used by the iterators. + GenInfo() {} + GenInfo(const PixelDataDim &dim) { init(dim); } + //initialize the information given the original dimensions void init(const PixelDataDim &dim) { init(dim.y, dim.x, dim.z); diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu index aeffa2c0..2de0dd6a 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.cu +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -506,7 +506,7 @@ void runFourthStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCu // ********************************************************************************************************************* -// MAIN FUNC TO CALL - implements logic of inearAccess::initialize_linear_structure CPU func. +// MAIN FUNC TO CALL - implements logic of LinearAccess::initialize_linear_structure CPU func. // ********************************************************************************************************************* @@ -581,6 +581,7 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRPara // we can get different size of particles for each frame - with preallocated buffer we can do all of them on it). y_vec.resize(gi.total_number_particles); + // Transfer changes to PCT from GPU to CPU (this is needed only for tests) p_map.downloadPCTfromGPU(pct); diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 95c2b07c..975ba0b2 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -238,10 +238,8 @@ namespace { par.dz = 1; par.neighborhood_optimization = true; - GenInfo aprInfo; - aprInfo.init(input_image.getDimension()); - GenInfo giGpu; - giGpu.init(input_image.getDimension()); + GenInfo aprInfo(input_image.getDimension()); + GenInfo giGpu(input_image.getDimension()); // Calculate pipeline on CPU timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); From 9ff0580050477adbc63f63c42dc9978cd16db497 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 21 Aug 2024 16:51:30 +0200 Subject: [PATCH 55/80] Full GPU pipeline works1 --- src/algorithm/ComputeGradientCuda.cu | 24 ++++++---- src/algorithm/ComputeGradientCuda.hpp | 4 +- src/data_structures/APR/GenInfo.hpp | 3 ++ .../APR/access/LinearAccessCuda.cu | 48 +++++++++++++++++++ .../APR/access/LinearAccessCuda.hpp | 3 ++ test/FullPipelineCudaTest.cpp | 42 ++++++++++++---- 6 files changed, 105 insertions(+), 19 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index d092a1b7..bee6417f 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -15,6 +15,7 @@ #include "misc/CudaMemory.cuh" #include "algorithm/ParticleCellTreeCuda.cuh" #include "algorithm/PullingSchemeCuda.hpp" +#include "data_structures/APR/access/LinearAccessCuda.hpp" #include "dsGradient.cuh" #include "invBspline.cuh" @@ -232,6 +233,9 @@ class GpuProcessingTask::GpuProcessingTaskImpl { ParticleCellTreeCuda pctc; + 
ScopedCudaMemHandler y_vec; // for LinearAccess + LinearAccessCudaStructs lacs; + /** * @return newly created stream */ @@ -264,7 +268,8 @@ public: bc4(params.bc4.get(), params.k0, iStream), boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num}, boundary{nullptr, boundaryLen, iStream}, - pctc(iAprInfo, iStream) + pctc(iAprInfo, iStream), + y_vec(nullptr, iAprInfo.getSize(), iStream) { // std::cout << "\n=============== GpuProcessingTaskImpl ===================\n\n"; std::cout << iCpuImage << std::endl; @@ -279,12 +284,13 @@ public: std::cout << "SEND time: " << ct.microseconds() - start << std::endl; } - void getDataFromGpu() { - CurrentTime ct; - uint64_t start = ct.microseconds(); - local_scale_temp.copyD2H(); - checkCuda(cudaStreamSynchronize(iStream)); - std::cout << "RCV time: " << ct.microseconds() - start << std::endl; + LinearAccessCudaStructs getDataFromGpu() { +// CurrentTime ct; +// uint64_t start = ct.microseconds(); +// local_scale_temp.copyD2H(); +// checkCuda(cudaStreamSynchronize(iStream)); +// std::cout << "RCV time: " << ct.microseconds() - start << std::endl; + return std::move(lacs); } void processOnGpu() { @@ -317,6 +323,8 @@ public: std::cout << "3: " << ct.microseconds() - start << std::endl; computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream); + computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream); + std::cout << iAprInfo << std::endl; } ~GpuProcessingTaskImpl() { @@ -339,7 +347,7 @@ template void GpuProcessingTask::sendDataToGpu() {impl->sendDataToGpu();} template -void GpuProcessingTask::getDataFromGpu() {impl->getDataFromGpu();} +LinearAccessCudaStructs GpuProcessingTask::getDataFromGpu() {return impl->getDataFromGpu();} template void GpuProcessingTask::processOnGpu() {impl->processOnGpu();} diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp index a8ebe1bf..723b6181 100644 --- a/src/algorithm/ComputeGradientCuda.hpp +++ b/src/algorithm/ComputeGradientCuda.hpp @@ -7,7 +7,7 @@ #include "data_structures/Mesh/PixelData.hpp" #include "algorithm/APRParameters.hpp" - +#include "data_structures/APR/access/LinearAccessCuda.hpp" // Test helpers and definitions using TypeOfRecBsplineFlags = uint16_t; @@ -47,7 +47,7 @@ class GpuProcessingTask { GpuProcessingTask(GpuProcessingTask&&); void sendDataToGpu(); - void getDataFromGpu(); + LinearAccessCudaStructs getDataFromGpu(); void processOnGpu(); void doAll(); }; diff --git a/src/data_structures/APR/GenInfo.hpp b/src/data_structures/APR/GenInfo.hpp index 7898fc97..8d5da2bd 100644 --- a/src/data_structures/APR/GenInfo.hpp +++ b/src/data_structures/APR/GenInfo.hpp @@ -37,6 +37,8 @@ class GenInfo { GenInfo() {} GenInfo(const PixelDataDim &dim) { init(dim); } + size_t getSize() const { return (size_t)y_num[l_max] * x_num[l_max] * z_num[l_max]; } + //initialize the information given the original dimensions void init(const PixelDataDim &dim) { init(dim.y, dim.x, dim.z); @@ -119,6 +121,7 @@ class GenInfo { friend std::ostream & operator<<(std::ostream &os, const GenInfo &gi) { os << "GenInfo {\n"; os << " Original dimensions(y/x/z): [" << gi.org_dims[0] << ", " << gi.org_dims[1] << ", " << gi.org_dims[2] << "]\n"; + os << " Original size: " << gi.getSize() << "\n"; os << " Number of dimensions: " << static_cast(gi.number_dimensions) << "\n"; os << " l_min, l_max: {" << gi.l_min << " - " << gi.l_max << "}\n"; os << " total number of particles: " << gi.total_number_particles << "\n"; 
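[Editorial note - not part of the patch] The GenInfo(PixelDataDim) constructor and getSize() helper
added above let callers size the worst-case particle buffer straight from the image dimensions; this
is how GpuProcessingTaskImpl pre-allocates its y_vec before the true particle count is known. A
minimal usage sketch (inputImage and stream are assumed to be set up as in the tests; the rest
follows the code in this patch):

    GenInfo gi(inputImage.getDimension());   // derives the level range from the y/x/z extents
    size_t maxParticles = gi.getSize();      // y_num[l_max] * x_num[l_max] * z_num[l_max]
    // worst case is one particle per pixel, so a device buffer of maxParticles y-values is
    // always large enough; after the pipeline runs it is shrunk to gi.total_number_particles.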
diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu index 2de0dd6a..9e38d760 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.cu +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -592,3 +592,51 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRPara return lac; } + +void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, const APRParameters &apr_parameters, LinearAccessCudaStructs &lacs, cudaStream_t aStream) { + + uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2; + + VectorData xz_end_vec(true); + VectorData level_xz_vec(true); + + // initialize_xz_linear() - CPU impl. + uint64_t counter_total = 1; //the buffer val to allow -1 calls without checking. + level_xz_vec.resize(gi.l_max + 2, 0); //includes a buffer for -1 calls, and therefore needs to be called with level + 1; + level_xz_vec[0] = 1; //allowing for the offset. + for (int i = 0; i <= gi.l_max; ++i) { + counter_total += gi.x_num[i] * gi.z_num[i]; + level_xz_vec[i + 1] = counter_total; + } + xz_end_vec.resize(counter_total, 0); + + + { + ScopedCudaMemHandler xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size()); + ScopedCudaMemHandler level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size()); + GenInfoGpuAccess giga(gi, aStream); + if (gi.l_max <= 2) { + runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, gi, giga, aStream); + } + else { + runFirstStep(gi, giga, p_map, min_type, aStream); + runSecondStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), aStream); + runSecondStepLastLevel(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), counter_total, aStream); + runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, aStream); + runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, counter_total, aStream); + } + } + +// auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; }; +// prt(y_vec); +// prt(xz_end_vec); +// prt(level_xz_vec); + VectorData y_vec(true); + y_vec.resize(gi.total_number_particles); + checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda, gi.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, aStream)); + checkCuda(cudaStreamSynchronize(aStream)); + + lacs.y_vec.swap(y_vec); + lacs.xz_end_vec.swap(xz_end_vec); + lacs.level_xz_vec.swap(level_xz_vec); +} diff --git a/src/data_structures/APR/access/LinearAccessCuda.hpp b/src/data_structures/APR/access/LinearAccessCuda.hpp index 51148d9e..27d56ab6 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.hpp +++ b/src/data_structures/APR/access/LinearAccessCuda.hpp @@ -4,6 +4,7 @@ #include "algorithm/APRParameters.hpp" #include "data_structures/Mesh/PixelData.hpp" #include "data_structures/APR/GenInfo.hpp" +#include "algorithm/ParticleCellTreeCuda.cuh" typedef struct { VectorData y_vec; @@ -13,5 +14,7 @@ typedef struct { LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct); +void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, const APRParameters &apr_parameters, LinearAccessCudaStructs &lacs, cudaStream_t aStream); + #endif //APR_LINEARACCESSCUDA_HPP diff --git 
a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 975ba0b2..8cc516a4 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -277,7 +277,7 @@ namespace { } } - TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_GpuProcessingTask) { + TEST(ComputeThreshold, FULL_PIPELINE_TEST_CPU_vs_GpuProcessingTask) { APRTimer timer(true); // TODO: This tets fails if dim of input image is smaller than ~8 (not sure in which direction yet) @@ -288,11 +288,15 @@ namespace { // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors using ImageType = float; constexpr PixelDataDim dim1{4, 4, 3}; - constexpr PixelDataDim dim2{163, 123, 555}; + constexpr PixelDataDim dim2{1024,512,512}; for (int d = 0; d <= 3; d++) { auto &dim = (d % 2 == 0) ? dim1 : dim2; PixelData input_image = (d / 2 == 0) ? getRandInitializedMesh(dim, 13) : - getMeshWithBlobInMiddle(dim); + getMeshWithBlobInMiddle(dim); + +// constexpr PixelDataDim dim = dim1; +// PixelData input_image = getRandInitializedMesh(dim, 13); + int maxLevel = ceil(std::log2(dim.maxDimSize())); // Initialize CPU data structures @@ -321,32 +325,52 @@ namespace { par.dz = 1; par.neighborhood_optimization = true; + GenInfo aprInfo(input_image.getDimension()); + GenInfo giGpu(input_image.getDimension()); + + // Calculate pipeline on CPU // Calculate pipeline on CPU timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); LocalParticleCellSet lpcs = LocalParticleCellSet(); lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + PullingScheme ps; + ps.initialize_particle_cell_tree(aprInfo); + lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); + ps.pulling_scheme_main(); + LinearAccess linearAccess; + linearAccess.genInfo = &aprInfo; + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); timer.stop_timer(); // Calculate pipeline on GPU timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - { - GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); - gpt.doAll(); - } + // { + GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); + gpt.sendDataToGpu(); + gpt.processOnGpu(); + auto linearAccessGpu = gpt.getDataFromGpu(); + giGpu.total_number_particles = linearAccessGpu.y_vec.size(); + + // } timer.stop_timer(); // Compare GPU vs CPU - expect exactly same result - EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.xz_end_vec, linearAccess.xz_end_vec), 0); + + EXPECT_EQ(aprInfo.total_number_particles, giGpu.total_number_particles); + EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); } } + #endif // APR_USE_CUDA } - int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); From c10225d6bd099273086ff8e68a0234e6661e4542 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 21 Aug 2024 17:19:48 +0200 Subject: [PATCH 56/80] Some debug prints removed --- src/algorithm/ComputeGradientCuda.cu | 23 +++++++++---------- src/algorithm/OVPC.cu | 2 -- .../APR/access/LinearAccessCuda.cu | 4 ---- 
test/FullPipelineCudaTest.cpp | 12 +++++----- 4 files changed, 17 insertions(+), 24 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index bee6417f..9c85cd0f 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -247,6 +247,9 @@ class GpuProcessingTask::GpuProcessingTaskImpl { public: + // TODO: Remove need for passing 'levels' to GpuProcessingTask + // It was used during development to control internal computation like filters, gradient, levels etc. but + // once all is done there is no need for it anymore GpuProcessingTaskImpl(const PixelData &inputImage, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) : iCpuImage(inputImage), iCpuLevels(levels), @@ -272,16 +275,16 @@ public: y_vec(nullptr, iAprInfo.getSize(), iStream) { // std::cout << "\n=============== GpuProcessingTaskImpl ===================\n\n"; - std::cout << iCpuImage << std::endl; - std::cout << iCpuLevels << std::endl; +// std::cout << iCpuImage << std::endl; +// std::cout << iCpuLevels << std::endl; } void sendDataToGpu() { - CurrentTime ct; - uint64_t start = ct.microseconds(); +// CurrentTime ct; +// uint64_t start = ct.microseconds(); image.copyH2D(); - checkCuda(cudaStreamSynchronize(iStream)); - std::cout << "SEND time: " << ct.microseconds() - start << std::endl; +// checkCuda(cudaStreamSynchronize(iStream)); +// std::cout << "SEND time: " << ct.microseconds() - start << std::endl; } LinearAccessCudaStructs getDataFromGpu() { @@ -313,18 +316,14 @@ public: getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), splineCudaX, splineCudaY, splineCudaZ, boundary.get(), iBsplineOffset, iParameters, iStream); - std::cout << "1: " << ct.microseconds() - start << std::endl; runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream); - std::cout << "2: " << ct.microseconds() - start << std::endl; float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz)); float level_factor = pow(2, iMaxLevel) * min_dim; const float mult_const = level_factor/iParameters.rel_error; runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream); - std::cout << "3: " << ct.microseconds() - start << std::endl; computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream); computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream); - std::cout << iAprInfo << std::endl; } ~GpuProcessingTaskImpl() { @@ -335,10 +334,10 @@ public: template GpuProcessingTask::GpuProcessingTask(const PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) -: impl{new GpuProcessingTaskImpl(image, levels, parameters, bspline_offset, maxLevel)} {std::cout << "GpuProcessingTask\n";} +: impl{new GpuProcessingTaskImpl(image, levels, parameters, bspline_offset, maxLevel)} { } template -GpuProcessingTask::~GpuProcessingTask() {std::cout << "~GpuProcessingTask\n";} +GpuProcessingTask::~GpuProcessingTask() { } template GpuProcessingTask::GpuProcessingTask(GpuProcessingTask&&) = default; diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index 55656674..80765bca 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -182,6 +182,4 @@ void computeOvpcCuda(ImgType *in, ParticleCellTreeCuda &pct, const GenInfo &gi, for (int l = levelMax - 1; l >= levelMin; --l) { runSecondStep(pct[l], pct[l+1], gi.x_num[l], 
gi.y_num[l], gi.z_num[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], l == levelMin, stream); } - - std::cout << "------- RUN --------------\n"; } \ No newline at end of file diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu index 9e38d760..1a876d0e 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.cu +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -627,10 +627,6 @@ void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_ma } } -// auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; }; -// prt(y_vec); -// prt(xz_end_vec); -// prt(level_xz_vec); VectorData y_vec(true); y_vec.resize(gi.total_number_particles); checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda, gi.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, aStream)); diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 8cc516a4..aa706190 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -310,9 +310,7 @@ namespace { // Initialize GPU data structures to same values as CPU PixelData mGpuImage(input_image, true); - PixelData grad_temp_GPU(grad_temp, true); - PixelData local_scale_temp_GPU(local_scale_temp, true); - PixelData local_scale_temp2_GPU(local_scale_temp2, true); + PixelData local_scale_temp_GPU(local_scale_temp, false); // Prepare parameters APRParameters par; @@ -346,14 +344,16 @@ namespace { // Calculate pipeline on GPU - timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - // { GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); + cudaDeviceSynchronize(); + + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + // { gpt.sendDataToGpu(); gpt.processOnGpu(); auto linearAccessGpu = gpt.getDataFromGpu(); giGpu.total_number_particles = linearAccessGpu.y_vec.size(); - + cudaDeviceSynchronize(); // } timer.stop_timer(); From 6b7a87d870501b3178c901208825304c7c71d261 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 21 Aug 2024 17:46:38 +0200 Subject: [PATCH 57/80] Test for full pipeline cleaned up --- test/FullPipelineCudaTest.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index aa706190..8f29141b 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -294,9 +294,6 @@ namespace { PixelData input_image = (d / 2 == 0) ? 
getRandInitializedMesh(dim, 13) : getMeshWithBlobInMiddle(dim); -// constexpr PixelDataDim dim = dim1; -// PixelData input_image = getRandInitializedMesh(dim, 13); - int maxLevel = ceil(std::log2(dim.maxDimSize())); // Initialize CPU data structures @@ -326,7 +323,6 @@ namespace { GenInfo aprInfo(input_image.getDimension()); GenInfo giGpu(input_image.getDimension()); - // Calculate pipeline on CPU // Calculate pipeline on CPU timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); @@ -344,17 +340,13 @@ namespace { // Calculate pipeline on GPU - GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); - cudaDeviceSynchronize(); - timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - // { + GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); gpt.sendDataToGpu(); gpt.processOnGpu(); auto linearAccessGpu = gpt.getDataFromGpu(); giGpu.total_number_particles = linearAccessGpu.y_vec.size(); cudaDeviceSynchronize(); - // } timer.stop_timer(); // Compare GPU vs CPU - expect exactly same result From 3c601be7ecb9354ba6853efb93256a52607d34fc Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 21 Aug 2024 17:50:22 +0200 Subject: [PATCH 58/80] doAll() removed from Gpu pipeline --- src/algorithm/ComputeGradientCuda.cu | 7 ------- src/algorithm/ComputeGradientCuda.hpp | 1 - 2 files changed, 8 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 9c85cd0f..14d1d5d0 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -351,13 +351,6 @@ LinearAccessCudaStructs GpuProcessingTask::getDataFromGpu() {return imp template void GpuProcessingTask::processOnGpu() {impl->processOnGpu();} -template -void GpuProcessingTask::doAll() { - sendDataToGpu(); - processOnGpu(); - getDataFromGpu(); -} - // explicit instantiation of handled types template class GpuProcessingTask; template class GpuProcessingTask; diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp index 723b6181..837d29f5 100644 --- a/src/algorithm/ComputeGradientCuda.hpp +++ b/src/algorithm/ComputeGradientCuda.hpp @@ -49,7 +49,6 @@ class GpuProcessingTask { void sendDataToGpu(); LinearAccessCudaStructs getDataFromGpu(); void processOnGpu(); - void doAll(); }; #endif //LIBAPR_COMPUTEGRADIENTCUDA_HPP From d2fd1d0f4f5afb5b72f47ed1cc49d91097004d2b Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 22 Aug 2024 13:24:39 +0200 Subject: [PATCH 59/80] GPU pipeline now works for APRConverter! 
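
The diff below reduces APRConverter::get_apr_cuda() to a single GpuProcessingTask per call. A minimal sketch of the resulting call sequence, assuming the types introduced earlier in this series (the ImageType parameter, the include paths and the free-function wrapper are placeholders, and member access is written as it appears inside APRConverter):

    #include "algorithm/ComputeGradientCuda.hpp"    // GpuProcessingTask, LinearAccessCudaStructs
    #include "algorithm/APRParameters.hpp"
    #include "data_structures/Mesh/PixelData.hpp"
    #include "data_structures/APR/APR.hpp"          // path assumed

    template <typename ImageType>
    void runCudaPipelineSketch(APR &aAPR, PixelData<ImageType> &image_temp,
                               PixelData<float> &local_scale_temp,
                               const APRParameters &par, float bspline_offset) {
        // Upload the (offset) image, run gradient / LIS / levels / OVPC / linear access on the GPU,
        // then fetch the linear-access structures back to the host.
        GpuProcessingTask<ImageType> gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max());
        gpt.sendDataToGpu();
        gpt.processOnGpu();
        auto linearAccessGpu = gpt.getDataFromGpu();   // LinearAccessCudaStructs, moved out of the task

        // Equivalent of generateDatastructures(aAPR) for the CUDA linear-access layout.
        aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
        aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
        aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
        aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
        aAPR.apr_initialized = true;
    }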
--- examples/Example_get_apr.h | 2 +- src/algorithm/APRConverter.hpp | 163 ++++---------- src/algorithm/ComputeGradient.hpp | 29 +++ src/algorithm/ComputeGradientCuda.cu | 77 +++++-- test/FullPipelineCudaTest.cpp | 10 +- test/LinearAccessCudaTest.cpp | 309 ++++++++++++++------------- 6 files changed, 290 insertions(+), 300 deletions(-) diff --git a/examples/Example_get_apr.h b/examples/Example_get_apr.h index c1be9d2b..6d787811 100644 --- a/examples/Example_get_apr.h +++ b/examples/Example_get_apr.h @@ -30,7 +30,7 @@ struct cmdLineOptions{ bool auto_parameters = false; float Ip_th = 0; - float lambda = -1; + float lambda = 3.0; float sigma_th = 0; float rel_error = 0.1; float grad_th = 1; diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp index 404d2bf5..91858629 100644 --- a/src/algorithm/APRConverter.hpp +++ b/src/algorithm/APRConverter.hpp @@ -117,7 +117,7 @@ class APRConverter { PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors PixelData local_scale_temp2; - void applyParameters(APR& aAPR,APRParameters& aprParameters); + void applyParameters(APRParameters& aprParameters); template void computeL(APR& aAPR,PixelData& input_image); @@ -184,7 +184,7 @@ void APRConverter::get_apr_custom_grad_scale(APR& aAPR,PixelData::computeL(APR& aAPR,PixelData& input_image){ } template -void APRConverter::applyParameters(APR& aAPR,APRParameters& aprParameters) { +void APRConverter::applyParameters(APRParameters& aprParameters) { // // Apply the main parameters // @@ -265,39 +265,7 @@ void APRConverter::applyParameters(APR& aAPR,APRParameters& aprParame } fine_grained_timer.stop_timer(); - fine_grained_timer.start_timer("threshold"); - iComputeGradient.threshold_gradient(grad_temp,local_scale_temp2,aprParameters.Ip_th + bspline_offset); - fine_grained_timer.stop_timer(); - - float max_th = 60000; - -#ifdef HAVE_OPENMP -#pragma omp parallel for default(shared) -#endif - for (size_t i = 0; i < grad_temp.mesh.size(); ++i) { - - float rescaled = local_scale_temp.mesh[i]; - if (rescaled < aprParameters.sigma_th) { - rescaled = (rescaled < aprParameters.sigma_th_max) ? 
max_th : par.sigma_th; - local_scale_temp.mesh[i] = rescaled; - } - } - -#ifdef HAVE_LIBTIFF - if(par.output_steps) { - TiffUtils::saveMeshAsTiff(par.output_dir + "local_intensity_scale_rescaled.tif", local_scale_temp); - } -#endif - -#ifdef HAVE_OPENMP -#pragma omp parallel for default(shared) -#endif - for (size_t i = 0; i < grad_temp.mesh.size(); ++i) { - - if(grad_temp.mesh[i] < aprParameters.grad_th){ - grad_temp.mesh[i] = 0; - } - } + iComputeGradient.applyParameters(grad_temp, local_scale_temp, local_scale_temp2, aprParameters, bspline_offset); } @@ -405,7 +373,7 @@ inline bool APRConverter::get_lrf(APR &aAPR, PixelData& input_imag template inline bool APRConverter::get_ds(APR &aAPR) { - applyParameters(aAPR,par); + applyParameters(par); aAPR.parameters = par; solveForAPR(aAPR); @@ -426,104 +394,45 @@ inline bool APRConverter::get_ds(APR &aAPR) { */ template template inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input_image) { - if (!initPipelineAPR(aAPR, input_image)) return false; + if (!initPipelineAPR(aAPR, input_image)) return false; initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num); - method_timer.start_timer("compute_gradient_magnitude_using_bsplines and local instensity scale CUDA"); - APRTimer t(true); - APRTimer d(true); - t.start_timer(" =========== ALL"); - { - - computation_timer.start_timer("init_mem"); - PixelData image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image) - - ///////////////////////////////// - /// Pipeline - //////////////////////// - // offset image by factor (this is required if there are zero areas in the background with - // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) - // Warning both of these could result in over-flow! 
- - if (std::is_same::value) { - bspline_offset = 100; - image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); - } else if (std::is_same::value) { - bspline_offset = 5; - image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); - } else { - image_temp.copyFromMesh(input_image); - } - - computation_timer.stop_timer(); - - std::vector> gpts; - - int numOfStreams = 1; - int repetitionsPerStream = 1; - - computation_timer.start_timer("compute_L"); - // Create streams and send initial task to do - for (int i = 0; i < numOfStreams; ++i) { - gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); - gpts.back().sendDataToGpu(); - gpts.back().processOnGpu(); - } - computation_timer.stop_timer(); - - - for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) { - int c = i % numOfStreams; - - computation_timer.start_timer("apply_parameters"); - // get data from previous task - gpts[c].getDataFromGpu(); - - computation_timer.stop_timer(); - - // in theory we get new data and send them to task - if (i < numOfStreams * (repetitionsPerStream - 1)) { - gpts[c].sendDataToGpu(); - gpts[c].processOnGpu(); - } - - // Postprocess on CPU - std::cout << "--------- start CPU processing ---------- " << i << std::endl; - - computation_timer.start_timer("solve_for_apr"); - iPullingScheme.initialize_particle_cell_tree(aAPR.aprInfo); - - PixelData lst(local_scale_temp, true); - -#ifdef HAVE_LIBTIFF - if (par.output_steps){ - TiffUtils::saveMeshAsTiff(par.output_dir + "local_intensity_scale_step.tif", lst); - } -#endif + computation_timer.start_timer("init_mem"); + PixelData image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image) -#ifdef HAVE_LIBTIFF - if (par.output_steps){ - TiffUtils::saveMeshAsTiff(par.output_dir + "gradient_step.tif", grad_temp); - } -#endif + ///////////////////////////////// + /// Pipeline + //////////////////////// + // offset image by factor (this is required if there are zero areas in the background with + // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) + // Warning both of these could result in over-flow! 
- iLocalParticleSet.get_local_particle_cell_set(iPullingScheme,lst, local_scale_temp2,par); + if (std::is_same::value) { + bspline_offset = 100; + image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); + } else if (std::is_same::value) { + bspline_offset = 5; + image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); + } else { + image_temp.copyFromMesh(input_image); + } - iPullingScheme.pulling_scheme_main(); + GpuProcessingTask gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()); + gpt.sendDataToGpu(); + gpt.processOnGpu(); + auto linearAccessGpu = gpt.getDataFromGpu(); - computation_timer.stop_timer(); + aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); - computation_timer.start_timer("generate_data_structures"); - generateDatastructures(aAPR); - computation_timer.stop_timer(); - } - std::cout << "Total n ENDED" << std::endl; + // generateDatastructures(aAPR) for linearAcceess for CUDA + aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec); + aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec); + aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec); + aAPR.apr_initialized = true; - } - t.stop_timer(); - method_timer.stop_timer(); + std::cout << "CUDA pipeline finished!\n"; return true; } @@ -565,7 +474,7 @@ inline bool APRConverter::get_apr_cpu(APR &aAPR, PixelData &input_ method_timer.stop_timer(); } - applyParameters(aAPR,par); + applyParameters(par); computation_timer.stop_timer(); @@ -597,7 +506,7 @@ template template inline bool APRConverter::get_apr(APR &aAPR, PixelData &input_image) { // TODO: CUDA pipeline is temporarily turned off and CPU version is always chosen. // After revising a CUDA pipeline remove "#if true // " part. -#if true // #ifndef APR_USE_CUDA +#ifndef APR_USE_CUDA return get_apr_cpu(aAPR, input_image); #else return get_apr_cuda(aAPR, input_image); diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp index ee5aeec8..6b682fdf 100644 --- a/src/algorithm/ComputeGradient.hpp +++ b/src/algorithm/ComputeGradient.hpp @@ -38,6 +38,35 @@ class ComputeGradient { template void calc_inv_bspline_z(PixelData &input); + template + void applyParameters(PixelData &grad_temp, PixelData &local_scale_temp, PixelData &local_scale_temp2, APRParameters &aprParameters, float bspline_offset) { + threshold_gradient(grad_temp,local_scale_temp2,aprParameters.Ip_th + bspline_offset); + + float max_th = 60000; + +#ifdef HAVE_OPENMP +#pragma omp parallel for default(shared) +#endif + for (size_t i = 0; i < grad_temp.mesh.size(); ++i) { + + float rescaled = local_scale_temp.mesh[i]; + if (rescaled < aprParameters.sigma_th) { + rescaled = (rescaled < aprParameters.sigma_th_max) ? 
max_th : aprParameters.sigma_th; + local_scale_temp.mesh[i] = rescaled; + } + } + +#ifdef HAVE_OPENMP +#pragma omp parallel for default(shared) +#endif + for (size_t i = 0; i < grad_temp.mesh.size(); ++i) { + + if(grad_temp.mesh[i] < aprParameters.grad_th){ + grad_temp.mesh[i] = 0; + } + } + } + struct three_temps { float temp_1, temp_2, temp_3; }; diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 14d1d5d0..c4f0e849 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -57,6 +57,7 @@ namespace { } BsplineParams prepareBsplineStuff(size_t dimLen, float lambda, float tol, int maxFilterLen = -1) { + // Recursive Filter Implimentation for Smoothing BSplines // B-Spline Signal Processing: Part II - Efficient Design and Applications, Unser 1993 @@ -79,8 +80,8 @@ namespace { const float norm_factor = powf((1 - 2.0 * rho * cosf(omg) + powf(rho, 2)), 2); - //std::cout << std::fixed << std::setprecision(9) << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 - // << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << std::endl; +// std::cout << std::fixed << std::setprecision(9) << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 +// << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << " lambda=" << lambda << " tol=" << tol << std::endl; // ------- Calculating boundary conditions @@ -169,18 +170,18 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc // TODO: Used PixelDataDim in all methods below and change input parameter from image to imageDim - runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream); - runBsplineXdir(cudaImage, image.getDimension(), px, aStream); - runBsplineZdir(cudaImage, image.getDimension(), pz, aStream); + if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream); + if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, aStream); + if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream); runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream); - runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); } class CurrentTime { @@ -202,6 +203,49 @@ public: }; +/** + * Thresholds output basing on input values. When input is <= thresholdLevel then output is set to 0 and is not changed otherwise. 
+ * @param input + * @param output + * @param length - len of input/output arrays + * @param thresholdLevel + */ +template +__global__ void threshold(const T *input, S *output, size_t length, float thresholdLevel) { + size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; + if (idx < length) { + if (input[idx] <= thresholdLevel) { output[idx] = 0; } + } +} + +template +void runThreshold(ImgType *cudaImage, T *cudaGrad, size_t x_num, size_t y_num, size_t z_num, float Ip_th, cudaStream_t aStream) { + dim3 threadsPerBlock(64); + dim3 numBlocks((x_num * y_num * z_num + threadsPerBlock.x - 1)/threadsPerBlock.x); + threshold<<>>(cudaImage, cudaGrad, x_num * y_num * z_num, Ip_th); +}; + +template +__global__ void rescaleAndThreshold(T *data, size_t len, float sigmaThreshold, float sigmaThresholdMax) { + const float max_th = 60000.0; + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (idx < len) { + float rescaled = data[idx]; + if (rescaled < sigmaThreshold) { + rescaled = (rescaled < sigmaThresholdMax) ? max_th : sigmaThreshold; + } + data[idx] = rescaled; + } +} + +template +void runRescaleAndThreshold(T *data, size_t len, float sigma, float sigmaMax, cudaStream_t aStream) { + dim3 threadsPerBlock(64); + dim3 numBlocks((len + threadsPerBlock.x - 1) / threadsPerBlock.x); + rescaleAndThreshold <<< numBlocks, threadsPerBlock, 0, aStream >>> (data, len, sigma, sigmaMax); +} + + template template class GpuProcessingTask::GpuProcessingTaskImpl { @@ -264,11 +308,11 @@ public: iMaxLevel(maxLevel), // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. // Should be fixed when other parts of pipeline are ready. - params(prepareBsplineStuff((size_t)inputImage.x_num, parameters.lambda, tolerance)), - bc1(params.bc1.get(), params.k0, iStream), - bc2(params.bc2.get(), params.k0, iStream), - bc3(params.bc3.get(), params.k0, iStream), - bc4(params.bc4.get(), params.k0, iStream), +// params(prepareBsplineStuff((size_t)inputImage.x_num, parameters.lambda, tolerance)), +// bc1(params.bc1.get(), params.k0, iStream), +// bc2(params.bc2.get(), params.k0, iStream), +// bc3(params.bc3.get(), params.k0, iStream), +// bc4(params.bc4.get(), params.k0, iStream), boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num}, boundary{nullptr, boundaryLen, iStream}, pctc(iAprInfo, iStream), @@ -317,6 +361,13 @@ public: splineCudaX, splineCudaY, splineCudaZ, boundary.get(), iBsplineOffset, iParameters, iStream); runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream); + + // Apply parameters from APRConverter: + runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream); + runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream); + runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream); + // TODO: automatic parameters are not implemented for GPU pipeline (yet) + float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz)); float level_factor = pow(2, iMaxLevel) * min_dim; const float mult_const = level_factor/iParameters.rel_error; diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 8f29141b..913b7e09 100644 --- a/test/FullPipelineCudaTest.cpp 
+++ b/test/FullPipelineCudaTest.cpp @@ -280,11 +280,6 @@ namespace { TEST(ComputeThreshold, FULL_PIPELINE_TEST_CPU_vs_GpuProcessingTask) { APRTimer timer(true); - // TODO: This tets fails if dim of input image is smaller than ~8 (not sure in which direction yet) - // It fails for {4,4,3} for sure and surprisingly only for mesh with blob inside... - // Investigate why it fails while it works nicely in tests above (difference must be somewhere in GpuProcessingTask). - - // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors using ImageType = float; constexpr PixelDataDim dim1{4, 4, 3}; @@ -320,6 +315,8 @@ namespace { par.dz = 1; par.neighborhood_optimization = true; + float bspline_offset = 0; + GenInfo aprInfo(input_image.getDimension()); GenInfo giGpu(input_image.getDimension()); @@ -328,6 +325,7 @@ namespace { ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); LocalParticleCellSet lpcs = LocalParticleCellSet(); + ComputeGradient().applyParameters(grad_temp, local_scale_temp, local_scale_temp2, par, bspline_offset); lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); PullingScheme ps; ps.initialize_particle_cell_tree(aprInfo); @@ -341,7 +339,7 @@ namespace { // Calculate pipeline on GPU timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); + GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, bspline_offset, maxLevel); gpt.sendDataToGpu(); gpt.processOnGpu(); auto linearAccessGpu = gpt.getDataFromGpu(); diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp index 84cf8730..eb91e7bd 100644 --- a/test/LinearAccessCudaTest.cpp +++ b/test/LinearAccessCudaTest.cpp @@ -97,160 +97,163 @@ namespace { } -TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_fullAprPipeline) { - // TODO: delete me after development - // Full 'get apr' pipeline to test imp. 
on different stages - // Useful during debugging and can be removed once finished - - // Prepare input data (image) - int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; - // PS input values = 5 0 0 0 0 0 0 0 - -// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; -// PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0}; - - int len = sizeof(values)/sizeof(int); - PixelData data(len, 1, 1); - initFromZYXarray(data, values); - std::cout << "----- Input image:\n"; - data.printMeshT(3, 1); - - // Produce APR - APR apr; - APRConverter aprConverter; - aprConverter.par.rel_error = 0.1; - aprConverter.par.lambda = 0.1; - aprConverter.par.sigma_th = 0.0001; - aprConverter.par.neighborhood_optimization = true; - aprConverter.get_apr(apr, data); - - // Print information about APR and all particles - std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; - for (int l = apr.level_min(); l <= apr.level_max(); ++l) { - std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; - } - std::cout << "APR particles z x y level:\n"; - auto it = apr.iterator(); - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; - } - } - } - } - std::cout << std::endl; - - // Sample input - ParticleData particleIntensities; - particleIntensities.sample_image(apr, data); - - // Reconstruct image from particles - PixelData reconstructImg; - APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); - std::cout << "----- Reconstructed image:"< levelImg; - APRReconstruction::reconstruct_level(apr, levelImg); - std::cout << "----- Image levels:" << std::endl; - levelImg.printMeshT(3, 1); - - // Show intensities and levels of each particle - std::cout << "----- Particle intensities:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - particleIntensities.fill_with_levels(apr); - - std::cout << "----- Particle levels:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - // Show some general information about generated APR - double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); - std::cout << std::endl; - std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; - std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; -} - - -TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_PS) { - // TODO: delete me after development - // Runs PS to test imp. on different stages - // Useful during debugging and can be removed once finished -// int values[] = {0,0,0,5, 0,0,0,0}; +// TODO: There are still problems with computing of small (like 1D images in pipeline) +// belows test can be used to trigger those errors - should be fixed + +//TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_fullAprPipeline) { +// // TODO: delete me after development +// // Full 'get apr' pipeline to test imp. 
on different stages +// // Useful during debugging and can be removed once finished +// +// // Prepare input data (image) +// int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; +// // PS input values = 5 0 0 0 0 0 0 0 +// +//// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; +//// PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0}; +// // int len = sizeof(values)/sizeof(int); - - PixelData levels(8, 1, 1, 0); - levels(5,0,0) = 1; - -// initFromZYXarray(levels, values); - std::cout << "---------------\n"; - levels.printMeshT(3, 1); - std::cout << "---------------\n"; - - GenInfo gi; - const PixelDataDim dim = levels.getDimension(); - std::cout << "Levels dim: " << dim << std::endl; - gi.init(dim.y * 2, dim.x * 1, dim.z * 1); // time two in y-direction since PS container is downsized. - std::cout << gi << std::endl; - - APRTimer t(false); - - t.start_timer("PS1"); - PullingScheme ps; - ps.initialize_particle_cell_tree(gi); - int l_max = gi.l_max - 1; - int l_min = gi.l_min; - std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; - - fillPS(ps, levels); - - std::cout << "---------- Filled PS tree\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "---------------\n"; - - ps.pulling_scheme_main(); - t.stop_timer(); - - // Useful during debugging and can be removed once finished - std::cout << "----------PS:\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "-------------\n"; - - LinearAccess linearAccess; - linearAccess.genInfo = &gi; - APRParameters par; - par.neighborhood_optimization = true; - linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); - - std::cout << gi << std::endl; - auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; }; - prt(linearAccess.y_vec); - prt(linearAccess.xz_end_vec); - prt(linearAccess.level_xz_vec); - - LinearIterator it(linearAccess, gi); - for (int l = 0; l <= 3; l++) { - std::cout << it.particles_level_begin(l) << " " << it.particles_level_end(l) << std::endl; - } - std::cout << "NumOfParticles: " << gi.total_number_particles << std::endl; - - std::cout << "===========================\n"; - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; - } - } - } - } - std::cout << std::endl; -} +// PixelData data(len, 1, 1); +// initFromZYXarray(data, values); +// std::cout << "----- Input image:\n"; +// data.printMeshT(3, 1); +// +// // Produce APR +// APR apr; +// APRConverter aprConverter; +// aprConverter.par.rel_error = 0.1; +// aprConverter.par.lambda = 0.1; +// aprConverter.par.sigma_th = 0.0001; +// aprConverter.par.neighborhood_optimization = true; +// aprConverter.get_apr(apr, data); +// +// // Print information about APR and all particles +// std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; +// for (int l = apr.level_min(); l <= apr.level_max(); ++l) { +// std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; +// } +// std::cout << "APR particles z x y level:\n"; +// auto it = apr.iterator(); +// for (int level = 
it.level_min(); level <= it.level_max(); ++level) { +// for (int z = 0; z < it.z_num(level); z++) { +// for (int x = 0; x < it.x_num(level); ++x) { +// for (it.begin(level, z, x); it < it.end(); it++) { +// std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; +// } +// } +// } +// } +// std::cout << std::endl; +// +// // Sample input +// ParticleData particleIntensities; +// particleIntensities.sample_image(apr, data); +// +// // Reconstruct image from particles +// PixelData reconstructImg; +// APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); +// std::cout << "----- Reconstructed image:"< levelImg; +// APRReconstruction::reconstruct_level(apr, levelImg); +// std::cout << "----- Image levels:" << std::endl; +// levelImg.printMeshT(3, 1); +// +// // Show intensities and levels of each particle +// std::cout << "----- Particle intensities:\n"; +// for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; +// std::cout << std::endl; +// +// particleIntensities.fill_with_levels(apr); +// +// std::cout << "----- Particle levels:\n"; +// for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; +// std::cout << std::endl; +// +// // Show some general information about generated APR +// double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); +// std::cout << std::endl; +// std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; +// std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; +//} + + +//TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_PS) { +// // TODO: delete me after development +// // Runs PS to test imp. on different stages +// // Useful during debugging and can be removed once finished +//// int values[] = {0,0,0,5, 0,0,0,0}; +//// int len = sizeof(values)/sizeof(int); +// +// PixelData levels(8, 1, 1, 0); +// levels(5,0,0) = 1; +// +//// initFromZYXarray(levels, values); +// std::cout << "---------------\n"; +// levels.printMeshT(3, 1); +// std::cout << "---------------\n"; +// +// GenInfo gi; +// const PixelDataDim dim = levels.getDimension(); +// std::cout << "Levels dim: " << dim << std::endl; +// gi.init(dim.y * 2, dim.x * 1, dim.z * 1); // time two in y-direction since PS container is downsized. 
+// std::cout << gi << std::endl; +// +// APRTimer t(false); +// +// t.start_timer("PS1"); +// PullingScheme ps; +// ps.initialize_particle_cell_tree(gi); +// int l_max = gi.l_max - 1; +// int l_min = gi.l_min; +// std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; +// +// fillPS(ps, levels); +// +// std::cout << "---------- Filled PS tree\n"; +// printParticleCellTree(ps.getParticleCellTree()); +// std::cout << "---------------\n"; +// +// ps.pulling_scheme_main(); +// t.stop_timer(); +// +// // Useful during debugging and can be removed once finished +// std::cout << "----------PS:\n"; +// printParticleCellTree(ps.getParticleCellTree()); +// std::cout << "-------------\n"; +// +// LinearAccess linearAccess; +// linearAccess.genInfo = &gi; +// APRParameters par; +// par.neighborhood_optimization = true; +// linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); +// +// std::cout << gi << std::endl; +// auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; }; +// prt(linearAccess.y_vec); +// prt(linearAccess.xz_end_vec); +// prt(linearAccess.level_xz_vec); +// +// LinearIterator it(linearAccess, gi); +// for (int l = 0; l <= 3; l++) { +// std::cout << it.particles_level_begin(l) << " " << it.particles_level_end(l) << std::endl; +// } +// std::cout << "NumOfParticles: " << gi.total_number_particles << std::endl; +// +// std::cout << "===========================\n"; +// for (int level = it.level_min(); level <= it.level_max(); ++level) { +// for (int z = 0; z < it.z_num(level); z++) { +// for (int x = 0; x < it.x_num(level); ++x) { +// for (it.begin(level, z, x); it < it.end(); it++) { +// std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; +// } +// } +// } +// } +// std::cout << std::endl; +//} // ********************************************************************************************************************* // Tests of CUDA implementation of LinearAccess From 9604c631a7aac155a310e2c58a00698e10f4b51e Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 17 Mar 2025 16:17:18 +0100 Subject: [PATCH 60/80] Linear acces now is using correct cuda stream, bspline params are computed in constructor and memory for them is preallocated --- src/algorithm/ComputeGradientCuda.cu | 64 +++++++++---------- .../APR/access/LinearAccessCuda.cu | 10 +-- 2 files changed, 35 insertions(+), 39 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index c4f0e849..e4247f36 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -134,7 +134,7 @@ namespace { }; } - auto transferSpline(BsplineParams &aParams, cudaStream_t aStream) { + auto transferSpline(const BsplineParams &aParams, cudaStream_t aStream) { ScopedCudaMemHandler bc1(aParams.bc1.get(), aParams.k0, aStream); ScopedCudaMemHandler bc2(aParams.bc2.get(), aParams.k0, aStream); ScopedCudaMemHandler bc3(aParams.bc3.get(), aParams.k0, aStream); @@ -267,11 +267,13 @@ class GpuProcessingTask::GpuProcessingTaskImpl { // bspline stuff const float tolerance = 0.0001; - BsplineParams params; - ScopedCudaMemHandler bc1; - ScopedCudaMemHandler bc2; - ScopedCudaMemHandler bc3; - ScopedCudaMemHandler bc4; + std::pair cudax; + std::pair cuday; + std::pair cudaz; + BsplineParamsCuda splineCudaX; + BsplineParamsCuda splineCudaY; + 
BsplineParamsCuda splineCudaZ; + const size_t boundaryLen; ScopedCudaMemHandler boundary; @@ -306,19 +308,18 @@ public: iAprInfo(iCpuImage.getDimension()), iBsplineOffset(bspline_offset), iMaxLevel(maxLevel), - // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. - // Should be fixed when other parts of pipeline are ready. -// params(prepareBsplineStuff((size_t)inputImage.x_num, parameters.lambda, tolerance)), -// bc1(params.bc1.get(), params.k0, iStream), -// bc2(params.bc2.get(), params.k0, iStream), -// bc3(params.bc3.get(), params.k0, iStream), -// bc4(params.bc4.get(), params.k0, iStream), + cudax(transferSpline(prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance), iStream)), + cuday(transferSpline(prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance), iStream)), + cudaz(transferSpline(prepareBsplineStuff(iCpuImage.z_num, iParameters.lambda, tolerance), iStream)), boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num}, boundary{nullptr, boundaryLen, iStream}, pctc(iAprInfo, iStream), y_vec(nullptr, iAprInfo.getSize(), iStream) { -// std::cout << "\n=============== GpuProcessingTaskImpl ===================\n\n"; + splineCudaX = cudax.first; + splineCudaY = cuday.first; + splineCudaZ = cudaz.first; + std::cout << "\n=============== GpuProcessingTaskImpl ===================" << iStream << "\n\n"; // std::cout << iCpuImage << std::endl; // std::cout << iCpuLevels << std::endl; } @@ -332,47 +333,42 @@ public: } LinearAccessCudaStructs getDataFromGpu() { -// CurrentTime ct; -// uint64_t start = ct.microseconds(); -// local_scale_temp.copyD2H(); -// checkCuda(cudaStreamSynchronize(iStream)); -// std::cout << "RCV time: " << ct.microseconds() - start << std::endl; + // TODO: Temporarily turned off here since synchronized already in computeLinearStructureCuda + // checkCuda(cudaStreamSynchronize(iStream)); + return std::move(lacs); } void processOnGpu() { - CurrentTime ct; + // image.copyH2D(); + CurrentTime ct{}; uint64_t start = ct.microseconds(); - // TODO: temporarily bspline params are generated here - // In principle this is OK and correct but would be faster (for processing series of same size images) if - // they would be calculated in constructor of GpuProcessingTaskImpl class (once). 
- BsplineParams px = prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance); - auto cudax = transferSpline(px, iStream); - auto splineCudaX = cudax.first; - BsplineParams py = prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance); - auto cuday = transferSpline(py, iStream); - auto splineCudaY = cuday.first; - BsplineParams pz = prepareBsplineStuff(iCpuImage.z_num, iParameters.lambda, tolerance); - auto cudaz = transferSpline(pz, iStream); - auto splineCudaZ = cudaz.first; - + CudaTimer time(false, "PIPELINE"); + time.start_timer("getgradient"); getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), splineCudaX, splineCudaY, splineCudaZ, boundary.get(), iBsplineOffset, iParameters, iStream); + time.stop_timer(); + time.start_timer("intensity"); runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream); + time.stop_timer(); + // Apply parameters from APRConverter: + time.start_timer("runs...."); runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream); runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream); runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream); // TODO: automatic parameters are not implemented for GPU pipeline (yet) + time.stop_timer(); + time.start_timer("compute lev"); float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz)); float level_factor = pow(2, iMaxLevel) * min_dim; const float mult_const = level_factor/iParameters.rel_error; runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream); - + time.stop_timer(); computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream); computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream); } diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu index 1a876d0e..bc410050 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.cu +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -560,9 +560,9 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRPara { - ScopedCudaMemHandler y_vec_cuda(y_vec.data(), y_vec.size()); - ScopedCudaMemHandler xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size()); - ScopedCudaMemHandler level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size()); + ScopedCudaMemHandler y_vec_cuda(y_vec.data(), y_vec.size(), aStream); + ScopedCudaMemHandler xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size(), aStream); + ScopedCudaMemHandler level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size(), aStream); GenInfoGpuAccess giga(gi, aStream); if (gi.l_max <= 2) { runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), gi, giga, aStream); @@ -612,8 +612,8 @@ void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_ma { - ScopedCudaMemHandler xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size()); - ScopedCudaMemHandler level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size()); + ScopedCudaMemHandler xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size(), aStream); + ScopedCudaMemHandler level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size(), aStream); GenInfoGpuAccess giga(gi, aStream); if (gi.l_max <= 2) { 
runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, gi, giga, aStream); From 9572e101a6bdf065cfd6a4e83da46d3e71643ac5 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 19 Mar 2025 14:01:51 +0100 Subject: [PATCH 61/80] added error handling for bspline y-dir --- src/algorithm/ComputeGradientCuda.cu | 36 +++++++++++++++++++++++----- src/algorithm/bsplineYdir.cuh | 17 ++++--------- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index e4247f36..24c50696 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -166,13 +166,21 @@ template void getGradientCuda(const PixelData &image, PixelData &local_scale_temp, ImgType *cudaImage, ImgType *cudaGrad, float *cudalocal_scale_temp, BsplineParamsCuda &px, BsplineParamsCuda &py, BsplineParamsCuda &pz, float *boundary, + bool &isErrorDetected, ScopedCudaMemHandler& isErrorDetectedCuda, float bspline_offset, const APRParameters &par, cudaStream_t aStream) { // TODO: Used PixelDataDim in all methods below and change input parameter from image to imageDim - if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream); + isErrorDetected = false; + isErrorDetectedCuda.copyH2D(); + if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, isErrorDetectedCuda.get(), aStream); if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, aStream); if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream); + isErrorDetectedCuda.copyD2H(); + if (isErrorDetected) { + throw std::invalid_argument("integer under-/overflow encountered in CUDA bspline(XYZ)dir - " + "try squashing the input image to a narrower range or use APRConverter"); + } runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); @@ -273,6 +281,8 @@ class GpuProcessingTask::GpuProcessingTaskImpl { BsplineParamsCuda splineCudaX; BsplineParamsCuda splineCudaY; BsplineParamsCuda splineCudaZ; + bool isErrorDetected; + ScopedCudaMemHandler isErrorDetectedCuda; const size_t boundaryLen; ScopedCudaMemHandler boundary; @@ -311,6 +321,7 @@ public: cudax(transferSpline(prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance), iStream)), cuday(transferSpline(prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance), iStream)), cudaz(transferSpline(prepareBsplineStuff(iCpuImage.z_num, iParameters.lambda, tolerance), iStream)), + isErrorDetectedCuda(&isErrorDetected, 1, iStream), boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num}, boundary{nullptr, boundaryLen, iStream}, pctc(iAprInfo, iStream), @@ -347,7 +358,7 @@ public: CudaTimer time(false, "PIPELINE"); time.start_timer("getgradient"); getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), - splineCudaX, splineCudaY, splineCudaZ, boundary.get(), + splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetected, isErrorDetectedCuda, iBsplineOffset, iParameters, iStream); time.stop_timer(); time.start_timer("intensity"); @@ -420,6 +431,8 @@ void cudaFilterBsplineFull(PixelData &input, float lambda, float tolera ScopedCudaMemHandler, D2H | H2D> cudaInput(input, aStream); APRTimer timer(false); + bool isErrorDetected = false; + ScopedCudaMemHandler error(&isErrorDetected, 1, aStream); 
timer.start_timer("GpuDeviceTimeFull"); if (flags & BSPLINE_Y_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.y_num, lambda, tolerance, maxFilterLen); @@ -427,7 +440,7 @@ void cudaFilterBsplineFull(PixelData &input, float lambda, float tolera auto splineCuda = cuda.first; int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * input.x_num * input.z_num; ScopedCudaMemHandler boundary(nullptr, boundaryLen, aStream); // allocate memory on device - runBsplineYdir(cudaInput.get(), input.getDimension(), splineCuda, boundary.get(), aStream); + runBsplineYdir(cudaInput.get(), input.getDimension(), splineCuda, boundary.get(), error.get(), aStream); } if (flags & BSPLINE_X_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.x_num, lambda, tolerance, maxFilterLen); @@ -441,6 +454,14 @@ void cudaFilterBsplineFull(PixelData &input, float lambda, float tolera auto splineCuda = cuda.first; runBsplineZdir(cudaInput.get(), input.getDimension(), splineCuda, aStream); } + + waitForCuda(); + + if (isErrorDetected) { + throw std::invalid_argument("integer under-/overflow encountered in CUDA bspline(XYZ)dir - " + "try squashing the input image to a narrower range or use APRConverter"); + } + timer.stop_timer(); } @@ -510,9 +531,12 @@ void getGradient(PixelData &image, PixelData &grad_temp, Pixel BsplineParams pz = prepareBsplineStuff(image.z_num, par.lambda, tolerance); auto cudaz = transferSpline(pz, aStream); auto splineCudaZ = cudaz.first; - - getGradientCuda(image, local_scale_temp, cudaImage.get(), cudaGrad.get(), cudalocal_scale_temp.get(), - splineCudaX, splineCudaY, splineCudaZ, boundary.get(), bspline_offset, par, aStream); + bool isErrorDetected = false; + { + ScopedCudaMemHandler isErrorDetectedCuda(&isErrorDetected, 1, aStream); + getGradientCuda(image, local_scale_temp, cudaImage.get(), cudaGrad.get(), cudalocal_scale_temp.get(), + splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetected, isErrorDetectedCuda, bspline_offset, par, aStream); + } } void cudaDownsampledGradient(PixelData &input, PixelData &grad, const float hx, const float hy, const float hz) { diff --git a/src/algorithm/bsplineYdir.cuh b/src/algorithm/bsplineYdir.cuh index e9905b64..c49391f9 100644 --- a/src/algorithm/bsplineYdir.cuh +++ b/src/algorithm/bsplineYdir.cuh @@ -242,22 +242,13 @@ __global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, BsplinePara * Function for launching a kernel */ template -void runBsplineYdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, float *boundary, cudaStream_t aStream) { +void runBsplineYdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, float *boundary, bool *error, cudaStream_t aStream) { dim3 threadsPerBlock(numOfThreads); dim3 numBlocks((dim.x * dim.z + threadsPerBlock.x - 1) / threadsPerBlock.x); size_t sharedMemSize = (2 /*bc vectors*/) * (p.k0) * sizeof(float) + numOfThreads * (p.k0) * sizeof(float); - bool isErrorDetected = false; - { - ScopedCudaMemHandler error(&isErrorDetected, 1, aStream); - bsplineYdirBoundary <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get()); - sharedMemSize = numOfThreads * blockWidth * sizeof(float); - bsplineYdirProcess <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get()); - } - - if (isErrorDetected) { - throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineYdir - " - "try squashing the input image to a narrower range or use APRConverter"); - } + 
bsplineYdirBoundary <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error); + sharedMemSize = numOfThreads * blockWidth * sizeof(float); + bsplineYdirProcess <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error); } #endif From 514c03c959df21fa867f17b3b3864664777b5022 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 19 Mar 2025 16:47:09 +0100 Subject: [PATCH 62/80] Removed (most of the) warnings --- src/misc/CudaMemory.cuh | 19 ++----------------- src/misc/CudaTools.cuh | 7 +++---- test/PullingSchemeCudaTest.cpp | 9 --------- 3 files changed, 5 insertions(+), 30 deletions(-) diff --git a/src/misc/CudaMemory.cuh b/src/misc/CudaMemory.cuh index fbe125e9..6859ad5d 100644 --- a/src/misc/CudaMemory.cuh +++ b/src/misc/CudaMemory.cuh @@ -9,23 +9,8 @@ #include #include -#include - - -// TODO: this method is duplicated in CudaTools.cuh -// Somehow including it here break compilation - fix it please. -#define checkCuda(ans) { cudaAssert2((ans), __FILE__, __LINE__); } -inline void cudaAssert2(cudaError_t code, const char *file, int line, bool abort=true) -{ -#if defined(DEBUG) || defined(_DEBUG) || !defined(NDEBUG) - if (code != cudaSuccess) - { - fprintf(stderr,"GPUassert: (%d) %s %s %d\n", code, cudaGetErrorString(code), file, line); - assert(code == cudaSuccess); // If debugging it helps to see call tree somehow - if (abort) exit(code); - } -#endif -} +#include "misc/CudaTools.cuh" + inline void* getPinnedMemory(size_t aNumOfBytes) { void *memory = nullptr; diff --git a/src/misc/CudaTools.cuh b/src/misc/CudaTools.cuh index 10e4cb73..351c0009 100644 --- a/src/misc/CudaTools.cuh +++ b/src/misc/CudaTools.cuh @@ -5,15 +5,12 @@ #ifndef LIBAPR_CUDATOOLS_HPP #define LIBAPR_CUDATOOLS_HPP - #include #include #include #include #include - -#include "data_structures/Mesh/PixelData.hpp" - +#include #define checkCuda(ans) { cudaAssert((ans), __FILE__, __LINE__); } inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) @@ -112,6 +109,8 @@ enum CopyDir : CopyDirType { INVALID = 4 // Just wrong/last value keeper for validating settings }; +template +class PixelData; /** * Checks if provided type is a PixelData container diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index bd24156e..157ce7ec 100644 --- a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -147,9 +147,6 @@ TEST(PullingSchemeTest, OVPCCUDA_Ydir) { const PixelDataDim dim = levels.getDimension(); gi.init(2 * dim.y, dim.x, dim.z); // <-- Y-dir - int levelMax = gi.l_max - 1; - int levelMin = gi.l_min; - // Initialize all needed objects APRTimer t(false); @@ -185,9 +182,6 @@ TEST(PullingSchemeTest, OVPCCUDA_Xdir) { const PixelDataDim dim = levels.getDimension(); gi.init(dim.y, 2 * dim.x, dim.z); // <-- X-dir - int levelMax = gi.l_max - 1; - int levelMin = gi.l_min; - // Initialize all needed objects APRTimer t(false); @@ -223,9 +217,6 @@ TEST(PullingSchemeTest, OVPCCUDA_Zdir) { const PixelDataDim dim = levels.getDimension(); gi.init(dim.y, dim.x, 2 * dim.z); // <-- Z-dir - int levelMax = gi.l_max - 1; - int levelMin = gi.l_min; - // Initialize all needed objects APRTimer t(false); From 005a4ba53354665b3a3087d3de893a6b924cfc68 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 25 Jun 2025 10:29:09 +0200 Subject: [PATCH 63/80] added error handling for bspline x-dir, other steps temporarily blocked --- CMakeLists.txt | 3 +- src/algorithm/APRConverter.hpp | 118 
++++++++++++++++++++++++++- src/algorithm/APRParameters.hpp | 1 + src/algorithm/ComputeGradientCuda.cu | 76 ++++++++--------- src/algorithm/LocalIntensityScale.cu | 2 +- src/algorithm/bsplineXdir.cuh | 15 +--- 6 files changed, 161 insertions(+), 54 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cf047e0..13d764b5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -209,9 +209,10 @@ set_property(TARGET aprObjLib PROPERTY POSITION_INDEPENDENT_CODE ON) if(APR_USE_CUDA) message(STATUS "APR: Building CUDA for APR") + set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc") set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_RUNTIME_LIBRARY "Static") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=false --default-stream per-thread -Xptxas -v -DAPR_USE_CUDA") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=false --default-stream per-thread -Wno-deprecated-gpu-targets -Xptxas -v -DAPR_USE_CUDA") set(CMAKE_CUDA_FLAGS_RELEASE "-O3") # -lineinfo for profiling set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g -G") if(APR_BENCHMARK) diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp index 91858629..5c3cbb06 100644 --- a/src/algorithm/APRConverter.hpp +++ b/src/algorithm/APRConverter.hpp @@ -9,6 +9,7 @@ #ifndef __APR_CONVERTER_HPP__ #define __APR_CONVERTER_HPP__ +#include #include #include "AutoParameters.hpp" @@ -74,6 +75,8 @@ class APRConverter { #ifdef APR_USE_CUDA template bool get_apr_cuda(APR &aAPR, PixelData &input_image); + template + bool get_apr_cuda_streams(APR &aAPR, PixelData &input_image); #endif bool verbose = true; @@ -438,6 +441,118 @@ inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input } #endif +#ifdef APR_USE_CUDA +/** + * Implementation of the pipeline for GPU/CUDA and multiple streams. + * NOTE: Currently only one image is processed multiple times, just to get an idea of how fast it can be. + *       Eventually, it should be able to process an incoming stream of data (a sequence of images). + * + * @param aAPR - the APR data structure + * @param input_image - input image + */ +template template +inline bool APRConverter::get_apr_cuda_streams(APR &aAPR, PixelData& input_image) { + + if (!initPipelineAPR(aAPR, input_image)) return false; + + initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num); + + computation_timer.start_timer("init_mem"); + PixelData image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image) + + ///////////////////////////////// + /// Pipeline + //////////////////////// + // offset image by factor (this is required if there are zero areas in the background with + // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) + // Warning both of these could result in over-flow!
+ + if (std::is_same::value) { + bspline_offset = 100; + image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); + } else if (std::is_same::value) { + bspline_offset = 5; + image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); + } else { + image_temp.copyFromMesh(input_image); + } + + + + constexpr int numOfStreams = 3; + constexpr int repetitionsPerStream = 3; // + APRTimer ttt(true); + ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY"); + { + std::vector> gpts; + + //std::vector> gpts_futures; gpts_futures.resize(numOfStreams); + for (int i = 0; i < numOfStreams; ++i) { + gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); + } + + APRTimer t(true); + t.start_timer("-----------------------------> Whole GPU pipeline with repetitions"); + { + + APRTimer tt(false); + // Create streams and send initial task to do + for (int i = 0; i < numOfStreams; ++i) { + // gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); + tt.start_timer("SEND"); + gpts[i].sendDataToGpu(); + tt.stop_timer(); + // std::cout << "Send " << i << std::endl; + // gpts.back().processOnGpu(); + // std::cout << "Proc " << i << std::endl; + } + // Create streams and send initial task to do + for (int i = 0; i < numOfStreams; ++i) { + // gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[i]); + tt.start_timer("Process"); + gpts[i].processOnGpu(); + tt.stop_timer(); + // std::cout << "Proc " << i << std::endl; + } + std::cout << "=========" << std::endl; + + for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) { + int c = i % numOfStreams; + + // get data from previous task + // gpts_futures[c].get(); + auto linearAccessGpu = gpts[c].getDataFromGpu(); + // std::cout << "Get " << c << std::endl; + + // in theory, we get new data and send them to task + if (i < numOfStreams * (repetitionsPerStream - 1)) { + gpts[c].sendDataToGpu(); + // std::cout << "Send " << c << std::endl; + gpts[c].processOnGpu(); + // gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[c]); + // std::cout << "Proc " << c << std::endl; + } + + aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); + + // generateDatastructures(aAPR) for linearAcceess for CUDA + aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec); + aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec); + aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec); + aAPR.apr_initialized = true; + + // std::cout << "CUDA pipeline finished!\n"; + } + } + auto allT = t.stop_timer(); + std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n"; + } + auto allT = ttt.stop_timer(); + std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n"; + + return false; //TODO: change it back to true +} +#endif /** * Implementation of pipeline for CPU @@ -509,7 +624,8 @@ inline bool APRConverter::get_apr(APR &aAPR, PixelData &input_imag #ifndef APR_USE_CUDA return get_apr_cpu(aAPR, input_image); #else - return get_apr_cuda(aAPR, input_image); + // return get_apr_cuda(aAPR, input_image); + return get_apr_cuda_streams(aAPR, input_image); #endif } diff --git a/src/algorithm/APRParameters.hpp b/src/algorithm/APRParameters.hpp index f99a151b..4cd02c05 100644 --- 
a/src/algorithm/APRParameters.hpp +++ b/src/algorithm/APRParameters.hpp @@ -57,6 +57,7 @@ class APRParameters { os << "sigma_th_max=" << obj.sigma_th_max << "\n"; os << "auto_parameters=" << (obj.auto_parameters ? "true" : "false") << "\n"; os << "neighborhood_optimization=" << (obj.neighborhood_optimization ? "true" : "false") << "\n"; + os << "constant_intensity_scale=" << (obj.constant_intensity_scale ? "true" : "false") << "\n"; os << "output_steps=" << (obj.output_steps ? "true" : "false") << "\n"; return os; diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 24c50696..75d83d72 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -174,22 +174,22 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc isErrorDetected = false; isErrorDetectedCuda.copyH2D(); if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, isErrorDetectedCuda.get(), aStream); - if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, aStream); - if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream); - isErrorDetectedCuda.copyD2H(); - if (isErrorDetected) { - throw std::invalid_argument("integer under-/overflow encountered in CUDA bspline(XYZ)dir - " - "try squashing the input image to a narrower range or use APRConverter"); - } - - - runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); - - runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream); - - if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, isErrorDetectedCuda.get(), aStream); + // if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream); + // isErrorDetectedCuda.copyD2H(); + // if (isErrorDetected) { + // throw std::invalid_argument("integer under-/overflow encountered in CUDA bspline(XYZ)dir - " + // "try squashing the input image to a narrower range or use APRConverter"); + // } + // + // + // runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); + // + // runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream); + // + // if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + // if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + // if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); } class CurrentTime { @@ -361,27 +361,27 @@ public: splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetected, isErrorDetectedCuda, iBsplineOffset, iParameters, iStream); time.stop_timer(); - time.start_timer("intensity"); - runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), 
local_scale_temp2.get(), iStream); - time.stop_timer(); - - - // Apply parameters from APRConverter: - time.start_timer("runs...."); - runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream); - runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream); - runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream); - // TODO: automatic parameters are not implemented for GPU pipeline (yet) - time.stop_timer(); - - time.start_timer("compute lev"); - float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz)); - float level_factor = pow(2, iMaxLevel) * min_dim; - const float mult_const = level_factor/iParameters.rel_error; - runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream); - time.stop_timer(); - computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream); - computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream); + // time.start_timer("intensity"); + // runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream); + // time.stop_timer(); + // + // + // // Apply parameters from APRConverter: + // time.start_timer("runs...."); + // runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream); + // runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream); + // runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream); + // // TODO: automatic parameters are not implemented for GPU pipeline (yet) + // time.stop_timer(); + // + // time.start_timer("compute lev"); + // float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz)); + // float level_factor = pow(2, iMaxLevel) * min_dim; + // const float mult_const = level_factor/iParameters.rel_error; + // runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream); + // time.stop_timer(); + // computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream); + // computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream); } ~GpuProcessingTaskImpl() { @@ -446,7 +446,7 @@ void cudaFilterBsplineFull(PixelData &input, float lambda, float tolera BsplineParams p = prepareBsplineStuff((size_t)input.x_num, lambda, tolerance, maxFilterLen); auto cuda = transferSpline(p, aStream); auto splineCuda = cuda.first; - runBsplineXdir(cudaInput.get(), input.getDimension(), splineCuda, aStream); + runBsplineXdir(cudaInput.get(), input.getDimension(), splineCuda, error.get(), aStream); } if (flags & BSPLINE_Z_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.z_num, lambda, tolerance, maxFilterLen); diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 1593b5ab..b62d9f26 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -503,7 +503,7 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete bool constant_scale = false; if (par.constant_intensity_scale || (lis.number_active_dimensions == 0)) { - // include the case where the local intensity scale 
doesn't make sense due to the image being to small. + // include the case where the local intensity scale doesn't make sense due to the image being too small. // (This is for just edge cases and sanity checking) constant_scale = true; } diff --git a/src/algorithm/bsplineXdir.cuh b/src/algorithm/bsplineXdir.cuh index 1df52a80..e82132c6 100644 --- a/src/algorithm/bsplineXdir.cuh +++ b/src/algorithm/bsplineXdir.cuh @@ -127,24 +127,13 @@ __global__ void bsplineXdir(T *image, PixelDataDim dim, BsplineParamsCuda p, boo * Function for launching a kernel */ template -void runBsplineXdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaStream_t aStream) { +void runBsplineXdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, bool *error, cudaStream_t aStream) { constexpr int numOfWorkersYdir = 128; dim3 threadsPerBlockX(1, numOfWorkersYdir, 1); dim3 numBlocksX(1, (dim.y + threadsPerBlockX.y - 1) / threadsPerBlockX.y, (dim.z + threadsPerBlockX.z - 1) / threadsPerBlockX.z); - // In case of error this will be set to true by one of the kernels (CUDA does not guarantee which kernel will set global variable if more then one kernel - // access it but this is enough for us to know that somewhere in one on more kernels overflow was detected. - bool isErrorDetected = false; - { - ScopedCudaMemHandler error(&isErrorDetected, 1, aStream); - bsplineXdir <<>>(cudaImage, dim, p, error.get()); - } - - if (isErrorDetected) { - throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineXdir - " - "try squashing the input image to a narrower range or use APRConverter"); - } + bsplineXdir <<>>(cudaImage, dim, p, error); } #endif From 5534860b9c4729c824c9c6d0c2a4e381e083b66f Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 25 Jun 2025 11:06:14 +0200 Subject: [PATCH 64/80] added error handling for bspline z-dir --- src/algorithm/bsplineZdir.cuh | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/algorithm/bsplineZdir.cuh b/src/algorithm/bsplineZdir.cuh index 43550ff8..a2c76053 100644 --- a/src/algorithm/bsplineZdir.cuh +++ b/src/algorithm/bsplineZdir.cuh @@ -129,7 +129,7 @@ __global__ void bsplineZdir(T *image, PixelDataDim dim, BsplineParamsCuda p, boo * Function for launching a kernel */ template -void runBsplineZdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaStream_t aStream) { +void runBsplineZdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, bool *error, cudaStream_t aStream) { constexpr int numOfWorkersYdir = 128; dim3 threadsPerBlockZ(1, numOfWorkersYdir, 1); dim3 numBlocksZ(1, @@ -137,16 +137,7 @@ void runBsplineZdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaSt (dim.x + threadsPerBlockZ.x - 1) / threadsPerBlockZ.x); // In case of error this will be set to true by one of the kernels (CUDA does not guarantee which kernel will set global variable if more then one kernel // access it but this is enough for us to know that somewhere in one on more kernels overflow was detected. 
- bool isErrorDetected = false; - { - ScopedCudaMemHandler error(&isErrorDetected, 1, aStream); - bsplineZdir <<>> (cudaImage, dim, p, error.get()); - } - - if (isErrorDetected) { - throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineZdir - " - "try squashing the input image to a narrower range or use APRConverter"); - } + bsplineZdir <<>> (cudaImage, dim, p, error); } #endif From 1b54cd058a23b605ab42516d12807ff565ab9406 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 1 Aug 2025 16:16:01 +0200 Subject: [PATCH 65/80] Stream operations on GPU are working now as expected. All tests are fixed. Still APRConverter for streams must be fixed/improved since there is a draft code. --- src/algorithm/APRConverter.hpp | 225 ++++++++++----- src/algorithm/APRParameters.hpp | 7 +- src/algorithm/ComputeGradient.hpp | 2 +- src/algorithm/ComputeGradientCuda.cu | 256 +++++++++++++----- src/algorithm/LocalIntensityScale.cu | 36 ++- src/algorithm/LocalIntensityScale.cuh | 2 +- src/algorithm/bsplineYdir.cuh | 2 +- src/algorithm/invBspline.cuh | 5 +- .../APR/access/GenInfoGpuAccess.cuh | 112 ++++++++ .../APR/access/LinearAccessCuda.cu | 202 ++++---------- .../APR/access/LinearAccessCuda.hpp | 5 +- src/data_structures/Mesh/paddPixelData.cuh | 4 +- src/misc/CudaTools.cuh | 2 +- test/APRTest.cpp | 36 +-- test/FullPipelineCudaTest.cpp | 51 ++++ 15 files changed, 619 insertions(+), 328 deletions(-) create mode 100644 src/data_structures/APR/access/GenInfoGpuAccess.cuh diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp index 5c3cbb06..6141887c 100644 --- a/src/algorithm/APRConverter.hpp +++ b/src/algorithm/APRConverter.hpp @@ -182,7 +182,7 @@ void APRConverter::get_apr_custom_grad_scale(APR& aAPR,PixelData::get_apr_cuda(APR &aAPR, PixelData& input // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) // Warning both of these could result in over-flow! - if (std::is_same::value) { - bspline_offset = 100; - image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); - } else if (std::is_same::value) { - bspline_offset = 5; - image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); - } else { + if (std::is_floating_point::value) { image_temp.copyFromMesh(input_image); + } else { + bspline_offset = compute_bspline_offset(input_image, par.lambda); + image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); } GpuProcessingTask gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()); + // std::cout << "after gpt \n"; gpt.sendDataToGpu(); gpt.processOnGpu(); auto linearAccessGpu = gpt.getDataFromGpu(); @@ -479,76 +477,169 @@ inline bool APRConverter::get_apr_cuda_streams(APR &aAPR, PixelData Whole GPU pipeline with repetitions and MEMORY"); - { - std::vector> gpts; - - //std::vector> gpts_futures; gpts_futures.resize(numOfStreams); - for (int i = 0; i < numOfStreams; ++i) { - gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); - } + constexpr int numOfStreams = 3; // number of streams to use for parallel processing + constexpr int repetitionsPerStream = 15; // number of repetitions per stream to simulate processing of multiple images + bool useThreads = true; - APRTimer t(true); - t.start_timer("-----------------------------> Whole GPU pipeline with repetitions"); + if (useThreads) { + std::cout << "\n!!! 
USING THREADS !!!\n\n"; + APRTimer ttt(true); + std::cout << ">>>>>>>>>>> START\n"; + ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY"); { + APRTimer t(true); + std::vector> gpts; - APRTimer tt(false); - // Create streams and send initial task to do + t.start_timer("Creating GPTS"); + std::vector> gpts_futures; gpts_futures.resize(numOfStreams); for (int i = 0; i < numOfStreams; ++i) { - // gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); - tt.start_timer("SEND"); - gpts[i].sendDataToGpu(); - tt.stop_timer(); - // std::cout << "Send " << i << std::endl; - // gpts.back().processOnGpu(); - // std::cout << "Proc " << i << std::endl; + gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); } - // Create streams and send initial task to do - for (int i = 0; i < numOfStreams; ++i) { - // gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[i]); - tt.start_timer("Process"); - gpts[i].processOnGpu(); - tt.stop_timer(); - // std::cout << "Proc " << i << std::endl; - } - std::cout << "=========" << std::endl; - - for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) { - int c = i % numOfStreams; - - // get data from previous task - // gpts_futures[c].get(); - auto linearAccessGpu = gpts[c].getDataFromGpu(); - // std::cout << "Get " << c << std::endl; - - // in theory, we get new data and send them to task - if (i < numOfStreams * (repetitionsPerStream - 1)) { - gpts[c].sendDataToGpu(); - // std::cout << "Send " << c << std::endl; - gpts[c].processOnGpu(); - // gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[c]); - // std::cout << "Proc " << c << std::endl; + t.stop_timer(); + + t.start_timer("-----------------------------> Whole GPU pipeline with repetitions"); + { + APRTimer tt(false); + // Create streams and send initial task to do + for (int i = 0; i < numOfStreams; ++i) { + // gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); + tt.start_timer("SEND"); + // gpts[i].sendDataToGpu(); + // gpts[i].processOnGpu(); + tt.stop_timer(); + // std::cout << "Send " << i << std::endl; + // gpts.back().processOnGpu(); + // std::cout << "Proc " << i << std::endl; } + // Create streams and send initial task to do + for (int i = 0; i < numOfStreams; ++i) { + gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[i]); + // tt.start_timer("Process"); + // gpts[i].processOnGpu(); + // tt.stop_timer(); + // std::cout << "Proc " << i << std::endl; + } + std::cout << "=========" << std::endl; + + for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) { + int c = i % numOfStreams; - aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); + // get data from previous task + gpts_futures[c].get(); + auto linearAccessGpu = gpts[c].getDataFromGpu(); - // generateDatastructures(aAPR) for linearAcceess for CUDA - aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec); - aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec); - aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec); - aAPR.apr_initialized = true; + // in theory, we get new data and send them to task + if (i < numOfStreams * (repetitionsPerStream - 1)) { + // gpts[c].sendDataToGpu(); + // std::cout << "Send " << c << std::endl; + // gpts[c].processOnGpu(); + gpts_futures[c] = 
std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[c]); + // std::cout << "Proc " << c << std::endl; + } - // std::cout << "CUDA pipeline finished!\n"; + aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); + + // generateDatastructures(aAPR) for linearAcceess for CUDA + aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec); + aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec); + aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec); + aAPR.apr_initialized = true; + + // std::cout << "CUDA pipeline finished!\n"; + } + // cudaDeviceSynchronize(); + } + auto allT = t.stop_timer(); + std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n"; + std::cout << "Bandwidth:" << (input_image.size() / (allT / (numOfStreams*repetitionsPerStream)) / 1024 / 1024) << " MB/s\n"; + } + auto allT = ttt.stop_timer(); + float tpi = allT / (numOfStreams*repetitionsPerStream); + std::cout << "Time per image: " << tpi << " seconds\n"; + std::cout << "Image size: " << (input_image.size() / 1024 / 1024) << " MB\n"; + std::cout << "Bandwidth:" << (input_image.size() / tpi / 1024 / 1024) << " MB/s\n"; + + + std::cout << "<<<<<<<<<<<< STOP\n"; + } + else { + APRTimer ttt(true); + std::cout << ">>>>>>>>>>> START\n"; + ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY"); + { + APRTimer t(true); + std::vector> gpts; + + t.start_timer("Creating GPTS"); + //std::vector> gpts_futures; gpts_futures.resize(numOfStreams); + for (int i = 0; i < numOfStreams; ++i) { + gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); + } + // cudaDeviceSynchronize(); + t.stop_timer(); + + t.start_timer("-----------------------------> Whole GPU pipeline with repetitions"); + { + + APRTimer tt(false); + // Create streams and send initial task to do + for (int i = 0; i < numOfStreams; ++i) { + // gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); + tt.start_timer("SEND"); + gpts[i].sendDataToGpu(); + gpts[i].processOnGpu(); + tt.stop_timer(); + // std::cout << "Send " << i << std::endl; + // gpts.back().processOnGpu(); + // std::cout << "Proc " << i << std::endl; + } + // Create streams and send initial task to do + for (int i = 0; i < numOfStreams; ++i) { + // gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[i]); + tt.start_timer("Process"); + // gpts[i].processOnGpu(); + tt.stop_timer(); + // std::cout << "Proc " << i << std::endl; + } + std::cout << "=========" << std::endl; + + for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) { + int c = i % numOfStreams; + + // get data from previous task + // gpts_futures[c].get(); + auto linearAccessGpu = gpts[c].getDataFromGpu(); + // std::cout << "Get " << c << std::endl; + + // in theory, we get new data and send them to task + if (i < numOfStreams * (repetitionsPerStream - 1)) { + gpts[c].sendDataToGpu(); + // std::cout << "Send " << c << std::endl; + gpts[c].processOnGpu(); + // gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[c]); + // std::cout << "Proc " << c << std::endl; + } + + aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); + + // generateDatastructures(aAPR) for linearAcceess for CUDA + aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec); + aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec); + 
aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec); + aAPR.apr_initialized = true; + + // std::cout << "CUDA pipeline finished!\n"; + } + // cudaDeviceSynchronize(); } + auto allT = t.stop_timer(); + std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n"; } - auto allT = t.stop_timer(); + auto allT = ttt.stop_timer(); std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n"; + std::cout << "<<<<<<<<<<<< STOP\n"; } - auto allT = ttt.stop_timer(); - std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n"; + return false; //TODO: change it back to true } @@ -624,8 +715,8 @@ inline bool APRConverter::get_apr(APR &aAPR, PixelData &input_imag #ifndef APR_USE_CUDA return get_apr_cpu(aAPR, input_image); #else - // return get_apr_cuda(aAPR, input_image); - return get_apr_cuda_streams(aAPR, input_image); + return get_apr_cuda(aAPR, input_image); + // return get_apr_cuda_streams(aAPR, input_image); #endif } diff --git a/src/algorithm/APRParameters.hpp b/src/algorithm/APRParameters.hpp index 4cd02c05..2ff0ec8e 100644 --- a/src/algorithm/APRParameters.hpp +++ b/src/algorithm/APRParameters.hpp @@ -55,11 +55,16 @@ class APRParameters { os << "rel_error=" << obj.rel_error << "\n"; os << "sigma_th=" << obj.sigma_th << "\n"; os << "sigma_th_max=" << obj.sigma_th_max << "\n"; + os << "grad_th=" << obj.grad_th << "\n"; os << "auto_parameters=" << (obj.auto_parameters ? "true" : "false") << "\n"; + os << "reflect_bc_lis=" << (obj.reflect_bc_lis ? "true" : "false") << "\n"; + os << "check_input=" << (obj.check_input ? "true" : "false") << "\n"; + os << "swap_dimensions=" << (obj.swap_dimensions ? "true" : "false") << "\n"; os << "neighborhood_optimization=" << (obj.neighborhood_optimization ? "true" : "false") << "\n"; os << "constant_intensity_scale=" << (obj.constant_intensity_scale ? "true" : "false") << "\n"; os << "output_steps=" << (obj.output_steps ? 
"true" : "false") << "\n"; - + os << "dx/dy/dz=" << obj.dx << "/" << obj.dy << "/" << obj.dz << "\n"; + os << "psfx/psfy/psfz=" << obj.psfx << "/" << obj.psfy << "/" << obj.psfz << "\n"; return os; } diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp index 6b682fdf..46eb1f37 100644 --- a/src/algorithm/ComputeGradient.hpp +++ b/src/algorithm/ComputeGradient.hpp @@ -364,7 +364,7 @@ round(float val, size_t &errCount) { if(val < std::numeric_limits::min() || val > std::numeric_limits::max()) { errCount++; - std::cout << val << " " << (float)std::numeric_limits::min() << " " << (float)std::numeric_limits::max() << std::endl; + // std::cout << val << " " << (float)std::numeric_limits::min() << " " << (float)std::numeric_limits::max() << std::endl; } return val; } diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 75d83d72..dc55207d 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -6,6 +6,9 @@ #include #include "ComputeGradientCuda.hpp" + +#include + #include "APRParameters.hpp" #include "data_structures/Mesh/PixelData.hpp" #include "data_structures/Mesh/downsample.cuh" @@ -24,6 +27,7 @@ #include "bsplineYdir.cuh" #include "bsplineZdir.cuh" +#include "data_structures/APR/access/GenInfoGpuAccess.cuh" namespace { @@ -58,6 +62,10 @@ namespace { BsplineParams prepareBsplineStuff(size_t dimLen, float lambda, float tol, int maxFilterLen = -1) { + // TODO: for lambda == 0 this function should return empty BsplineParams, for now changing lambda + // to generate anything (if lambda would stay 0 we get huge vectors out of range). + if (lambda == 0) lambda = 0.1; + // Recursive Filter Implimentation for Smoothing BSplines // B-Spline Signal Processing: Part II - Efficient Design and Applications, Unser 1993 @@ -80,8 +88,8 @@ namespace { const float norm_factor = powf((1 - 2.0 * rho * cosf(omg) + powf(rho, 2)), 2); -// std::cout << std::fixed << std::setprecision(9) << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 -// << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << " lambda=" << lambda << " tol=" << tol << std::endl; + // std::cout << std::fixed << std::setprecision(9) << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 + // << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << " lambda=" << lambda << " tol=" << tol << std::endl; // ------- Calculating boundary conditions @@ -168,28 +176,33 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc BsplineParamsCuda &px, BsplineParamsCuda &py, BsplineParamsCuda &pz, float *boundary, bool &isErrorDetected, ScopedCudaMemHandler& isErrorDetectedCuda, float bspline_offset, const APRParameters &par, cudaStream_t aStream) { - // TODO: Used PixelDataDim in all methods below and change input parameter from image to imageDim - isErrorDetected = false; - isErrorDetectedCuda.copyH2D(); - if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, isErrorDetectedCuda.get(), aStream); - if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, isErrorDetectedCuda.get(), aStream); - // if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream); - // isErrorDetectedCuda.copyD2H(); - // if (isErrorDetected) { - // throw std::invalid_argument("integer under-/overflow encountered in CUDA bspline(XYZ)dir - " - // "try 
squashing the input image to a narrower range or use APRConverter"); - // } - // - // - // runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); - // - // runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream); - // - // if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - // if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - // if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + // TODO: (APRstreams) isErrorDetected should be handled differently, in current state it blocks streams from + // running in parallel + if (par.lambda > 0) { + isErrorDetected = false; + isErrorDetectedCuda.copyH2D(); + + if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, isErrorDetectedCuda.get(), aStream); + if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, isErrorDetectedCuda.get(), aStream); + if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, isErrorDetectedCuda.get(), aStream); + + isErrorDetectedCuda.copyD2H(); + checkCuda(cudaStreamSynchronize(aStream)); + if (isErrorDetected) { + throw std::invalid_argument("integer under-/overflow encountered in CUDA bspline(XYZ)dir - " + "try squashing the input image to a narrower range or use APRConverter"); + } + } + runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); + runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream); + + if (par.lambda > 0) { + if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + } } class CurrentTime { @@ -233,6 +246,28 @@ void runThreshold(ImgType *cudaImage, T *cudaGrad, size_t x_num, size_t y_num, s threshold<<>>(cudaImage, cudaGrad, x_num * y_num * z_num, Ip_th); }; +/** + * Thresholds output basing on input values. When input is < thresholdLevel then output is set to 0 and is not changed otherwise. 
+ * @param input + * @param output + * @param length - len of input/output arrays + * @param thresholdLevel + */ +template +__global__ void thresholdOpen(const T *input, S *output, size_t length, float thresholdLevel) { + size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; + if (idx < length) { + if (input[idx] < thresholdLevel) { output[idx] = 0; } + } +} + +template +void runThresholdOpen(ImgType *cudaImage, T *cudaGrad, size_t x_num, size_t y_num, size_t z_num, float Ip_th, cudaStream_t aStream) { + dim3 threadsPerBlock(64); + dim3 numBlocks((x_num * y_num * z_num + threadsPerBlock.x - 1)/threadsPerBlock.x); + thresholdOpen<<>>(cudaImage, cudaGrad, x_num * y_num * z_num, Ip_th); +}; + template __global__ void rescaleAndThreshold(T *data, size_t len, float sigmaThreshold, float sigmaThresholdMax) { const float max_th = 60000.0; @@ -253,11 +288,39 @@ void runRescaleAndThreshold(T *data, size_t len, float sigma, float sigmaMax, cu rescaleAndThreshold <<< numBlocks, threadsPerBlock, 0, aStream >>> (data, len, sigma, sigmaMax); } +class CudaStream { + cudaStream_t iStream; + + /** + * @return newly created stream + */ + cudaStream_t getStream() { + cudaStream_t stream; + cudaStreamCreate(&stream); + return stream; + } + +public: + CudaStream() { + iStream = getStream(); + } + + ~CudaStream() { + cudaStreamDestroy(iStream); + } + + cudaStream_t get() const { + return iStream; + } +}; template template class GpuProcessingTask::GpuProcessingTaskImpl { + CudaStream cudaStream; + const cudaStream_t iStream; + // input data const PixelData &iCpuImage; PixelData &iCpuLevels; @@ -267,12 +330,12 @@ class GpuProcessingTask::GpuProcessingTaskImpl { int iMaxLevel; // cuda stuff - memory and stream to be used - const cudaStream_t iStream; ScopedCudaMemHandler, JUST_ALLOC> image; ScopedCudaMemHandler, JUST_ALLOC> gradient; ScopedCudaMemHandler, JUST_ALLOC> local_scale_temp; ScopedCudaMemHandler, JUST_ALLOC> local_scale_temp2; + // bspline stuff const float tolerance = 0.0001; std::pair cudax; @@ -281,7 +344,10 @@ class GpuProcessingTask::GpuProcessingTaskImpl { BsplineParamsCuda splineCudaX; BsplineParamsCuda splineCudaY; BsplineParamsCuda splineCudaZ; - bool isErrorDetected; + + + // bool isErrorDetected; + VectorData isErrorDetectedPinned; ScopedCudaMemHandler isErrorDetectedCuda; const size_t boundaryLen; @@ -289,17 +355,22 @@ class GpuProcessingTask::GpuProcessingTaskImpl { ParticleCellTreeCuda pctc; - ScopedCudaMemHandler y_vec; // for LinearAccess + ScopedCudaMemHandler y_vec_cuda; // for LinearAccess LinearAccessCudaStructs lacs; - /** - * @return newly created stream - */ - cudaStream_t getStream() { - cudaStream_t stream; - cudaStreamCreate(&stream); - return stream; - } + // Padded memory for local_scale_temp and local_scale_temp2 + ScopedCudaMemHandler lstPadded; + ScopedCudaMemHandler lst2Padded; + + + // Structures used by computeLinearStructureCuda + VectorData xz_end_vec; + VectorData level_xz_vec; + VectorData y_vec; + ScopedCudaMemHandler xz_end_vec_cuda; //(xz_end_vec.data(), xz_end_vec.size(), aStream); + ScopedCudaMemHandler level_xz_vec_cuda; //(level_xz_vec.data(), level_xz_vec.size(), aStream); + GenInfoGpuAccess giga; + uint64_t counter_total = 1; public: @@ -309,7 +380,7 @@ public: GpuProcessingTaskImpl(const PixelData &inputImage, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) : iCpuImage(inputImage), iCpuLevels(levels), - iStream(getStream()), + iStream(cudaStream.get()), image (inputImage, iStream), gradient (levels, iStream), 
local_scale_temp (levels, iStream), @@ -321,11 +392,16 @@ public: cudax(transferSpline(prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance), iStream)), cuday(transferSpline(prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance), iStream)), cudaz(transferSpline(prepareBsplineStuff(iCpuImage.z_num, iParameters.lambda, tolerance), iStream)), - isErrorDetectedCuda(&isErrorDetected, 1, iStream), + isErrorDetectedPinned(true), + isErrorDetectedCuda(nullptr, 1, iStream), boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num}, boundary{nullptr, boundaryLen, iStream}, pctc(iAprInfo, iStream), - y_vec(nullptr, iAprInfo.getSize(), iStream) + y_vec_cuda(nullptr, iAprInfo.getSize(), iStream), + xz_end_vec(true), + level_xz_vec(true), + y_vec(true), + giga(iAprInfo, iStream) { splineCudaX = cudax.first; splineCudaY = cuday.first; @@ -333,61 +409,95 @@ public: std::cout << "\n=============== GpuProcessingTaskImpl ===================" << iStream << "\n\n"; // std::cout << iCpuImage << std::endl; // std::cout << iCpuLevels << std::endl; + + // In LIS we have: var_win[0,1,2] = maximum 3 var_win[3,4,5] = maximum 6 + // so maximum paddSize is 6 6 6 + PixelDataDim maxPaddSize(6, 6, 6); + PixelDataDim paddedImageSize = levels.getDimension() + maxPaddSize + maxPaddSize; + lstPadded.initialize(nullptr, paddedImageSize.size(), iStream); + lst2Padded.initialize(nullptr, paddedImageSize.size(), iStream); + + + // initialize_xz_linear() - CPU impl. + counter_total = 1; //the buffer val to allow -1 calls without checking. + level_xz_vec.resize(iAprInfo.l_max + 2, 0); //includes a buffer for -1 calls, and therefore needs to be called with level + 1; + level_xz_vec[0] = 1; //allowing for the offset. + for (int i = 0; i <= iAprInfo.l_max; ++i) { + counter_total += iAprInfo.x_num[i] * iAprInfo.z_num[i]; + level_xz_vec[i + 1] = counter_total; + } + xz_end_vec.resize(counter_total, 0); + // std::cout << "----------- iAprInfo.getSize() = " << iAprInfo.getSize() << std::endl; + y_vec.resize(iAprInfo.getSize()); // resize it to worst case -> same number particles as pixels in input image + // std::cout << "----------- iAprInfo.getSize() = " << iAprInfo.getSize() << std::endl; + xz_end_vec_cuda.initialize(xz_end_vec.data(), xz_end_vec.size(), iStream); + level_xz_vec_cuda.initialize(level_xz_vec.data(), level_xz_vec.size(), iStream); + + isErrorDetectedPinned.resize(1); + isErrorDetectedCuda.initialize(isErrorDetectedPinned.data(), 1, iStream); } void sendDataToGpu() { -// CurrentTime ct; -// uint64_t start = ct.microseconds(); - image.copyH2D(); -// checkCuda(cudaStreamSynchronize(iStream)); -// std::cout << "SEND time: " << ct.microseconds() - start << std::endl; + // sends data in processOnGpu() + // in multi-stream implementation it is done in threads so is not blocking current operations. 
} LinearAccessCudaStructs getDataFromGpu() { - // TODO: Temporarily turned off here since synchronized already in computeLinearStructureCuda - // checkCuda(cudaStreamSynchronize(iStream)); - return std::move(lacs); } void processOnGpu() { - // image.copyH2D(); + image.copyH2D(); CurrentTime ct{}; uint64_t start = ct.microseconds(); CudaTimer time(false, "PIPELINE"); time.start_timer("getgradient"); getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), - splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetected, isErrorDetectedCuda, + splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetectedPinned[0], isErrorDetectedCuda, iBsplineOffset, iParameters, iStream); time.stop_timer(); - // time.start_timer("intensity"); - // runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream); - // time.stop_timer(); - // - // - // // Apply parameters from APRConverter: - // time.start_timer("runs...."); - // runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream); - // runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream); - // runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream); - // // TODO: automatic parameters are not implemented for GPU pipeline (yet) - // time.stop_timer(); - // - // time.start_timer("compute lev"); - // float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz)); - // float level_factor = pow(2, iMaxLevel) * min_dim; - // const float mult_const = level_factor/iParameters.rel_error; - // runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream); - // time.stop_timer(); - // computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream); - // computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream); - } + time.start_timer("intensity"); + runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), lstPadded.get(), lst2Padded.get(), iStream); + time.stop_timer(); - ~GpuProcessingTaskImpl() { - cudaStreamDestroy(iStream); -// std::cout << "\n============== ~GpuProcessingTaskImpl ===================\n\n"; + // Apply parameters from APRConverter: + time.start_timer("runs...."); + runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream); + runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream); + runThresholdOpen(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream); + // TODO: automatic parameters are not implemented for GPU pipeline (yet) + time.stop_timer(); + + time.start_timer("compute lev"); + float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz)); + float level_factor = pow(2, iMaxLevel) * min_dim; + const float mult_const = level_factor/iParameters.rel_error; + runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream); + time.stop_timer(); + computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream); + + + level_xz_vec_cuda.copyH2D(); + iAprInfo.total_number_particles = 0; // 
reset total_number_particles to 0 + giga.copyHtoD(); + computeLinearStructureCuda(y_vec_cuda.get(), xz_end_vec_cuda.get(), level_xz_vec_cuda.get(), pctc, iAprInfo, giga, iParameters, counter_total, iStream); + + xz_end_vec_cuda.copyD2H(); + + // Trim buffer to calculated size (initially it is allocated to worst case - same number of particles as pixels in input image) + y_vec.resize(iAprInfo.total_number_particles); + + checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda.get(), iAprInfo.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, iStream)); + checkCuda(cudaStreamSynchronize(iStream)); + + // Prepare CPU structures + lacs.xz_end_vec.copy(xz_end_vec); + lacs.level_xz_vec.copy(level_xz_vec); + lacs.y_vec.copy(y_vec); } + + ~GpuProcessingTaskImpl() {} }; template @@ -410,6 +520,8 @@ template void GpuProcessingTask::processOnGpu() {impl->processOnGpu();} // explicit instantiation of handled types +template class GpuProcessingTask; +template class GpuProcessingTask; template class GpuProcessingTask; template class GpuProcessingTask; @@ -452,7 +564,7 @@ void cudaFilterBsplineFull(PixelData &input, float lambda, float tolera BsplineParams p = prepareBsplineStuff((size_t)input.z_num, lambda, tolerance, maxFilterLen); auto cuda = transferSpline(p, aStream); auto splineCuda = cuda.first; - runBsplineZdir(cudaInput.get(), input.getDimension(), splineCuda, aStream); + runBsplineZdir(cudaInput.get(), input.getDimension(), splineCuda, error.get(), aStream); } waitForCuda(); diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index b62d9f26..ec2eee51 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -486,7 +486,7 @@ void runConstantScale(S *image, PixelDataDim &dim, cudaStream_t aStream) { } template -void runLocalIntensityScalePipeline(const PixelData &image, const APRParameters &par, S *cudaImage, S *cudaTemp, cudaStream_t aStream) { +void runLocalIntensityScalePipeline(const PixelData &image, const APRParameters &par, S *cudaImage, S *cudaTemp, S *lstPadded, S *lst2Padded, cudaStream_t aStream) { float var_rescale; std::vector var_win; auto lis = LocalIntensityScale(); @@ -511,29 +511,18 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete PixelDataDim imageSize = image.getDimension(); if (!constant_scale) { - CudaMemoryUniquePtr paddedImage; - CudaMemoryUniquePtr paddedTemp; PixelDataDim paddSize(std::max(win_y, win_y2), std::max(win_x, win_x2), std::max(win_z, win_z2)); PixelDataDim paddedImageSize = imageSize + paddSize + paddSize; // padding on both ends of each dimension - S *ci = cudaImage; S *ct = cudaTemp; PixelDataDim dim = image.getDimension(); if (par.reflect_bc_lis) { - // padd - S *mem = nullptr; - checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); - paddedImage.reset(mem); - mem = nullptr; - checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); - paddedTemp.reset(mem); - - runPaddPixels(cudaImage, paddedImage.get(), imageSize, paddedImageSize, paddSize, aStream); - runPaddPixels(cudaTemp, paddedTemp.get(), imageSize, paddedImageSize, paddSize, aStream); - - ci = paddedImage.get(); - ct = paddedTemp.get(); + runPaddPixels(cudaImage, lstPadded, imageSize, paddedImageSize, paddSize, aStream); + runPaddPixels(cudaTemp, lst2Padded, imageSize, paddedImageSize, paddSize, aStream); + + ci = lstPadded; //paddedImage.get(); + ct = lst2Padded; //paddedTemp.get(); dim = paddedImageSize; } @@ -555,7 +544,7 @@ void 
runLocalIntensityScalePipeline(const PixelData &image, const APRParamete } } -template void runLocalIntensityScalePipeline(const PixelData&, const APRParameters&, float*, float*, cudaStream_t); +template void runLocalIntensityScalePipeline(const PixelData&, const APRParameters&, float*, float*, float*, float*, cudaStream_t); @@ -582,6 +571,15 @@ void getLocalIntensityScale(PixelData &image, PixelData &temp, const APRPa ScopedCudaMemHandler, H2D | D2H> cudaImage(image, aStream); ScopedCudaMemHandler, D2H> cudaTemp(temp, aStream); - runLocalIntensityScalePipeline(image, par, cudaImage.get(), cudaTemp.get(), aStream); + // Padded memory for local_scale_temp and local_scale_temp2 + ScopedCudaMemHandler lstPadded; + ScopedCudaMemHandler lst2Padded; + + PixelDataDim maxPaddSize(6, 6, 6); + PixelDataDim paddedImageSize = image.getDimension() + maxPaddSize + maxPaddSize; + lstPadded.initialize(nullptr, paddedImageSize.size(), aStream); + lst2Padded.initialize(nullptr, paddedImageSize.size(), aStream); + + runLocalIntensityScalePipeline(image, par, cudaImage.get(), cudaTemp.get(), lstPadded.get(), lst2Padded.get(), aStream); } template void getLocalIntensityScale(PixelData&, PixelData&, const APRParameters&); diff --git a/src/algorithm/LocalIntensityScale.cuh b/src/algorithm/LocalIntensityScale.cuh index f48dcc20..4a707d58 100644 --- a/src/algorithm/LocalIntensityScale.cuh +++ b/src/algorithm/LocalIntensityScale.cuh @@ -5,6 +5,6 @@ #include "algorithm/APRParameters.hpp" template -void runLocalIntensityScalePipeline(const PixelData &image, const APRParameters &par, S *cudaImage, S *cudaTemp, cudaStream_t aStream); +void runLocalIntensityScalePipeline(const PixelData &image, const APRParameters &par, S *cudaImage, S *cudaTemp, S *lstPadded, S *lst2Padded, cudaStream_t aStream); #endif \ No newline at end of file diff --git a/src/algorithm/bsplineYdir.cuh b/src/algorithm/bsplineYdir.cuh index c49391f9..1b6ad3db 100644 --- a/src/algorithm/bsplineYdir.cuh +++ b/src/algorithm/bsplineYdir.cuh @@ -138,7 +138,7 @@ __global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, BsplineParamsCud } constexpr int blockWidth = 32; -constexpr int numOfThreads = 32; +constexpr int numOfThreads = 64; extern __shared__ char sharedMemProcess[]; template __global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, BsplineParamsCuda p, float *boundary, bool *error) { diff --git a/src/algorithm/invBspline.cuh b/src/algorithm/invBspline.cuh index 7c27d853..85f585cf 100644 --- a/src/algorithm/invBspline.cuh +++ b/src/algorithm/invBspline.cuh @@ -39,6 +39,7 @@ __global__ void invBsplineYdir(T *image, size_t x_num, size_t y_num, size_t z_nu template void runInvBsplineYdir(T* cudaInput, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) { + // Maximum numOfWorkers is 32 since for y-direction __shfl_sync is used which works with 1 warp (32 threads) only. 
constexpr int numOfWorkers = 32; dim3 threadsPerBlock(1, numOfWorkers, 1); dim3 numBlocks((x_num + threadsPerBlock.x - 1) / threadsPerBlock.x, @@ -77,7 +78,7 @@ __global__ void invBsplineXdir(T *image, size_t x_num, size_t y_num, size_t z_nu template void runInvBsplineXdir(T* cudaInput, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) { - constexpr int numOfWorkers = 32; + constexpr int numOfWorkers = 64; dim3 threadsPerBlock(1, numOfWorkers, 1); dim3 numBlocks(1, (y_num + threadsPerBlock.y - 1) / threadsPerBlock.y, @@ -115,7 +116,7 @@ __global__ void invBsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_nu template void runInvBsplineZdir(T* cudaInput, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) { - constexpr int numOfWorkers = 32; + constexpr int numOfWorkers = 64; dim3 threadsPerBlock(1, numOfWorkers, 1); dim3 numBlocks((x_num + threadsPerBlock.x - 1) / threadsPerBlock.x, (y_num + threadsPerBlock.y - 1) / threadsPerBlock.y, diff --git a/src/data_structures/APR/access/GenInfoGpuAccess.cuh b/src/data_structures/APR/access/GenInfoGpuAccess.cuh new file mode 100644 index 00000000..3758dc9e --- /dev/null +++ b/src/data_structures/APR/access/GenInfoGpuAccess.cuh @@ -0,0 +1,112 @@ +#ifndef GEN_INFO_GPU_ACCESS_CUH +#define GEN_INFO_GPU_ACCESS_CUH + +// CUDA version of GenInfo structure +typedef struct GenInfoCuda_t { + int l_min; + int l_max; + + int *org_dims; // fixed size: [3] + + uint8_t number_dimensions; + + int *x_num; + int *y_num; + int *z_num; + + // this differs from the original GenInfo structure + // since we need to be able to send data back from GPU to CPU + uint64_t *total_number_particles; + + int *level_size; + + uint64_t get_total_number_particles() const { return *total_number_particles; } + + __device__ int level_max() const { return l_max; } + __device__ int level_min() const { return l_min; } + +} GenInfoCuda; + +// ----------------------------- + +/* + * Class for easy transferring of the GenInfo structure to/from the GPU. + */ +class GenInfoGpuAccess { + GenInfo &gi; + + cudaStream_t iStream; + + ScopedCudaMemHandler org_dims; + ScopedCudaMemHandler x_num; + ScopedCudaMemHandler y_num; + ScopedCudaMemHandler z_num; + + // total_number_particles_pinned is used as a "middle" variable for transferring the value to/from the GPU. + // The reason behind this is that the original GenInfo structure keeps total_number_particles in unpinned memory. + // Transferring such data causes all streams to synchronize. To avoid that we do the following: + // GPU.total_number_particles -> total_number_particles_pinned -> CPU.total_number_particles + // and the opposite direction when copying to the GPU. + // In that way we do not 'break' streams.
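+    // (Note: cudaMemcpyAsync is only truly asynchronous when the host buffer is page-locked; with pageable host memory the copy degrades to a blocking transfer.)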
+ VectorData total_number_particles_pinned; + ScopedCudaMemHandler total_number_particles; + + ScopedCudaMemHandler level_size; + + +public: + GenInfoGpuAccess(GenInfo &genInfo, cudaStream_t cudaStream) : + gi(genInfo), + iStream(cudaStream), + org_dims(gi.org_dims, 3, iStream), + x_num(gi.x_num.data(), gi.x_num.size(), iStream), + y_num(gi.y_num.data(), gi.y_num.size(), iStream), + z_num(gi.z_num.data(), gi.z_num.size(), iStream), + total_number_particles_pinned(true), + level_size(gi.level_size.data(), gi.level_size.size(), iStream) + { + total_number_particles_pinned.resize(1); + total_number_particles_pinned[0] = gi.total_number_particles; + total_number_particles.initialize(total_number_particles_pinned.data(), 1, iStream); + } + + GenInfoCuda getGenInfoCuda() { + GenInfoCuda gic; + + gic.l_min = gi.l_min; + gic.l_max = gi.l_max; + gic.org_dims = org_dims.get(); + gic.number_dimensions = gi.number_dimensions; + gic.x_num = x_num.get(); + gic.y_num = y_num.get(); + gic.z_num = z_num.get(); + gic.total_number_particles = total_number_particles.get(); + gic.level_size = level_size.get(); + + return gic; + } + + ~GenInfoGpuAccess() { + // TODO: When freeing stream in GpuProcessingTaskImpl is fixed this should be uncommented + // copyDtoH(); + } + + void copyHtoD() { + // The only data that can change between CPU & GPU (the rest values are fixed based on input image dimension) + + // Check description of 'total_number_particles_pinned' for explanation + total_number_particles_pinned[0] = gi.total_number_particles; + total_number_particles.copyH2D(); + } + + void copyDtoH() { + // The only data that can change between CPU & GPU (the rest values are fixed based on input image dimension) + + // Check description of 'total_number_particles_pinned' for explanation + total_number_particles.copyD2H(); + checkCuda(cudaStreamSynchronize(iStream)); + gi.total_number_particles = total_number_particles_pinned[0]; + } +}; + +#endif // GEN_INFO_GPU_ACCESS_CUH \ No newline at end of file diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu index bc410050..03b9af36 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.cu +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -3,93 +3,6 @@ #include "misc/CudaTools.cuh" #include "algorithm/ParticleCellTreeCuda.cuh" -// CUDA version of GenInfo structure -typedef struct GenInfoCuda_t { - int l_min; - int l_max; - - int *org_dims; // fixed size: [3] - - uint8_t number_dimensions; - - int *x_num; - int *y_num; - int *z_num; - - // this differs from original GenInfo structure - // since we need to be able to send data back from GPU to CPU - uint64_t *total_number_particles; - - int *level_size; - - uint64_t get_total_number_particles() const { return *total_number_particles; } - - __device__ int level_max() const { return l_max; } - __device__ int level_min() const { return l_min; } - -} GenInfoCuda; - -// ----------------------------- - -/* - * Class for easy transfering to/from GPU of GenInfo structure. 
- */ -class GenInfoGpuAccess { - GenInfo &gi; - - cudaStream_t iStream; - - ScopedCudaMemHandler org_dims; - ScopedCudaMemHandler x_num; - ScopedCudaMemHandler y_num; - ScopedCudaMemHandler z_num; - ScopedCudaMemHandler total_number_particles; - ScopedCudaMemHandler level_size; - - -public: - GenInfoGpuAccess(GenInfo &genInfo, cudaStream_t cudaStream) : - gi(genInfo), - iStream(cudaStream), - org_dims(gi.org_dims, 3, iStream), - x_num(gi.x_num.data(), gi.x_num.size(), iStream), - y_num(gi.y_num.data(), gi.y_num.size(), iStream), - z_num(gi.z_num.data(), gi.z_num.size(), iStream), - total_number_particles(&gi.total_number_particles, 1, iStream), - level_size(gi.level_size.data(), gi.level_size.size(), iStream) - { - } - - GenInfoCuda getGenInfoCuda() { - GenInfoCuda gic; - - gic.l_min = gi.l_min; - gic.l_max = gi.l_max; - gic.org_dims = org_dims.get(); - gic.number_dimensions = gi.number_dimensions; - gic.x_num = x_num.get(); - gic.y_num = y_num.get(); - gic.z_num = z_num.get(); - gic.total_number_particles = total_number_particles.get(); - gic.level_size = level_size.get(); - - return gic; - } - - ~GenInfoGpuAccess() { - copyDtoH(); - } - - void copyHtoD() { - // The only data that can change between CPU & GPU (the rest values are fixed based on input image dimension) - total_number_particles.copyH2D(); - } - - void copyDtoH() { - // The only data that can change between CPU & GPU (the rest values are fixed based on input image dimension) - total_number_particles.copyD2H(); - } -}; // ********************************************************************************************************************* // FULL RESOLUTION @@ -155,7 +68,7 @@ void runFullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, static constexpr uint8_t seed_us = UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization -__global__ void firstStep(const uint8_t *prevLevel, uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic) { +__global__ void firstStepLAC(const uint8_t *prevLevel, uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic) { const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; const uint64_t xLen = gic.x_num[level]; @@ -179,15 +92,15 @@ __global__ void firstStep(const uint8_t *prevLevel, uint8_t *currLevel, int leve } void runFirstStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, cudaStream_t aStream) { - dim3 threadsPerBlock(32, 1, 1); - + dim3 threadsPerBlock(128, 1, 1); + auto genInfoCuda = giga.getGenInfoCuda(); for (int level = gi.l_min + 1; level < gi.l_max; ++level) { dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); auto *p_mapPrev = p_map[level - 1]; auto *p_mapCurr = p_map[level]; - firstStep<<>>(p_mapPrev, p_mapCurr, level, min_type, giga.getGenInfoCuda()); + firstStepLAC<<>>(p_mapPrev, p_mapCurr, level, min_type, genInfoCuda); } cudaError_t err = cudaGetLastError(); @@ -309,15 +222,41 @@ __global__ void secondStepLastLevel(const uint8_t *currLevel, int level_minus_1, } } -__global__ void secondStepCountParticles(GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint64_t counter_total) { - // std::partial_sum on one CUDA core naive implementation - size_t sum = xz_end[0]; - for (size_t i = 1; i < counter_total; i++) { - sum += xz_end[i]; - xz_end[i] = sum; + +__global__ void secondStepCountParticles(GenInfoCuda gic, uint64_t *xz_end, 
uint64_t counter_total) { + extern __shared__ uint64_t b[]; + const int idx = threadIdx.x; + // std::partial_sum on one CUDA core implementation + // + // How it works: + // 1. Each thread reads a value from xz_end into shared memory as coalesced memory access to speed up process + // 2. The first thread (idx == 0) computes the partial sum for the block (this is fast enough) + // 3. The partial sums are written back to xz_end in coalesced memory manner. + // In general sice this is multi-warp code __syncthreads() are used to be sure that all threads are at same + // place in code. + + uint64_t sum = 0; + for (uint64_t i = 0; i < counter_total; i+= blockDim.x) { + if ((i + idx) < counter_total) b[idx] = xz_end[i + idx]; + __syncthreads(); + + if (idx == 0) { + size_t endIdx = blockDim.x; + if ((i + blockDim.x - 1) >= counter_total) { + endIdx = counter_total - i; + } + for (size_t n = 0; n < endIdx; n++) { + sum += b[n]; + b[n] = sum; + } + } + __syncthreads(); + + if ((i + idx) < counter_total) xz_end[i + idx] = b[idx]; + __syncthreads(); } - *gic.total_number_particles = xz_end[counter_total -1]; + if (idx == 0) *gic.total_number_particles = xz_end[counter_total - 1]; } void runSecondStepLastLevel(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint64_t counter_total, cudaStream_t aStream) { @@ -336,7 +275,8 @@ void runSecondStepLastLevel(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleC throw std::runtime_error("runSecondStepLastLevel #1 failed"); } - secondStepCountParticles<<<1, 1, 0, aStream>>>(giga.getGenInfoCuda(), level_xz, xz_end, counter_total); + int numOfWorkers = 128; + secondStepCountParticles<<<1, numOfWorkers, numOfWorkers * sizeof(uint64_t), aStream>>>(giga.getGenInfoCuda(), xz_end, counter_total); err = cudaGetLastError(); if (err != cudaSuccess) { @@ -511,15 +451,14 @@ void runFourthStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCu /* + * This function is for testing purposes only. + * * This function does everything: * - creates CPU structures * - copies everything to GPU * - run computation of all linear-structures * - copy it back to CPU * - returns all the structure - * - * In current shape it is a good function for testing implementation rather than using it in production code. - * Production code should use parts of it and work on pre-allocated memory - probably in GpuProcessingTask. */ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct) { @@ -546,10 +485,10 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRPara } xz_end_vec.resize(counter_total, 0); -// auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; }; -// prt(y_vec); -// prt(xz_end_vec); -// prt(level_xz_vec); + // auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; }; + // prt(y_vec); + // prt(xz_end_vec); + // prt(level_xz_vec); // TODO: This is temporary solution. 
// Since in CPU code size of y_vec is calculated 'on the fly' and in CUDA code it would be much better @@ -574,6 +513,8 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRPara runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), aStream); runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), counter_total, aStream); } + // TODO: This is temporary solution. Fix GIGA destructor and stream handling! + giga.copyDtoH(); } // TODO: Resized back to correct size, should it be initialized to this size in the first place or pre-allocation for @@ -593,46 +534,19 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRPara return lac; } -void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, const APRParameters &apr_parameters, LinearAccessCudaStructs &lacs, cudaStream_t aStream) { - - uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2; +void computeLinearStructureCuda(uint16_t *y_vec_cuda, uint64_t *xz_end_vec_cuda, const uint64_t *level_xz_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, GenInfoGpuAccess &giga, const APRParameters &apr_parameters, uint64_t counter_total, cudaStream_t aStream) { - VectorData xz_end_vec(true); - VectorData level_xz_vec(true); + const uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2; - // initialize_xz_linear() - CPU impl. - uint64_t counter_total = 1; //the buffer val to allow -1 calls without checking. - level_xz_vec.resize(gi.l_max + 2, 0); //includes a buffer for -1 calls, and therefore needs to be called with level + 1; - level_xz_vec[0] = 1; //allowing for the offset. - for (int i = 0; i <= gi.l_max; ++i) { - counter_total += gi.x_num[i] * gi.z_num[i]; - level_xz_vec[i + 1] = counter_total; + if (gi.l_max <= 2) { + runFullResolution(level_xz_vec_cuda, xz_end_vec_cuda, y_vec_cuda, gi, giga, aStream); } - xz_end_vec.resize(counter_total, 0); - - - { - ScopedCudaMemHandler xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size(), aStream); - ScopedCudaMemHandler level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size(), aStream); - GenInfoGpuAccess giga(gi, aStream); - if (gi.l_max <= 2) { - runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, gi, giga, aStream); - } - else { - runFirstStep(gi, giga, p_map, min_type, aStream); - runSecondStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), aStream); - runSecondStepLastLevel(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), counter_total, aStream); - runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, aStream); - runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, counter_total, aStream); - } + else { + runFirstStep(gi, giga, p_map, min_type, aStream); + runSecondStep(gi, giga, p_map, min_type, level_xz_vec_cuda, xz_end_vec_cuda, aStream); + runSecondStepLastLevel(gi, giga, p_map, min_type, level_xz_vec_cuda, xz_end_vec_cuda, counter_total, aStream); + runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda, xz_end_vec_cuda, y_vec_cuda, aStream); + runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda, xz_end_vec_cuda, y_vec_cuda, counter_total, aStream); } - - VectorData y_vec(true); - y_vec.resize(gi.total_number_particles); - checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda, gi.total_number_particles 
* sizeof(uint16_t), cudaMemcpyDeviceToHost, aStream)); - checkCuda(cudaStreamSynchronize(aStream)); - - lacs.y_vec.swap(y_vec); - lacs.xz_end_vec.swap(xz_end_vec); - lacs.level_xz_vec.swap(level_xz_vec); + giga.copyDtoH(); } diff --git a/src/data_structures/APR/access/LinearAccessCuda.hpp b/src/data_structures/APR/access/LinearAccessCuda.hpp index 27d56ab6..69ce4ebe 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.hpp +++ b/src/data_structures/APR/access/LinearAccessCuda.hpp @@ -12,9 +12,12 @@ typedef struct { VectorData level_xz_vec; } LinearAccessCudaStructs; +#include "data_structures/APR/access/GenInfoGpuAccess.cuh" + +// This is for testing purposes only LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct); -void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, const APRParameters &apr_parameters, LinearAccessCudaStructs &lacs, cudaStream_t aStream); +void computeLinearStructureCuda(uint16_t *y_vec_cuda, uint64_t *xz_end_vec_cuda, const uint64_t *level_xz_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, GenInfoGpuAccess &giga, const APRParameters &apr_parameters, uint64_t counter_total, cudaStream_t aStream); #endif //APR_LINEARACCESSCUDA_HPP diff --git a/src/data_structures/Mesh/paddPixelData.cuh b/src/data_structures/Mesh/paddPixelData.cuh index dae96d79..1060ed8c 100644 --- a/src/data_structures/Mesh/paddPixelData.cuh +++ b/src/data_structures/Mesh/paddPixelData.cuh @@ -38,7 +38,7 @@ __global__ void paddPixels(const T* input, T *output, const PixelDataDim inputSi template void runPaddPixels(const T* input, T *output, const PixelDataDim &inputSize, const PixelDataDim &outputSize, const PixelDataDim &padSize, cudaStream_t aStream) { - dim3 threadsPerBlock(1, 64, 1); + dim3 threadsPerBlock(1, 128, 1); dim3 numBlocks((outputSize.x + threadsPerBlock.x - 1) / threadsPerBlock.x, (outputSize.y + threadsPerBlock.y - 1) / threadsPerBlock.y, (outputSize.z + threadsPerBlock.z - 1) / threadsPerBlock.z); @@ -70,7 +70,7 @@ __global__ void unpaddPixels(const T* input, T *output, const PixelDataDim input template void runUnpaddPixels(const T* input, T *output, const PixelDataDim &inputSize, const PixelDataDim &outputSize, const PixelDataDim &padSize, cudaStream_t aStream) { - dim3 threadsPerBlock(1, 64, 1); + dim3 threadsPerBlock(1, 128, 1); dim3 numBlocks((outputSize.x + threadsPerBlock.x - 1) / threadsPerBlock.x, (outputSize.y + threadsPerBlock.y - 1) / threadsPerBlock.y, (outputSize.z + threadsPerBlock.z - 1) / threadsPerBlock.z); diff --git a/src/misc/CudaTools.cuh b/src/misc/CudaTools.cuh index 351c0009..167be6e8 100644 --- a/src/misc/CudaTools.cuh +++ b/src/misc/CudaTools.cuh @@ -18,7 +18,7 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= #if defined(DEBUG) || defined(_DEBUG) || !defined(NDEBUG) if (code != cudaSuccess) { - fprintf(stderr,"GPUassert: (%d) %s %s %d\n", code, cudaGetErrorString(code), file, line); + fprintf(stderr,"GPUassert: (%d) (%s) %s %d\n", code, cudaGetErrorString(code), file, line); assert(code == cudaSuccess); // If debugging it helps to see call tree somehow if (abort) exit(code); } diff --git a/test/APRTest.cpp b/test/APRTest.cpp index 83071a7f..1ee948c3 100644 --- a/test/APRTest.cpp +++ b/test/APRTest.cpp @@ -2797,7 +2797,13 @@ bool test_pipeline_u16(TestData& test_data){ aprConverter.par.output_steps = true; +#ifdef APR_USE_CUDA + // NOTICE: CUDA implementation is not saving intermediate steps even if 
"par.output_steps = true" so this test is not + // valid for CUDA. Use explicitly CPU implementation anyway just to have all tests run even if APR_USE_CUDA=ON + aprConverter.get_apr_cpu(apr, test_data.img_original); +#else aprConverter.get_apr(apr,test_data.img_original); +#endif PixelData scale_computed = TiffUtils::getMesh(test_data.output_dir +"local_intensity_scale_step.tif"); PixelData gradient_computed = TiffUtils::getMesh(test_data.output_dir + "gradient_step.tif"); @@ -3012,7 +3018,13 @@ bool test_pipeline_bound_blocked(TestData& test_data, float rel_error){ converter.par.output_steps = true; APR apr; +#ifdef APR_USE_CUDA + // NOTICE: CUDA implementation is not saving intermediate steps even if "par.output_steps = true" so this test is not + // valid for CUDA. Use explicitly CPU implementation anyway just to have all tests run even if APR_USE_CUDA=ON + converter.get_apr_cpu(apr, test_data.img_original); +#else converter.get_apr(apr, test_data.img_original); +#endif // batch APR converter for blocked conversion APRConverterBatch converterBatch; @@ -3022,6 +3034,7 @@ bool test_pipeline_bound_blocked(TestData& test_data, float rel_error){ // Get the APR by block APR aprBatch; + converterBatch.get_apr(aprBatch); // Sample particles by block @@ -3835,34 +3848,25 @@ TEST_F(CreateSmallSphereTest, ITERATOR_METHODS) { ASSERT_TRUE(test_iterator_methods(test_data)); } -TEST_F(CreateSmallSphereTest, AUTO_PARAMETERS) { - -//test iteration -ASSERT_TRUE(test_auto_parameters(test_data)); +#ifndef APR_USE_CUDA +/// auto_parameters are not supported in CUDA +TEST_F(CreateSmallSphereTest, AUTO_PARAMETERS) { + ASSERT_TRUE(test_auto_parameters(test_data)); } TEST_F(CreateDiffDimsSphereTest, AUTO_PARAMETERS) { - -//test iteration -ASSERT_TRUE(test_auto_parameters(test_data)); - + ASSERT_TRUE(test_auto_parameters(test_data)); } TEST_F(CreateGTSmall2DTestProperties, AUTO_PARAMETERS) { - -//test iteration -ASSERT_TRUE(test_auto_parameters(test_data)); - + ASSERT_TRUE(test_auto_parameters(test_data)); } TEST_F(CreateGTSmall1DTestProperties, AUTO_PARAMETERS) { - -//test iteration ASSERT_TRUE(test_auto_parameters(test_data)); - } - +#endif TEST_F(CreateSmallSphereTest, APR_ITERATION) { diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 913b7e09..82489cd0 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -358,6 +358,57 @@ namespace { } } + + TEST(ComputeThreshold, FULL_PIPELINE_TEST_CPU_vs_GPU_via_APRConverter) { + APRTimer timer(true); + + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors + using ImageType = uint16_t; + std::string file_name = get_source_directory_apr() + "files/Apr/sphere_120/sphere_original.tif"; + PixelData input_image_raw = TiffUtils::getMesh(file_name); + std::cout << input_image_raw << std::endl; + + // Prepare parameters + APRParameters par; + par.lambda = 2; + par.Ip_th = -1; + par.sigma_th = 234; + par.sigma_th_max = 0; + par.grad_th=10; + par.dx = 1; + par.dy = 1; + par.dz = 1; + par.neighborhood_optimization = true; + par.auto_parameters = false; + par.output_steps = false; + par.neighborhood_optimization = true; + par.sigma_th = 234; + std::cout << par << std::endl; + APR apr; + APRConverter converter; + converter.par = par; + converter.set_generate_linear(true); + converter.set_sparse_pulling_scheme(false); + converter.get_apr_cuda(apr, input_image_raw); + std::cout << "APR CUDA total particles: " << apr.total_number_particles() << std::endl; + + APR apr2; + 
APRConverter converter2; + converter2.par = par; + converter2.set_generate_linear(true); + converter2.set_sparse_pulling_scheme(false); + converter2.get_apr_cpu(apr2, input_image_raw); + std::cout << "APR CPU total particles: " << apr2.total_number_particles() << std::endl; + + // Compare GPU vs CPU - expect exactly same result + EXPECT_EQ(compareParticles(apr.linearAccess.y_vec, apr2.linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(apr.linearAccess.level_xz_vec, apr2.linearAccess.level_xz_vec), 0); + EXPECT_EQ(compareParticles(apr.linearAccess.xz_end_vec, apr2.linearAccess.xz_end_vec), 0); + + EXPECT_EQ(apr.total_number_particles(), apr2.total_number_particles()); + EXPECT_EQ(apr.linearAccess.y_vec.size(), apr2.linearAccess.y_vec.size()); + + } #endif // APR_USE_CUDA } From 7f6e2d3c5d622ee63909cf8751295ab5de018ab4 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 5 Aug 2025 13:25:40 +0200 Subject: [PATCH 66/80] Reverting APRCOnverter type to previoius value --- examples/Example_get_apr.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Example_get_apr.cpp b/examples/Example_get_apr.cpp index 6ac8e9c7..bfbaf31d 100644 --- a/examples/Example_get_apr.cpp +++ b/examples/Example_get_apr.cpp @@ -48,7 +48,7 @@ int runAPR(cmdLineOptions options) { //the apr datastructure APR apr; - APRConverter aprConverter; + APRConverter aprConverter; //read in the command line options into the parameters file aprConverter.par.Ip_th = options.Ip_th; From b01df3115439050fbfa50fa003fce4595ad1ff29 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 5 Aug 2025 16:04:22 +0200 Subject: [PATCH 67/80] Fixed CUDA-streams sync issues when copying back to CPU --- src/algorithm/APRConverter.hpp | 16 ++++----- src/algorithm/ComputeGradientCuda.cu | 34 +++++++++---------- .../APR/access/LinearAccessCuda.cu | 1 - 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp index 6141887c..3f41276f 100644 --- a/src/algorithm/APRConverter.hpp +++ b/src/algorithm/APRConverter.hpp @@ -400,10 +400,11 @@ inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input if (!initPipelineAPR(aAPR, input_image)) return false; + total_timer.start_timer("full_pipeline"); initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num); computation_timer.start_timer("init_mem"); - PixelData image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image) + PixelData image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full-size copy of the image) ///////////////////////////////// /// Pipeline @@ -435,6 +436,8 @@ inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input std::cout << "CUDA pipeline finished!\n"; + total_timer.stop_timer(); + return true; } #endif @@ -465,14 +468,11 @@ inline bool APRConverter::get_apr_cuda_streams(APR &aAPR, PixelData::value) { - bspline_offset = 100; - image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); - } else if (std::is_same::value) { - bspline_offset = 5; - image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); - } else { + if (std::is_floating_point::value) { image_temp.copyFromMesh(input_image); + } else { + 
bspline_offset = compute_bspline_offset(input_image, par.lambda); + image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); } diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index dc55207d..883a54b3 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -447,48 +447,46 @@ public: } void processOnGpu() { + // Set it and copy first before copying the image + // It improves *a lot* performance even though it is needed later in computeLinearStructureCuda() + iAprInfo.total_number_particles = 0; // reset total_number_particles to 0 + giga.copyHtoD(); + level_xz_vec_cuda.copyH2D(); + image.copyH2D(); - CurrentTime ct{}; - uint64_t start = ct.microseconds(); - CudaTimer time(false, "PIPELINE"); - time.start_timer("getgradient"); getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetectedPinned[0], isErrorDetectedCuda, iBsplineOffset, iParameters, iStream); - time.stop_timer(); - time.start_timer("intensity"); + runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), lstPadded.get(), lst2Padded.get(), iStream); - time.stop_timer(); // Apply parameters from APRConverter: - time.start_timer("runs...."); runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream); runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream); runThresholdOpen(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream); // TODO: automatic parameters are not implemented for GPU pipeline (yet) - time.stop_timer(); - time.start_timer("compute lev"); float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz)); float level_factor = pow(2, iMaxLevel) * min_dim; const float mult_const = level_factor/iParameters.rel_error; runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream); - time.stop_timer(); computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream); - - level_xz_vec_cuda.copyH2D(); - iAprInfo.total_number_particles = 0; // reset total_number_particles to 0 - giga.copyHtoD(); computeLinearStructureCuda(y_vec_cuda.get(), xz_end_vec_cuda.get(), level_xz_vec_cuda.get(), pctc, iAprInfo, giga, iParameters, counter_total, iStream); - xz_end_vec_cuda.copyD2H(); + // Get data from GPU - first we need to get number of particles to resize y_vec and have idea how many particles to copy - that is why we need to synchronize first time + giga.copyDtoH(); + checkCuda(cudaStreamSynchronize(iStream)); - // Trim buffer to calculated size (initially it is allocated to worst case - same number of particles as pixels in input image) + // Start copying the data from GPU to CPU + xz_end_vec_cuda.copyD2H(); + // Trim buffer to calculated size (initially it is allocated to worst case - same number of particles as pixels in input image) and copy data from GPU y_vec.resize(iAprInfo.total_number_particles); - + // Copy y_vec from GPU to CPU and synchronize last time - it is needed before we copy data to CPU structures checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda.get(), iAprInfo.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, iStream)); + + // Synchornize last 
time - at that moment all data from GPU is copied to CPU checkCuda(cudaStreamSynchronize(iStream)); // Prepare CPU structures diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu index 03b9af36..5695f4c1 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.cu +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -548,5 +548,4 @@ void computeLinearStructureCuda(uint16_t *y_vec_cuda, uint64_t *xz_end_vec_cuda, runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda, xz_end_vec_cuda, y_vec_cuda, aStream); runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda, xz_end_vec_cuda, y_vec_cuda, counter_total, aStream); } - giga.copyDtoH(); } From bf4cdb0218f75fe58a30186bce08b35692f590c5 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 6 Aug 2025 16:44:07 +0200 Subject: [PATCH 68/80] Little bit cleanup of CUDA in APRConverter, Added move assignment operator to VectorData --- src/algorithm/APRConverter.hpp | 210 ++++++------------------- src/algorithm/ComputeGradientCuda.cu | 10 +- src/algorithm/ComputeGradientCuda.hpp | 1 - src/data_structures/Mesh/PixelData.hpp | 14 ++ test/FullPipelineCudaTest.cpp | 3 +- 5 files changed, 65 insertions(+), 173 deletions(-) diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp index 3f41276f..f81e1114 100644 --- a/src/algorithm/APRConverter.hpp +++ b/src/algorithm/APRConverter.hpp @@ -422,7 +422,6 @@ inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input GpuProcessingTask gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()); // std::cout << "after gpt \n"; - gpt.sendDataToGpu(); gpt.processOnGpu(); auto linearAccessGpu = gpt.getDataFromGpu(); @@ -453,21 +452,18 @@ inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input */ template template inline bool APRConverter::get_apr_cuda_streams(APR &aAPR, PixelData& input_image) { - + // Initialize APR and memory for the pipeline if (!initPipelineAPR(aAPR, input_image)) return false; - initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num); - - computation_timer.start_timer("init_mem"); - PixelData image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image) + PixelData image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full size copy of the image) ///////////////////////////////// /// Pipeline - //////////////////////// + ///////////////////////////////// + // offset image by factor (this is required if there are zero areas in the background with // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) // Warning both of these could result in over-flow! 
- if (std::is_floating_point::value) { image_temp.copyFromMesh(input_image); } else { @@ -475,176 +471,68 @@ inline bool APRConverter::get_apr_cuda_streams(APR &aAPR, PixelData>>>>>>>>>> START\n"; - ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY"); - { - APRTimer t(true); - std::vector> gpts; + constexpr int repetitionsPerStream = 3; // number of repetitions per stream to simulate processing of multiple images - t.start_timer("Creating GPTS"); - std::vector> gpts_futures; gpts_futures.resize(numOfStreams); - for (int i = 0; i < numOfStreams; ++i) { - gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); - } - t.stop_timer(); - - t.start_timer("-----------------------------> Whole GPU pipeline with repetitions"); - { - APRTimer tt(false); - // Create streams and send initial task to do - for (int i = 0; i < numOfStreams; ++i) { - // gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); - tt.start_timer("SEND"); - // gpts[i].sendDataToGpu(); - // gpts[i].processOnGpu(); - tt.stop_timer(); - // std::cout << "Send " << i << std::endl; - // gpts.back().processOnGpu(); - // std::cout << "Proc " << i << std::endl; - } - // Create streams and send initial task to do - for (int i = 0; i < numOfStreams; ++i) { - gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[i]); - // tt.start_timer("Process"); - // gpts[i].processOnGpu(); - // tt.stop_timer(); - // std::cout << "Proc " << i << std::endl; - } - std::cout << "=========" << std::endl; + APRTimer ttt(true); - for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) { - int c = i % numOfStreams; + ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY"); + { + APRTimer t(true); + std::vector> gpts; - // get data from previous task - gpts_futures[c].get(); - auto linearAccessGpu = gpts[c].getDataFromGpu(); + t.start_timer("Creating GPTS"); + std::vector> gpts_futures; gpts_futures.resize(numOfStreams); + for (int i = 0; i < numOfStreams; ++i) { + gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); + } + t.stop_timer(); - // in theory, we get new data and send them to task - if (i < numOfStreams * (repetitionsPerStream - 1)) { - // gpts[c].sendDataToGpu(); - // std::cout << "Send " << c << std::endl; - // gpts[c].processOnGpu(); - gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[c]); - // std::cout << "Proc " << c << std::endl; - } + t.start_timer("-----------------------------> Whole GPU pipeline with repetitions"); + { + APRTimer tt(false); + // Run processOnGpu() asynchronously - it will handle transfering data from CPU to GPU and run whole pipeline + for (int i = 0; i < numOfStreams; ++i) { + gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[i]); + } - aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); + for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) { + int c = i % numOfStreams; - // generateDatastructures(aAPR) for linearAcceess for CUDA - aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec); - aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec); - aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec); - aAPR.apr_initialized = true; + // Get data from GpuProcessingTask - get() will block until the task is finished + 
gpts_futures[c].get(); + auto linearAccessGpu = gpts[c].getDataFromGpu(); - // std::cout << "CUDA pipeline finished!\n"; + // in theory, we get new data and send them to task + if (i < numOfStreams * (repetitionsPerStream - 1)) { + gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[c]); } - // cudaDeviceSynchronize(); - } - auto allT = t.stop_timer(); - std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n"; - std::cout << "Bandwidth:" << (input_image.size() / (allT / (numOfStreams*repetitionsPerStream)) / 1024 / 1024) << " MB/s\n"; - } - auto allT = ttt.stop_timer(); - float tpi = allT / (numOfStreams*repetitionsPerStream); - std::cout << "Time per image: " << tpi << " seconds\n"; - std::cout << "Image size: " << (input_image.size() / 1024 / 1024) << " MB\n"; - std::cout << "Bandwidth:" << (input_image.size() / tpi / 1024 / 1024) << " MB/s\n"; - - std::cout << "<<<<<<<<<<<< STOP\n"; - } - else { - APRTimer ttt(true); - std::cout << ">>>>>>>>>>> START\n"; - ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY"); - { - APRTimer t(true); - std::vector> gpts; + // Fill APR data structure with data from GPU + aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); + aAPR.linearAccess.y_vec = std::move(linearAccessGpu.y_vec); + aAPR.linearAccess.xz_end_vec = std::move(linearAccessGpu.xz_end_vec); + aAPR.linearAccess.level_xz_vec = std::move(linearAccessGpu.level_xz_vec); - t.start_timer("Creating GPTS"); - //std::vector> gpts_futures; gpts_futures.resize(numOfStreams); - for (int i = 0; i < numOfStreams; ++i) { - gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); - } - // cudaDeviceSynchronize(); - t.stop_timer(); - - t.start_timer("-----------------------------> Whole GPU pipeline with repetitions"); - { - - APRTimer tt(false); - // Create streams and send initial task to do - for (int i = 0; i < numOfStreams; ++i) { - // gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); - tt.start_timer("SEND"); - gpts[i].sendDataToGpu(); - gpts[i].processOnGpu(); - tt.stop_timer(); - // std::cout << "Send " << i << std::endl; - // gpts.back().processOnGpu(); - // std::cout << "Proc " << i << std::endl; - } - // Create streams and send initial task to do - for (int i = 0; i < numOfStreams; ++i) { - // gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[i]); - tt.start_timer("Process"); - // gpts[i].processOnGpu(); - tt.stop_timer(); - // std::cout << "Proc " << i << std::endl; - } - std::cout << "=========" << std::endl; - - for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) { - int c = i % numOfStreams; - - // get data from previous task - // gpts_futures[c].get(); - auto linearAccessGpu = gpts[c].getDataFromGpu(); - // std::cout << "Get " << c << std::endl; - - // in theory, we get new data and send them to task - if (i < numOfStreams * (repetitionsPerStream - 1)) { - gpts[c].sendDataToGpu(); - // std::cout << "Send " << c << std::endl; - gpts[c].processOnGpu(); - // gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[c]); - // std::cout << "Proc " << c << std::endl; - } - - aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); - - // generateDatastructures(aAPR) for linearAcceess for CUDA - aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec); - 
aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec); - aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec); - aAPR.apr_initialized = true; - - // std::cout << "CUDA pipeline finished!\n"; - } - // cudaDeviceSynchronize(); + aAPR.apr_initialized = true; } - auto allT = t.stop_timer(); - std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n"; } - auto allT = ttt.stop_timer(); + auto allT = t.stop_timer(); std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n"; - std::cout << "<<<<<<<<<<<< STOP\n"; + std::cout << "Bandwidth:" << (input_image.size() / (allT / (numOfStreams*repetitionsPerStream)) / 1024 / 1024) << " MB/s\n"; } + auto allT = ttt.stop_timer(); + float tpi = allT / (numOfStreams*repetitionsPerStream); + std::cout << "Time per image: " << tpi << " seconds\n"; + std::cout << "Image size: " << (input_image.size() / 1024 / 1024) << " MB\n"; + std::cout << "Bandwidth:" << (input_image.size() / tpi / 1024 / 1024) << " MB/s\n"; - - return false; //TODO: change it back to true + return true; } #endif + /** * Implementation of pipeline for CPU * @@ -715,8 +603,8 @@ inline bool APRConverter::get_apr(APR &aAPR, PixelData &input_imag #ifndef APR_USE_CUDA return get_apr_cpu(aAPR, input_image); #else - return get_apr_cuda(aAPR, input_image); - // return get_apr_cuda_streams(aAPR, input_image); + // return get_apr_cuda(aAPR, input_image); + return get_apr_cuda_streams(aAPR, input_image); #endif } diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 883a54b3..ef4083b8 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -406,7 +406,7 @@ public: splineCudaX = cudax.first; splineCudaY = cuday.first; splineCudaZ = cudaz.first; - std::cout << "\n=============== GpuProcessingTaskImpl ===================" << iStream << "\n\n"; + // std::cout << "\n=============== GpuProcessingTaskImpl ===================" << iStream << "\n\n"; // std::cout << iCpuImage << std::endl; // std::cout << iCpuLevels << std::endl; @@ -437,11 +437,6 @@ public: isErrorDetectedCuda.initialize(isErrorDetectedPinned.data(), 1, iStream); } - void sendDataToGpu() { - // sends data in processOnGpu() - // in multi-stream implementation it is done in threads so is not blocking current operations. 
- } - LinearAccessCudaStructs getDataFromGpu() { return std::move(lacs); } @@ -508,9 +503,6 @@ GpuProcessingTask::~GpuProcessingTask() { } template GpuProcessingTask::GpuProcessingTask(GpuProcessingTask&&) = default; -template -void GpuProcessingTask::sendDataToGpu() {impl->sendDataToGpu();} - template LinearAccessCudaStructs GpuProcessingTask::getDataFromGpu() {return impl->getDataFromGpu();} diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp index 837d29f5..82fa06c8 100644 --- a/src/algorithm/ComputeGradientCuda.hpp +++ b/src/algorithm/ComputeGradientCuda.hpp @@ -46,7 +46,6 @@ class GpuProcessingTask { ~GpuProcessingTask(); GpuProcessingTask(GpuProcessingTask&&); - void sendDataToGpu(); LinearAccessCudaStructs getDataFromGpu(); void processOnGpu(); }; diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index f0127920..ca06a2bc 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -293,6 +293,20 @@ public : #endif } + /** + * Move assignment operator + * @param aObj + */ + VectorData& operator=(VectorData &&aObj) { + usePinnedMemory = aObj.usePinnedMemory; + vecMemory.swap(aObj.vecMemory); + vec = std::move(aObj.vec); +#ifdef APR_USE_CUDA + vecMemoryPinned = std::move(aObj.vecMemoryPinned); +#endif + return *this; + } + /** * Apply unary operator to each element in parallel, writing the result to VectorData 'output'. * @tparam S diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 82489cd0..2cb575fc 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -340,7 +340,6 @@ namespace { // Calculate pipeline on GPU timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, bspline_offset, maxLevel); - gpt.sendDataToGpu(); gpt.processOnGpu(); auto linearAccessGpu = gpt.getDataFromGpu(); giGpu.total_number_particles = linearAccessGpu.y_vec.size(); @@ -359,7 +358,7 @@ namespace { } - TEST(ComputeThreshold, FULL_PIPELINE_TEST_CPU_vs_GPU_via_APRConverter) { + TEST(ComputeThreshold, FULL_PIPELINE_TEST_CPU_vs_GPU_via_APRConverter) { APRTimer timer(true); // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors From cd7d594f45dee0464ec5d8e28c011358b5c24669 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 11 Aug 2025 16:57:13 +0200 Subject: [PATCH 69/80] Fixed move construtor/assignment - pinned memory is now also moved. 
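[Editorial note] A short sketch of the hazard this commit guards against (the class and member names come
from the diff below; the constructor arguments and usage are hypothetical): PixelData/VectorData can back
their storage with a pinned (page-locked) buffer, so a move that transfers only the unpinned members would
leave the moved-to object viewing memory still owned by the moved-from object.

    PixelData<float> a(dim, 0.0f, /*usePinnedMemory=*/true);  // mesh may point into meshMemoryPinned
    PixelData<float> b = std::move(a);                        // before this fix, b.mesh could dangle once
                                                              // 'a' releases its pinned buffer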
--- src/data_structures/Mesh/PixelData.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index ca06a2bc..c1ef8215 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -482,6 +482,9 @@ public : z_num = aObj.z_num; mesh = std::move(aObj.mesh); meshMemory = std::move(aObj.meshMemory); +#ifdef APR_USE_CUDA + meshMemoryPinned = std::move(aObj.meshMemoryPinned); +#endif } /** @@ -494,6 +497,9 @@ public : z_num = aObj.z_num; mesh = std::move(aObj.mesh); meshMemory = std::move(aObj.meshMemory); +#ifdef APR_USE_CUDA + meshMemoryPinned = std::move(aObj.meshMemoryPinned); +#endif return *this; } From 0c702f3b3e4b3ddaac391cbb4cca1c57518bdc7e Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 19 Aug 2025 12:07:03 +0200 Subject: [PATCH 70/80] Fixes needed by CUDA 13.0 - now code compiles --- CMakeLists.txt | 8 ++++---- examples/CMakeLists.txt | 1 + src/numerics/miscCuda.cu | 5 +++-- test/CMakeLists.txt | 1 + 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 13d764b5..de7963a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ project(APR DESCRIPTION "Adaptive Particle Representation library") message(STATUS "CMAKE VERSION ${CMAKE_VERSION}") -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) if(POLICY CMP0135) @@ -171,7 +171,7 @@ if(WIN32) else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 ") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 ") if(CMAKE_COMPILER_IS_GNUCC) set(CMAKE_CXX_FLAGS_RELEASE "-O4") @@ -210,9 +210,9 @@ set_property(TARGET aprObjLib PROPERTY POSITION_INDEPENDENT_CODE ON) if(APR_USE_CUDA) message(STATUS "APR: Building CUDA for APR") set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc") - set(CMAKE_CUDA_STANDARD 14) + set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_RUNTIME_LIBRARY "Static") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=false --default-stream per-thread -Wno-deprecated-gpu-targets -Xptxas -v -DAPR_USE_CUDA") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -rdc=true --fmad=false --default-stream per-thread -Wno-deprecated-gpu-targets -Xptxas -v -DAPR_USE_CUDA") set(CMAKE_CUDA_FLAGS_RELEASE "-O3") # -lineinfo for profiling set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g -G") if(APR_BENCHMARK) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 57b4ed4d..5837784a 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,5 +1,6 @@ macro(buildTarget TARGET) add_executable(${TARGET} ${TARGET}.cpp) + set_property(TARGET ${TARGET} PROPERTY CUDA_SEPARABLE_COMPILATION ON) target_link_libraries(${TARGET} ${HDF5_LIBRARIES} ${TIFF_LIBRARIES} ${APR_BUILD_LIBRARY} Threads::Threads ${OPENMP_LINK}) endmacro(buildTarget) diff --git a/src/numerics/miscCuda.cu b/src/numerics/miscCuda.cu index 0076eb45..e881fa4b 100644 --- a/src/numerics/miscCuda.cu +++ b/src/numerics/miscCuda.cu @@ -201,8 +201,9 @@ __global__ void count_ne_rows_cuda(const uint64_t* level_xz_vec, } } - -__device__ unsigned int count = 0; +namespace { + __device__ unsigned int count = 0; +} __global__ void fill_ne_rows_cuda(const uint64_t* level_xz_vec, const uint64_t* xz_end_vec, const int z_num, diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d3377fb0..2750b172 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,5 +1,6 @@ macro(buildTarget TARGET SRC) add_executable(${TARGET} ${SRC}) + set_property(TARGET ${TARGET} PROPERTY 
CUDA_SEPARABLE_COMPILATION ON) target_link_libraries(${TARGET} ${HDF5_LIBRARIES} ${TIFF_LIBRARIES} ${GTEST_LIBRARIES} ${APR_BUILD_LIBRARY} Threads::Threads ${OPENMP_LINK}) add_test( ${TARGET} ${TARGET} ) endmacro(buildTarget) From 1f27876ad05bcbcd3f456f31c19a964c90e98d61 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 21 Aug 2025 15:03:48 +0200 Subject: [PATCH 71/80] Initial impl. of CUDA multistreams, it takes many images but STILL only one ARP object - use it only for speed for now --- src/algorithm/APRConverter.hpp | 123 +++++++++++++++++++++++++- src/algorithm/ComputeGradientCuda.cu | 14 +-- src/algorithm/ComputeGradientCuda.hpp | 4 +- test/FullPipelineCudaTest.cpp | 3 +- 4 files changed, 134 insertions(+), 10 deletions(-) diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp index f81e1114..8ff3b674 100644 --- a/src/algorithm/APRConverter.hpp +++ b/src/algorithm/APRConverter.hpp @@ -77,6 +77,8 @@ class APRConverter { bool get_apr_cuda(APR &aAPR, PixelData &input_image); template bool get_apr_cuda_streams(APR &aAPR, PixelData &input_image); + template + bool get_apr_cuda_multistreams(APR &aAPR, const std::vector *> &input_images, int numOfStreams = 3); #endif bool verbose = true; @@ -420,8 +422,9 @@ inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); } - GpuProcessingTask gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()); + GpuProcessingTask gpt(image_temp, local_scale_temp, par, aAPR.level_max()); // std::cout << "after gpt \n"; + gpt.setBsplineOffset(bspline_offset); gpt.processOnGpu(); auto linearAccessGpu = gpt.getDataFromGpu(); @@ -442,6 +445,116 @@ inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input #endif #ifdef APR_USE_CUDA +/** + * Implementation of pipeline for GPU/CUDA and multiple streams + * NOTE: Currently only one image is processed multiple times just get an idea how fast it can be. + * Finally, it should be able to process incoming stream of data (sequence of images). + * + * @param aAPR - the APR data structure + * @param input_images - input images + * @param numOfStreams - number of streams to use for parallel processing on GPU + */ +template template +inline bool APRConverter::get_apr_cuda_multistreams(APR &aAPR, const std::vector*> &input_images, int numOfStreams) { + int numOfImages = input_images.size(); + if (numOfImages == 0) { + std::cerr << "No input images provided for APR conversion." 
<< std::endl; + return false; + } + + // Reduce number of streams to number of images if there are less images than streams + if (numOfImages < numOfStreams) numOfStreams = numOfImages; + + // Use first image to initialize the APR - all other images should have the same dimensions + auto input_image = input_images[0]; + + // Initialize APR and memory for the pipeline + if (!initPipelineAPR(aAPR, *input_image)) return false; + initPipelineMemory(input_image->y_num, input_image->x_num, input_image->z_num); + + // Create a temporary image for each stream + std::vector> tempImages; + std::cout << "allocating PixelData for " << numOfStreams << " streams" << std::endl; + for (int i = 0; i < numOfStreams; ++i) { + tempImages.emplace_back(PixelData(*input_image, false /* don't copy */, true /* pinned memory */)); + } + + ///////////////////////////////// + /// Pipeline + ///////////////////////////////// + APRTimer t(true); + + // Create GpuProcessingTask for each stream + std::vector> gpts; + t.start_timer("Creating GPTS"); + std::vector> gpts_futures; gpts_futures.resize(numOfStreams); + for (int i = 0; i < numOfStreams; ++i) { + gpts.emplace_back(GpuProcessingTask(tempImages[i], local_scale_temp, par, aAPR.level_max())); + } + t.stop_timer(); + + + t.start_timer("GPU processing..."); + // Saturate all the streams with first images + for (int i = 0; i < numOfStreams; ++i) { + + // offset image by factor (this is required if there are zero areas in the background with + // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) + // Warning both of these could result in over-flow! + if (std::is_floating_point::value) { + tempImages[i].copyFromMesh(*input_images[i]); + } else { + bspline_offset = compute_bspline_offset(*input_images[i], par.lambda); + tempImages[i].copyFromMeshWithUnaryOp(*input_images[i], [=](const auto &a) { return (a + bspline_offset); }); + } + std::cout << "Processing image " << i << " on stream " << i << std::endl; + gpts[i].setBsplineOffset(bspline_offset); + gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[i]); + } + + + // Main loop - get results from GPU and send new images to the streams (if any left) + for (int s = 0; s < numOfImages; ++s) { + int streamNum = s % numOfStreams; + + // Get data from GpuProcessingTask - get() will block until the task is finished + gpts_futures[streamNum].get(); + auto linearAccessGpu = gpts[streamNum].getDataFromGpu(); + + // Send next images to the stream if there are any left + // We have 'numOfImages - numOfStreams' left to process after saturating the streams with first images + if (s < numOfImages - numOfStreams) { + int imageToProcess = s + numOfStreams; + if (std::is_floating_point::value) { + tempImages[streamNum].copyFromMesh(*input_images[imageToProcess]); + } else { + bspline_offset = compute_bspline_offset(*input_images[imageToProcess], par.lambda); + tempImages[streamNum].copyFromMeshWithUnaryOp(*input_images[imageToProcess], [=](const auto &a) { return (a + bspline_offset); }); + } + std::cout << "Processing image " << imageToProcess << " on stream " << streamNum << std::endl; + gpts[streamNum].setBsplineOffset(bspline_offset); + gpts_futures[streamNum] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[streamNum]); + } + + // Fill APR data structure with data from GPU + aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); + aAPR.linearAccess.y_vec = std::move(linearAccessGpu.y_vec); + aAPR.linearAccess.xz_end_vec = 
std::move(linearAccessGpu.xz_end_vec); + aAPR.linearAccess.level_xz_vec = std::move(linearAccessGpu.level_xz_vec); + + aAPR.apr_initialized = true; + } + + auto allT = t.stop_timer(); + float tpi = allT / (numOfImages); + std::cout << "Num of images processed: " << numOfImages << "\n"; + std::cout << "Time per image: " << tpi << " seconds\n"; + std::cout << "Image size: " << (input_images[0]->size() / 1024 / 1024) << " MB\n"; + std::cout << "Bandwidth:" << (input_images[0]->size() / tpi / 1024 / 1024) << " MB/s\n"; + std::cout << "CUDA multistream pipeline finished!\n"; + return true; +} + /** * Implementation of pipeline for GPU/CUDA and multiple streams * NOTE: Currently only one image is processed multiple times just get an idea how fast it can be. @@ -485,7 +598,7 @@ inline bool APRConverter::get_apr_cuda_streams(APR &aAPR, PixelData> gpts_futures; gpts_futures.resize(numOfStreams); for (int i = 0; i < numOfStreams; ++i) { - gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); + gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, aAPR.level_max())); } t.stop_timer(); @@ -494,6 +607,7 @@ inline bool APRConverter::get_apr_cuda_streams(APR &aAPR, PixelData::processOnGpu, &gpts[i]); } @@ -506,6 +620,7 @@ inline bool APRConverter::get_apr_cuda_streams(APR &aAPR, PixelData::processOnGpu, &gpts[c]); } @@ -604,7 +719,9 @@ inline bool APRConverter::get_apr(APR &aAPR, PixelData &input_imag return get_apr_cpu(aAPR, input_image); #else // return get_apr_cuda(aAPR, input_image); - return get_apr_cuda_streams(aAPR, input_image); + // return get_apr_cuda_streams(aAPR, input_image); + std::vector *> input_images(3*66, &input_image); + return get_apr_cuda_multistreams(aAPR, input_images, 3); #endif } diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index ef4083b8..c624ccb9 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -326,7 +326,7 @@ class GpuProcessingTask::GpuProcessingTaskImpl { PixelData &iCpuLevels; const APRParameters &iParameters; GenInfo iAprInfo; - float iBsplineOffset; + float iBsplineOffset = 0; int iMaxLevel; // cuda stuff - memory and stream to be used @@ -377,7 +377,7 @@ public: // TODO: Remove need for passing 'levels' to GpuProcessingTask // It was used during development to control internal computation like filters, gradient, levels etc. 
but // once all is done there is no need for it anymore - GpuProcessingTaskImpl(const PixelData &inputImage, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) : + GpuProcessingTaskImpl(const PixelData &inputImage, PixelData &levels, const APRParameters ¶meters, int maxLevel) : iCpuImage(inputImage), iCpuLevels(levels), iStream(cudaStream.get()), @@ -387,7 +387,6 @@ public: local_scale_temp2 (levels, iStream), iParameters(parameters), iAprInfo(iCpuImage.getDimension()), - iBsplineOffset(bspline_offset), iMaxLevel(maxLevel), cudax(transferSpline(prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance), iStream)), cuday(transferSpline(prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance), iStream)), @@ -490,12 +489,14 @@ public: lacs.y_vec.copy(y_vec); } + void setBsplineOffset(float offset) {iBsplineOffset = offset;} + ~GpuProcessingTaskImpl() {} }; template -GpuProcessingTask::GpuProcessingTask(const PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) -: impl{new GpuProcessingTaskImpl(image, levels, parameters, bspline_offset, maxLevel)} { } +GpuProcessingTask::GpuProcessingTask(const PixelData &image, PixelData &levels, const APRParameters ¶meters, int maxLevel) +: impl{new GpuProcessingTaskImpl(image, levels, parameters, maxLevel)} { } template GpuProcessingTask::~GpuProcessingTask() { } @@ -509,6 +510,9 @@ LinearAccessCudaStructs GpuProcessingTask::getDataFromGpu() {return imp template void GpuProcessingTask::processOnGpu() {impl->processOnGpu();} +template +void GpuProcessingTask::setBsplineOffset(float offset) {impl->setBsplineOffset(offset);} + // explicit instantiation of handled types template class GpuProcessingTask; template class GpuProcessingTask; diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp index 82fa06c8..8e3d3d7d 100644 --- a/src/algorithm/ComputeGradientCuda.hpp +++ b/src/algorithm/ComputeGradientCuda.hpp @@ -42,12 +42,14 @@ class GpuProcessingTask { public: - GpuProcessingTask(const PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel); + GpuProcessingTask(const PixelData &image, PixelData &levels, const APRParameters ¶meters, int maxLevel); ~GpuProcessingTask(); GpuProcessingTask(GpuProcessingTask&&); LinearAccessCudaStructs getDataFromGpu(); void processOnGpu(); + + void setBsplineOffset(float bspline_offset); }; #endif //LIBAPR_COMPUTEGRADIENTCUDA_HPP diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 2cb575fc..a86edbdc 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -339,7 +339,8 @@ namespace { // Calculate pipeline on GPU timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, bspline_offset, maxLevel); + GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, maxLevel); + gpt.setBsplineOffset(bspline_offset); gpt.processOnGpu(); auto linearAccessGpu = gpt.getDataFromGpu(); giGpu.total_number_particles = linearAccessGpu.y_vec.size(); From 0ba2f45f6cd95190f1bcd6df84720f63165ec8c5 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 29 Aug 2025 15:21:24 +0200 Subject: [PATCH 72/80] Fixes needed by CUDA 13.0 - now code compiles also on CUDA 12.x --- CMakeLists.txt | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index de7963a4..c218f02e 100644 --- a/CMakeLists.txt +++ 
b/CMakeLists.txt @@ -209,16 +209,29 @@ set_property(TARGET aprObjLib PROPERTY POSITION_INDEPENDENT_CODE ON) if(APR_USE_CUDA) message(STATUS "APR: Building CUDA for APR") - set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc") +# set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc") set(CMAKE_CUDA_STANDARD 17) + if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") + set(CMAKE_CUDA_ARCHITECTURES native) + endif() + set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_CUDA_RUNTIME_LIBRARY "Static") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -rdc=true --fmad=false --default-stream per-thread -Wno-deprecated-gpu-targets -Xptxas -v -DAPR_USE_CUDA") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=false --default-stream per-thread -Wno-deprecated-gpu-targets -Xptxas -v -DAPR_USE_CUDA") + set(CMAKE_CUDA_FLAGS_RELEASE "-O3") # -lineinfo for profiling set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g -G") if(APR_BENCHMARK) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DAPR_BENCHMARK") endif() enable_language(CUDA) + + if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 13.0.0) + # For CUDA versions > 13.0.0 keep compatibility with older CUDAs - new CUDA introduce changes that + # impact ELF visibility and linkage for __global__ functions and device variables + set(CMAKE_CUDA_FLAGS "--device-entity-has-hidden-visibility=false -static-global-template-stub=false ${CMAKE_CUDA_FLAGS}") + endif () + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DAPR_USE_CUDA") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DAPR_USE_CUDA") set(APR_CUDA_SOURCE_FILES @@ -243,8 +256,9 @@ if(APR_BUILD_STATIC_LIB) # generate static library used as a intermediate step in generating fat lib set(STATIC_TARGET_NAME staticLib) add_library(${STATIC_TARGET_NAME} STATIC $ ${APR_CUDA_SOURCE_FILES}) + set_property(TARGET ${STATIC_TARGET_NAME} PROPERTY CUDA_SEPARABLE_COMPILATION ON) set_property(TARGET ${STATIC_TARGET_NAME} PROPERTY CUDA_ARCHITECTURES OFF) - target_compile_features(${STATIC_TARGET_NAME} PUBLIC cxx_std_14) + target_compile_features(${STATIC_TARGET_NAME} PUBLIC cxx_std_17) set_target_properties(${STATIC_TARGET_NAME} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) set_target_properties(${STATIC_TARGET_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION OFF) target_include_directories(${STATIC_TARGET_NAME} PUBLIC $ $) @@ -266,6 +280,7 @@ if(APR_BUILD_SHARED_LIB) set(SHARED_TARGET_NAME sharedLib) add_library(${SHARED_TARGET_NAME} SHARED $ ${APR_CUDA_SOURCE_FILES}) set_property(TARGET ${SHARED_TARGET_NAME} PROPERTY CUDA_ARCHITECTURES OFF) + set_property(TARGET ${SHARED_TARGET_NAME} PROPERTY CUDA_SEPARABLE_COMPILATION ON) target_include_directories(${SHARED_TARGET_NAME} PUBLIC $ $) set_target_properties(${SHARED_TARGET_NAME} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) set_target_properties(${SHARED_TARGET_NAME} PROPERTIES LIBRARY_OUTPUT_NAME ${LIBRARY_NAME}) From 0a44736552536d2837c4001efdc882db7b45d8c9 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 4 Sep 2025 15:57:31 +0200 Subject: [PATCH 73/80] CUDA version of getMinMax added (for finding bspline offset) --- src/algorithm/ComputeGradientCuda.cu | 44 +++++++++ src/algorithm/ComputeGradientCuda.hpp | 1 + src/algorithm/findMinMax.cuh | 134 ++++++++++++++++++++++++++ test/ComputeGradientCudaTest.cpp | 48 +++++++++ 4 files changed, 227 insertions(+) create mode 100644 src/algorithm/findMinMax.cuh diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index c624ccb9..3805da52 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ 
b/src/algorithm/ComputeGradientCuda.cu @@ -26,6 +26,7 @@ #include "bsplineXdir.cuh" #include "bsplineYdir.cuh" #include "bsplineZdir.cuh" +#include "findMinMax.cuh" #include "data_structures/APR/access/GenInfoGpuAccess.cuh" @@ -653,3 +654,46 @@ void cudaDownsampledGradient(PixelData &input, PixelData &grad, co runKernelGradient(cudaInput.get(), cudaGrad.get(), input.getDimension(), grad.getDimension(), hx, hy, hz, aStream); } + + +template +std::pair cudaRunMinMax(PixelData &input_image) { + cudaStream_t aStream = nullptr; + + // Copy CPU image to CUDA mem + ScopedCudaMemHandler, H2D> cudaImage(input_image, aStream); + + // In nvidia GPUs maximum number of threads per SM is multiplication of 512 (usually 1536 or 2048) + // Calculate number of blocks to saturate whole SMs + // Multiply it by 8 to have more smaller blocks to have better load balancing in case GPU is busy with other tasks + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + const int smCount = deviceProp.multiProcessorCount; + const int numOfThreadsPerSM = deviceProp.maxThreadsPerMultiProcessor; + constexpr int numOfThreads = 512; + const int numOfBlocksPerSM = numOfThreadsPerSM / 512; + const int maxNumberOfBlocks = smCount * numOfBlocksPerSM * 8; + const size_t numOfElements = input_image.getDimension().size(); + int numOfBlocks = std::min(maxNumberOfBlocks, static_cast((numOfElements + numOfThreads -1) / numOfThreads) ); + + // Allocate memory for results both for CPU and GPU + VectorData minVector(true); + VectorData maxVector(true); + minVector.resize(numOfBlocks); + maxVector.resize(numOfBlocks); + ScopedCudaMemHandler resultsMin(minVector.data(), numOfBlocks, aStream); + ScopedCudaMemHandler resultsMax(maxVector.data(), numOfBlocks, aStream); + + // Run kernel and copy data back to CPU + runFindMinMax(cudaImage.get(), input_image.getDimension(), aStream, resultsMin.get(), resultsMax.get(), numOfBlocks, numOfThreads); + resultsMin.copyD2H(); + resultsMax.copyD2H(); + waitForCuda(); + + // First values of minVector and maxVector contain min and max of all data + return std::pair(minVector[0], maxVector[0]); +} + +template std::pair cudaRunMinMax(PixelData &); +template std::pair cudaRunMinMax(PixelData &); + diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp index 8e3d3d7d..ac4cd7e1 100644 --- a/src/algorithm/ComputeGradientCuda.hpp +++ b/src/algorithm/ComputeGradientCuda.hpp @@ -33,6 +33,7 @@ void computeLevelsCuda(const PixelData &grad_temp, PixelData & template void getGradient(PixelData &image, PixelData &grad_temp, PixelData &local_scale_temp, PixelData &local_scale_temp2, float bspline_offset, const APRParameters &par); void cudaDownsampledGradient(PixelData &input, PixelData &grad, const float hx, const float hy, const float hz); +template std::pair cudaRunMinMax(PixelData &input_image); template class GpuProcessingTask { diff --git a/src/algorithm/findMinMax.cuh b/src/algorithm/findMinMax.cuh new file mode 100644 index 00000000..b76dbd32 --- /dev/null +++ b/src/algorithm/findMinMax.cuh @@ -0,0 +1,134 @@ +#ifndef FIND_MIN_MAX_CUH +#define FIND_MIN_MAX_CUH + +#include "misc/CudaTools.cuh" +#include + +/** + * This kernel finds the minimum and maximum values in the input data array. + * Each block processes a portion of the data and writes the minimum and maximum + * values it finds to the resultsMin and resultsMax arrays. 
+ * + * It requires 2*numOfThreads*sizeof(T) of shared memory + * + * @param in - input data + * @param len - length of input data + * @param resultsMin - output array for minimum values per block + * @param resultsMax - output array for maximum values per block + */ +template +__global__ void findMinMax(const T *in, const size_t len, T* resultsMin, T* resultsMax) { + // Compute initial indices + const int numOfThreads = blockDim.x; + size_t idx = threadIdx.x; + size_t globalIdx = blockIdx.x * blockDim.x + threadIdx.x; + + // Set pointers to shared memory for all needed buffers - use uint64_t to avoid alignment issues so all types + // used as APR like uint16_t or float etc. are aligned properly + extern __shared__ uint64_t array[]; + T *minValPerThread = reinterpret_cast(array); + T *maxValPerThread = reinterpret_cast(array) + numOfThreads; + + // Set initial values for min and max + minValPerThread[idx] = cuda::std::numeric_limits::max(); + maxValPerThread[idx] = cuda::std::numeric_limits::min(); + + // Read from global memory and compute min and max + for (size_t i = globalIdx; i < len; i += gridDim.x * blockDim.x) { + auto val = in[i]; + if (val < minValPerThread[idx]) minValPerThread[idx] = val; + if (val > maxValPerThread[idx]) maxValPerThread[idx] = val; + } + + // Wait for all threads in the block to finish + __syncthreads(); + + // First thread should go through the shared memory and find the global min and max + // All that work is done only by a single thread but it is fast enough to keep it simple + if (idx == 0) { + T globalMin = minValPerThread[0]; + T globalMax = maxValPerThread[0]; + for (int i = 1; i < numOfThreads; ++i) { + auto vmin = minValPerThread[i]; + if (vmin < globalMin) globalMin = vmin; + auto vmax = maxValPerThread[i]; + if (vmax > globalMax) globalMax = vmax; + } + + // Store results to global memory + resultsMin[blockIdx.x] = globalMin; + resultsMax[blockIdx.x] = globalMax; + } +} + +/** + * This kernel takes the intermediate min and max results from each block and computes the final + * minimum and maximum values across all blocks. Results are stored in the first element of resultsMin and resultsMax. + * + * This kernel requires 2*numOfBlocks*sizeof(T) of shared memory. + * + * @param resultsMin - intermediate minimum values from each block + * @param resultsMax - intermediate maximum values from each block + * @param numOfBlocks - number of blocks used in 'findMinMax' kernel (size of resultsMin and resultsMax) + */ +template +__global__ void findMinMaxFinal(T* resultsMin, T* resultsMax, int numOfBlocks) { + + // Set pointers to shared memory for all needed buffers - use uint64_t to avoid alignment issues so all types + // used as APR like uint16_t or float etc.
are aligned properly + extern __shared__ uint64_t array2[]; + T *minValPerThread = reinterpret_cast(array2); + T *maxValPerThread = reinterpret_cast(array2) + numOfBlocks; + + size_t idx = threadIdx.x; + + // Read all data with all threads to shared memory + for (size_t i = idx; i < numOfBlocks; i += blockDim.x) { + minValPerThread[i] = resultsMin[i]; + maxValPerThread[i] = resultsMax[i]; + } + + // Wait for all threads to finish + __syncthreads(); + + //First thread should go through the shared memory and find the global min and max + if (idx == 0) { + T globalMin = minValPerThread[0]; + T globalMax = maxValPerThread[0]; + for (int i = 1; i < numOfBlocks; ++i) { + auto vmin = minValPerThread[i]; + if (vmin < globalMin) globalMin = vmin; + auto vmax = maxValPerThread[i]; + if (vmax > globalMax) globalMax = vmax; + } + // store results to global memory + resultsMin[0] = globalMin; + resultsMax[0] = globalMax; + } +} + + +/** + * Compute min and max values in the cudaInput array. + * + * numOfBlocks and numOfThreads are computed outside of this function to allow finding the optimal values (number of SMs) + * and allocating resultsMin and resultsMax arrays only once and then reuse. + * + * @param cudaInput - input data in device memory + * @param inputDim - dimensions of the input data + * @param aStream - cuda stream to use + * @param resultsMin - output array for minimum value, should have numOfBlocks elements + * @param resultsMax - output array for maximum value, should have numOfBlocks elements + * @param numOfBlocks - number of blocks to use + * @param numOfThreads - number of threads per block + */ +template +void runFindMinMax(const T *cudaInput, PixelDataDim inputDim, cudaStream_t aStream, T* resultsMin, T* resultsMax, int numOfBlocks, int numOfThreads) { + const size_t numOfElements = inputDim.size(); + + findMinMax<<>> (cudaInput, numOfElements, resultsMin, resultsMax); + findMinMaxFinal<<<1, 1024, 2*numOfBlocks*sizeof(T), aStream>>> (resultsMin, resultsMax, numOfBlocks); +} + + +#endif diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index 588c5ea3..5c88dcea 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ -359,6 +359,54 @@ namespace { EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); } + + + + TEST(ComputeThreshold, TEST_FIND_MIN_MAX) { + // Sizes of input data to test + std::vector> allSizes = {{2, 1, 1}, + {146, 321, 137}, + {512, 512, 512}, + {127,1, 1}, + {129, 1, 1}}; + + for (auto &p : allSizes) { + int yLen = std::get<0>(p); + int xLen = std::get<1>(p); + int zLen = std::get<2>(p); + + + // Generate input image + using ImageType = uint16_t; + PixelData input_image = getRandInitializedMesh(yLen, xLen, zLen, 15, 20, true); + // Set whole input_image to 1001 + for (size_t i = 0; i < input_image.mesh.size(); ++i) { + input_image.mesh[i] = 1001; + } + + const int hiValue = 5000; + const int lowValue = 666; + + // Add two random pixels with some max and min value + srand((unsigned)time(0)); + int randIndexMax = rand() % input_image.mesh.size(); + input_image.mesh[randIndexMax] = hiValue; + int randIndexMin = rand() % input_image.mesh.size(); + // Make sure min and max indices are not the same + if (randIndexMin == randIndexMax) { + randIndexMin = (randIndexMin + 1) % input_image.mesh.size(); + } + input_image.mesh[randIndexMin] = lowValue; + // Print indices in case of debugging + std::cout << "Position of max and min values: " << randIndexMax << " " << randIndexMin << std::endl; + + // 
Function under test + auto res = cudaRunMinMax(input_image); + + EXPECT_EQ(res.first, lowValue); + EXPECT_EQ(res.second, hiValue); + } + } #endif // APR_USE_CUDA } From 4093552d786dcc55b5854fbf7e16d051c7c4b097 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 9 Sep 2025 13:59:49 +0200 Subject: [PATCH 74/80] Bspline Offset is now computed on GPU + copy of original image for sampling --- src/algorithm/APRConverter.hpp | 134 +-------------------------- src/algorithm/ComputeGradientCuda.cu | 96 ++++++++++++++++++- test/FullPipelineCudaTest.cpp | 1 - 3 files changed, 96 insertions(+), 135 deletions(-) diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp index 8ff3b674..a2ebd480 100644 --- a/src/algorithm/APRConverter.hpp +++ b/src/algorithm/APRConverter.hpp @@ -76,8 +76,6 @@ class APRConverter { template bool get_apr_cuda(APR &aAPR, PixelData &input_image); template - bool get_apr_cuda_streams(APR &aAPR, PixelData &input_image); - template bool get_apr_cuda_multistreams(APR &aAPR, const std::vector *> &input_images, int numOfStreams = 3); #endif @@ -406,25 +404,13 @@ inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num); computation_timer.start_timer("init_mem"); - PixelData image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full-size copy of the image) + PixelData image_temp(input_image, true /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full-size copy of the image) ///////////////////////////////// /// Pipeline //////////////////////// - // offset image by factor (this is required if there are zero areas in the background with - // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) - // Warning both of these could result in over-flow! 
- - if (std::is_floating_point::value) { - image_temp.copyFromMesh(input_image); - } else { - bspline_offset = compute_bspline_offset(input_image, par.lambda); - image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); - } GpuProcessingTask gpt(image_temp, local_scale_temp, par, aAPR.level_max()); - // std::cout << "after gpt \n"; - gpt.setBsplineOffset(bspline_offset); gpt.processOnGpu(); auto linearAccessGpu = gpt.getDataFromGpu(); @@ -462,7 +448,7 @@ inline bool APRConverter::get_apr_cuda_multistreams(APR &aAPR, const return false; } - // Reduce number of streams to number of images if there are less images than streams + // Reduce number of streams to number of images if there are fewer images than streams if (numOfImages < numOfStreams) numOfStreams = numOfImages; // Use first image to initialize the APR - all other images should have the same dimensions @@ -476,7 +462,7 @@ inline bool APRConverter::get_apr_cuda_multistreams(APR &aAPR, const std::vector> tempImages; std::cout << "allocating PixelData for " << numOfStreams << " streams" << std::endl; for (int i = 0; i < numOfStreams; ++i) { - tempImages.emplace_back(PixelData(*input_image, false /* don't copy */, true /* pinned memory */)); + tempImages.emplace_back(PixelData(*input_image, true /* copy */, true /* pinned memory */)); } ///////////////////////////////// @@ -497,22 +483,10 @@ inline bool APRConverter::get_apr_cuda_multistreams(APR &aAPR, const t.start_timer("GPU processing..."); // Saturate all the streams with first images for (int i = 0; i < numOfStreams; ++i) { - - // offset image by factor (this is required if there are zero areas in the background with - // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) - // Warning both of these could result in over-flow! 
- if (std::is_floating_point::value) { - tempImages[i].copyFromMesh(*input_images[i]); - } else { - bspline_offset = compute_bspline_offset(*input_images[i], par.lambda); - tempImages[i].copyFromMeshWithUnaryOp(*input_images[i], [=](const auto &a) { return (a + bspline_offset); }); - } std::cout << "Processing image " << i << " on stream " << i << std::endl; - gpts[i].setBsplineOffset(bspline_offset); gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[i]); } - // Main loop - get results from GPU and send new images to the streams (if any left) for (int s = 0; s < numOfImages; ++s) { int streamNum = s % numOfStreams; @@ -525,14 +499,8 @@ inline bool APRConverter::get_apr_cuda_multistreams(APR &aAPR, const // We have 'numOfImages - numOfStreams' left to process after saturating the streams with first images if (s < numOfImages - numOfStreams) { int imageToProcess = s + numOfStreams; - if (std::is_floating_point::value) { - tempImages[streamNum].copyFromMesh(*input_images[imageToProcess]); - } else { - bspline_offset = compute_bspline_offset(*input_images[imageToProcess], par.lambda); - tempImages[streamNum].copyFromMeshWithUnaryOp(*input_images[imageToProcess], [=](const auto &a) { return (a + bspline_offset); }); - } + tempImages[streamNum].copyFromMesh(*input_images[imageToProcess]); std::cout << "Processing image " << imageToProcess << " on stream " << streamNum << std::endl; - gpts[streamNum].setBsplineOffset(bspline_offset); gpts_futures[streamNum] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[streamNum]); } @@ -554,97 +522,6 @@ inline bool APRConverter::get_apr_cuda_multistreams(APR &aAPR, const std::cout << "CUDA multistream pipeline finished!\n"; return true; } - -/** - * Implementation of pipeline for GPU/CUDA and multiple streams - * NOTE: Currently only one image is processed multiple times just get an idea how fast it can be. - * Finally, it should be able to process incoming stream of data (sequence of images). - * - * @param aAPR - the APR data structure - * @param input_image - input image - */ -template template -inline bool APRConverter::get_apr_cuda_streams(APR &aAPR, PixelData& input_image) { - // Initialize APR and memory for the pipeline - if (!initPipelineAPR(aAPR, input_image)) return false; - initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num); - PixelData image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full size copy of the image) - - ///////////////////////////////// - /// Pipeline - ///////////////////////////////// - - // offset image by factor (this is required if there are zero areas in the background with - // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) - // Warning both of these could result in over-flow! 
- if (std::is_floating_point::value) { - image_temp.copyFromMesh(input_image); - } else { - bspline_offset = compute_bspline_offset(input_image, par.lambda); - image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); - } - - // Run input on the GPU streams - constexpr int numOfStreams = 3; // number of streams to use for parallel processing - constexpr int repetitionsPerStream = 3; // number of repetitions per stream to simulate processing of multiple images - - APRTimer ttt(true); - - ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY"); - { - APRTimer t(true); - std::vector> gpts; - - t.start_timer("Creating GPTS"); - std::vector> gpts_futures; gpts_futures.resize(numOfStreams); - for (int i = 0; i < numOfStreams; ++i) { - gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, aAPR.level_max())); - } - t.stop_timer(); - - t.start_timer("-----------------------------> Whole GPU pipeline with repetitions"); - { - APRTimer tt(false); - // Run processOnGpu() asynchronously - it will handle transfering data from CPU to GPU and run whole pipeline - for (int i = 0; i < numOfStreams; ++i) { - gpts[i].setBsplineOffset(bspline_offset); - gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[i]); - } - - for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) { - int c = i % numOfStreams; - - // Get data from GpuProcessingTask - get() will block until the task is finished - gpts_futures[c].get(); - auto linearAccessGpu = gpts[c].getDataFromGpu(); - - // in theory, we get new data and send them to task - if (i < numOfStreams * (repetitionsPerStream - 1)) { - gpts[c].setBsplineOffset(bspline_offset); - gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask::processOnGpu, &gpts[c]); - } - - // Fill APR data structure with data from GPU - aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); - aAPR.linearAccess.y_vec = std::move(linearAccessGpu.y_vec); - aAPR.linearAccess.xz_end_vec = std::move(linearAccessGpu.xz_end_vec); - aAPR.linearAccess.level_xz_vec = std::move(linearAccessGpu.level_xz_vec); - - aAPR.apr_initialized = true; - } - } - auto allT = t.stop_timer(); - std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n"; - std::cout << "Bandwidth:" << (input_image.size() / (allT / (numOfStreams*repetitionsPerStream)) / 1024 / 1024) << " MB/s\n"; - } - auto allT = ttt.stop_timer(); - float tpi = allT / (numOfStreams*repetitionsPerStream); - std::cout << "Time per image: " << tpi << " seconds\n"; - std::cout << "Image size: " << (input_image.size() / 1024 / 1024) << " MB\n"; - std::cout << "Bandwidth:" << (input_image.size() / tpi / 1024 / 1024) << " MB/s\n"; - - return true; -} #endif @@ -719,8 +596,7 @@ inline bool APRConverter::get_apr(APR &aAPR, PixelData &input_imag return get_apr_cpu(aAPR, input_image); #else // return get_apr_cuda(aAPR, input_image); - // return get_apr_cuda_streams(aAPR, input_image); - std::vector *> input_images(3*66, &input_image); + std::vector *> input_images(1, &input_image); return get_apr_cuda_multistreams(aAPR, input_images, 3); #endif } diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 3805da52..0fffe458 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -289,6 +289,55 @@ void runRescaleAndThreshold(T *data, size_t len, float sigma, float sigmaMax, cu rescaleAndThreshold 
<<< numBlocks, threadsPerBlock, 0, aStream >>> (data, len, sigma, sigmaMax); } +/** + * Compute bspline offset for APRConverter of integer type ImageType + */ +template +float computeBsplineOffset(T *cudaImage, PixelDataDim dim, float lambda, int numOfBlocks, ScopedCudaMemHandler &resultsMin, ScopedCudaMemHandler &resultsMax, VectorData &minVector, VectorData &maxVector, cudaStream_t aStream) { + + // if bspline smoothing is disabled, there is no need for an offset + if(lambda <= 0) return 0; + + // Run kernel and copy data back to CPU + runFindMinMax(cudaImage, dim, aStream, resultsMin.get(), resultsMax.get(), numOfBlocks, numOfThreads); + resultsMin.copyD2H(); + resultsMax.copyD2H(); + checkCuda(cudaStreamSynchronize(aStream)); + + // compute offset to center the intensities in the ImageType range (can be negative) + float offset = (std::numeric_limits::max() - (maxVector[0] - minVector[0])) / 2 - minVector[0]; + + // clamp the offset to [-100, 100] + return std::max(std::min(offset, 100.f), -100.f); +} + + +/** + * Adds the bspline offset to every element of the input image (in place) and stores a copy of the original, un-offset values in 'copy'. + * @param input - input image data, the offset is added in place + * @param copy - output buffer receiving the original input values + * @param length - len of input/copy arrays + * @param bspline_offset - offset added to each input element + */ +template +__global__ void bsplineOffsetAndCopyOriginal(T *input, T *copy, size_t length, float bspline_offset) { + size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; + + if (idx < length) { + auto v = input[idx]; + copy[idx] = v; + input[idx] = v + bspline_offset; + } +} + +template +void runBsplineOffsetAndCopyOriginal(ImgType *cudaImage, ImgType *cudaCopy, float bspline_offset, const PixelDataDim &dim, cudaStream_t aStream) { + dim3 threadsPerBlock(128); + dim3 numBlocks((dim.size() + threadsPerBlock.x - 1)/threadsPerBlock.x); + bsplineOffsetAndCopyOriginal<<>>(cudaImage, cudaCopy, dim.size(), bspline_offset); +}; + + class CudaStream { cudaStream_t iStream; @@ -332,6 +381,7 @@ class GpuProcessingTask::GpuProcessingTaskImpl { // cuda stuff - memory and stream to be used ScopedCudaMemHandler, JUST_ALLOC> image; + ScopedCudaMemHandler, JUST_ALLOC> imageSampling; ScopedCudaMemHandler, JUST_ALLOC> gradient; ScopedCudaMemHandler, JUST_ALLOC> local_scale_temp; ScopedCudaMemHandler, JUST_ALLOC> local_scale_temp2; @@ -373,6 +423,13 @@ class GpuProcessingTask::GpuProcessingTaskImpl { GenInfoGpuAccess giga; uint64_t counter_total = 1; + // Preallocated memory for bspline shift computation + VectorData minVector{true}; + VectorData maxVector{true}; + ScopedCudaMemHandler resultsMin; + ScopedCudaMemHandler resultsMax; + int numOfBlocks; + public: // TODO: Remove need for passing 'levels' to GpuProcessingTask @@ -383,7 +440,7 @@ public: iCpuLevels(levels), iStream(cudaStream.get()), image (inputImage, iStream), + imageSampling (inputImage, iStream), gradient (levels, iStream), local_scale_temp (levels, iStream), local_scale_temp2 (levels, iStream), @@ -435,6 +493,26 @@ public: isErrorDetectedPinned.resize(1); isErrorDetectedCuda.initialize(isErrorDetectedPinned.data(), 1, iStream); + + + + // In nvidia GPUs the maximum number of threads per SM is a multiple of 512 (usually 1536 or 2048) + // Calculate number of blocks to saturate whole SMs + // Multiply it by 8 to have more, smaller blocks for better load balancing in case the GPU is busy with other tasks + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + const int smCount = deviceProp.multiProcessorCount; + const int numOfThreadsPerSM =
deviceProp.maxThreadsPerMultiProcessor; + constexpr int numOfThreads = 512; + const int numOfBlocksPerSM = numOfThreadsPerSM / 512; + const int maxNumberOfBlocks = smCount * numOfBlocksPerSM * 8; + const size_t numOfElements = inputImage.getDimension().size(); + numOfBlocks = std::min(maxNumberOfBlocks, static_cast((numOfElements + numOfThreads -1) / numOfThreads) ); + + minVector.resize(numOfBlocks); + maxVector.resize(numOfBlocks); + resultsMin.initialize(minVector.data(), numOfBlocks, iStream); + resultsMax.initialize(maxVector.data(), numOfBlocks, iStream); } LinearAccessCudaStructs getDataFromGpu() { @@ -442,6 +520,8 @@ public: } void processOnGpu() { + + // Set it and copy first before copying the image // It improves *a lot* performance even though it is needed later in computeLinearStructureCuda() iAprInfo.total_number_particles = 0; // reset total_number_particles to 0 @@ -450,6 +530,17 @@ public: image.copyH2D(); + // offset image by factor (this is required if there are zero areas in the background with + // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) + // Warning both of these could result in over-flow! + if (std::is_floating_point::value) { + iBsplineOffset = 0; + } else { + iBsplineOffset = computeBsplineOffset(image.get(), iCpuImage.getDimension(), iParameters.lambda, numOfBlocks, resultsMin, resultsMax, minVector, maxVector, iStream); + } + runBsplineOffsetAndCopyOriginal(image.get(), imageSampling.get(), iBsplineOffset /*bspline_offset*/, iCpuImage.getDimension(), iStream); + + getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetectedPinned[0], isErrorDetectedCuda, iBsplineOffset, iParameters, iStream); @@ -490,8 +581,6 @@ public: lacs.y_vec.copy(y_vec); } - void setBsplineOffset(float offset) {iBsplineOffset = offset;} - ~GpuProcessingTaskImpl() {} }; @@ -511,9 +600,6 @@ LinearAccessCudaStructs GpuProcessingTask::getDataFromGpu() {return imp template void GpuProcessingTask::processOnGpu() {impl->processOnGpu();} -template -void GpuProcessingTask::setBsplineOffset(float offset) {impl->setBsplineOffset(offset);} - // explicit instantiation of handled types template class GpuProcessingTask; template class GpuProcessingTask; diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index a86edbdc..ad7d486e 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -340,7 +340,6 @@ namespace { // Calculate pipeline on GPU timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, maxLevel); - gpt.setBsplineOffset(bspline_offset); gpt.processOnGpu(); auto linearAccessGpu = gpt.getDataFromGpu(); giGpu.total_number_particles = linearAccessGpu.y_vec.size(); From e686349d5e482230ef64bca657164b26e815b210 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 11 Sep 2025 16:12:32 +0200 Subject: [PATCH 75/80] Added some useful methods to GenInfo (size and dimension) --- src/data_structures/APR/GenInfo.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/data_structures/APR/GenInfo.hpp b/src/data_structures/APR/GenInfo.hpp index 8d5da2bd..5c46965b 100644 --- a/src/data_structures/APR/GenInfo.hpp +++ b/src/data_structures/APR/GenInfo.hpp @@ -37,7 +37,14 @@ class GenInfo { GenInfo() {} GenInfo(const PixelDataDim &dim) { init(dim); } - size_t getSize() const { return (size_t)y_num[l_max] * x_num[l_max] * z_num[l_max]; 
} + /* Returns the size of the original image at a given level */ + size_t getSize(int level) const { return (size_t)y_num[level] * x_num[level] * z_num[level]; } + /* Returns the size of the original image at max level */ + size_t getSize() const { return getSize(l_max); } + /* Returns the dimensions of the original image at a given level */ + PixelDataDim getDimension(int level) const { return PixelDataDim(y_num[level], x_num[level], z_num[level]); } + /* Returns the dimensions of the original image at max level */ + PixelDataDim getDimension() const { return getDimension(l_max); } //initialize the information given the original dimensions void init(const PixelDataDim &dim) { From 1db4abf0290ba274f476530b5e3914226004794c Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 12 Sep 2025 16:09:59 +0200 Subject: [PATCH 76/80] Sampling on GPU added - THIS CODE is still being cleaned up - it is pushed for backup reasons only --- examples/Example_get_apr.cpp | 9 ++ src/algorithm/APRConverter.hpp | 8 +- src/algorithm/ComputeGradientCuda.cu | 112 +++++++++++++++++- src/algorithm/ComputeGradientCuda.hpp | 2 +- .../APR/access/LinearAccessCuda.cu | 12 +- .../APR/access/LinearAccessCuda.hpp | 17 ++- test/FullPipelineCudaTest.cpp | 2 +- test/LinearAccessCudaTest.cpp | 6 +- 8 files changed, 152 insertions(+), 16 deletions(-) diff --git a/examples/Example_get_apr.cpp b/examples/Example_get_apr.cpp index bfbaf31d..7bf61887 100644 --- a/examples/Example_get_apr.cpp +++ b/examples/Example_get_apr.cpp @@ -81,6 +81,15 @@ int runAPR(cmdLineOptions options) { ParticleData particle_intensities; particle_intensities.sample_image(apr, input_img); // sample your particles from your image //Below is IO and outputting of the Implied Resolution Function through the Particle Cell level. + std::cout << apr.linearAccess.y_vec.size() << " particles in APR" << std::endl; + std::cout << particle_intensities.size() << " intensities in CPU in APR" << std::endl; + std::cout << aprConverter.parts.size() << " intensities in GPU in APR" << std::endl; + + for (int i = 0 ; i < particle_intensities.size(); ++i) { + if (particle_intensities[i] != aprConverter.parts[i]) { + std::cout << "Mismatch at " << i << " CPU: " << particle_intensities[i] << " GPU: " << aprConverter.parts[i] << std::endl; + } + } //output std::string save_loc = options.output_dir; diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp index a2ebd480..9c1abd76 100644 --- a/src/algorithm/APRConverter.hpp +++ b/src/algorithm/APRConverter.hpp @@ -66,6 +66,10 @@ class APRConverter { APRTimer computation_timer; APRParameters par; + // TODO: this is a temporary place to put particle intensity data. It should be rethought how to move it from the GPU + // but for now and for tests this is the best place.
+ VectorData parts; + template bool get_apr(APR &aAPR, PixelData &input_image); @@ -420,6 +424,7 @@ inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec); aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec); aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec); + parts.copy(linearAccessGpu.parts); aAPR.apr_initialized = true; std::cout << "CUDA pipeline finished!\n"; @@ -509,6 +514,7 @@ inline bool APRConverter::get_apr_cuda_multistreams(APR &aAPR, const aAPR.linearAccess.y_vec = std::move(linearAccessGpu.y_vec); aAPR.linearAccess.xz_end_vec = std::move(linearAccessGpu.xz_end_vec); aAPR.linearAccess.level_xz_vec = std::move(linearAccessGpu.level_xz_vec); + parts = std::move(linearAccessGpu.parts); aAPR.apr_initialized = true; } @@ -596,7 +602,7 @@ inline bool APRConverter::get_apr(APR &aAPR, PixelData &input_imag return get_apr_cpu(aAPR, input_image); #else // return get_apr_cuda(aAPR, input_image); - std::vector *> input_images(1, &input_image); + std::vector *> input_images(3*11, &input_image); return get_apr_cuda_multistreams(aAPR, input_images, 3); #endif } diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 0fffe458..010f7fc9 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -338,6 +338,62 @@ void runBsplineOffsetAndCopyOriginal(ImgType *cudaImage, ImgType *cudaCopy, floa }; +template +__global__ void printKernel(T *input, size_t length) { + printf("DOWNSAMPLED: "); + for (int i = 0; i < length; i++) printf("%d ", input[i]); + printf("\n"); +} + +template +void runPrint(ImgType *cudaImage, size_t length, cudaStream_t aStream) { + printKernel<<<1,1, 0, aStream>>>(cudaImage, length); +}; + + +template +__global__ void sampleKernel(T *downsampledLevel, T *parts_cuda, int level, int xLen, int yLen, int zLen, uint64_t *level_xz_vec_cuda, uint64_t *xz_end_vec_cuda, uint16_t *y_vec) { + const int xi = (blockIdx.x * blockDim.x) + threadIdx.x; + const int zi = (blockIdx.z * blockDim.z) + threadIdx.z; + if (xi >= xLen || zi >= zLen) return; + uint64_t level_start = level_xz_vec_cuda[level]; + uint64_t offset = xi + zi * xLen; + auto xz_start = level_start + offset; + + auto begin_index = xz_end_vec_cuda[xz_start - 1]; + auto end_index = xz_end_vec_cuda[xz_start]; + + for (size_t idx = begin_index; idx < end_index; ++idx) { + int y = y_vec[idx]; + size_t imageIdx = zi * xLen * yLen + xi * yLen + y; + parts_cuda[idx] = downsampledLevel[imageIdx]; + } +} + +template +void runSampleParts(ImgType** downsampled, GenInfo &aprInfo, ImgType *parts_cuda, uint64_t *level_xz_vec_cuda, uint64_t *xz_end_vec_cuda, uint16_t *y_vec, cudaStream_t aStream) { + // std::cout << aprInfo << std::endl; + // Run kernels for each level + for (int level = aprInfo.l_min; level <= aprInfo.l_max; level++) { + // std::cout << "Processing level " << level << std::endl; + dim3 threadsPerBlock(128, 1, 8); + dim3 numBlocks((aprInfo.x_num[level] + threadsPerBlock.x - 1) / threadsPerBlock.x, + 1, + (aprInfo.z_num[level] + threadsPerBlock.z - 1) / threadsPerBlock.z); + // std::cout << downsampled[level] << std::endl; + // std::cout << parts_cuda << std::endl; + // std::cout << aprInfo.x_num[level] << std::endl; + // std::cout << aprInfo.y_num[level] << std::endl; + // std::cout << aprInfo.z_num[level] << std::endl; + // std::cout << level_xz_vec_cuda << std::endl; + // std::cout << xz_end_vec_cuda << std::endl; + // std::cout << y_vec << std::endl; + 
sampleKernel<<>>(downsampled[level], parts_cuda, level, aprInfo.x_num[level], aprInfo.y_num[level], aprInfo.z_num[level], level_xz_vec_cuda, xz_end_vec_cuda, y_vec); + } + +}; + + class CudaStream { cudaStream_t iStream; @@ -407,7 +463,7 @@ class GpuProcessingTask::GpuProcessingTaskImpl { ParticleCellTreeCuda pctc; ScopedCudaMemHandler y_vec_cuda; // for LinearAccess - LinearAccessCudaStructs lacs; + LinearAccessCudaStructs lacs; // Padded memory for local_scale_temp and local_scale_temp2 ScopedCudaMemHandler lstPadded; @@ -422,6 +478,8 @@ class GpuProcessingTask::GpuProcessingTaskImpl { ScopedCudaMemHandler level_xz_vec_cuda; //(level_xz_vec.data(), level_xz_vec.size(), aStream); GenInfoGpuAccess giga; uint64_t counter_total = 1; + VectorData parts; + ScopedCudaMemHandler parts_cuda; // Preallocated memory for bspline shift computation VectorData minVector{true}; @@ -455,11 +513,13 @@ public: boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num}, boundary{nullptr, boundaryLen, iStream}, pctc(iAprInfo, iStream), - y_vec_cuda(nullptr, iAprInfo.getSize(), iStream), + y_vec_cuda(nullptr, iAprInfo.getSize()/2, iStream), // TODO: only half capacity xz_end_vec(true), level_xz_vec(true), y_vec(true), - giga(iAprInfo, iStream) + giga(iAprInfo, iStream), + parts(true), + parts_cuda(nullptr, iAprInfo.getSize()/2, iStream) // TODO: only half capacity { splineCudaX = cudax.first; splineCudaY = cuday.first; @@ -491,6 +551,9 @@ public: xz_end_vec_cuda.initialize(xz_end_vec.data(), xz_end_vec.size(), iStream); level_xz_vec_cuda.initialize(level_xz_vec.data(), level_xz_vec.size(), iStream); + parts.resize(iAprInfo.getSize()); // resize it to worst case -> same number particles as pixels in input image + + isErrorDetectedPinned.resize(1); isErrorDetectedCuda.initialize(isErrorDetectedPinned.data(), 1, iStream); @@ -515,7 +578,34 @@ public: resultsMax.initialize(maxVector.data(), numOfBlocks, iStream); } - LinearAccessCudaStructs getDataFromGpu() { + void sample() { + // Prepare memory for downsampled pyramid + // Use 'image' as a memory for all levels (but max one) + // since data there is 'destroyed' anyway + // via bspline filtering and gradient computation + // and as the highest level of pyramid use imageSampling which is + // a copy of original image at full resolution + int l_max = iAprInfo.l_max; + int l_min = iAprInfo.l_min; + ImgType* downsampled[l_max + 1]; + downsampled[l_max] = imageSampling.get(); + size_t levelOffset = 0; + for (int l = l_max-1; l >= l_min; --l) { + size_t level_size = iAprInfo.x_num[l] * iAprInfo.y_num[l] * iAprInfo.z_num[l]; + // std::cout << l << " dim: " << iAprInfo.getDimension(l) << " " << iAprInfo.getSize(l) << " " << level_size << std::endl; + downsampled[l] = image.get() + levelOffset; + levelOffset += iAprInfo.getSize(l); + + runDownsampleMean(downsampled[l+1], downsampled[l], iAprInfo.x_num[l+1], iAprInfo.y_num[l+1], iAprInfo.z_num[l+1], iStream); + } + + // VectorData xz_end_vec; + // VectorData level_xz_vec; + // VectorData y_vec; + runSampleParts(downsampled, iAprInfo, parts_cuda.get(), level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), iStream); + } + + LinearAccessCudaStructs getDataFromGpu() { return std::move(lacs); } @@ -572,6 +662,16 @@ public: // Copy y_vec from GPU to CPU and synchronize last time - it is needed before we copy data to CPU structures checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda.get(), iAprInfo.total_number_particles * sizeof(uint16_t), 
cudaMemcpyDeviceToHost, iStream)); + + // SAMPLE under development + sample(); + parts.resize(iAprInfo.total_number_particles); + // Copy y_vec from GPU to CPU and synchronize last time - it is needed before we copy data to CPU structures + checkCuda(cudaMemcpyAsync(parts.begin(), parts_cuda.get(), iAprInfo.total_number_particles * sizeof(ImgType), cudaMemcpyDeviceToHost, iStream)); + + + + // Synchornize last time - at that moment all data from GPU is copied to CPU checkCuda(cudaStreamSynchronize(iStream)); @@ -579,6 +679,7 @@ public: lacs.xz_end_vec.copy(xz_end_vec); lacs.level_xz_vec.copy(level_xz_vec); lacs.y_vec.copy(y_vec); + lacs.parts.copy(parts); } ~GpuProcessingTaskImpl() {} @@ -595,7 +696,7 @@ template GpuProcessingTask::GpuProcessingTask(GpuProcessingTask&&) = default; template -LinearAccessCudaStructs GpuProcessingTask::getDataFromGpu() {return impl->getDataFromGpu();} +LinearAccessCudaStructs GpuProcessingTask::getDataFromGpu() {return impl->getDataFromGpu();} template void GpuProcessingTask::processOnGpu() {impl->processOnGpu();} @@ -606,6 +707,7 @@ template class GpuProcessingTask; template class GpuProcessingTask; template class GpuProcessingTask; + // ================================== TEST helpers ============== // TODO: should be moved somewhere diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp index ac4cd7e1..5a44de0f 100644 --- a/src/algorithm/ComputeGradientCuda.hpp +++ b/src/algorithm/ComputeGradientCuda.hpp @@ -47,7 +47,7 @@ class GpuProcessingTask { ~GpuProcessingTask(); GpuProcessingTask(GpuProcessingTask&&); - LinearAccessCudaStructs getDataFromGpu(); + LinearAccessCudaStructs getDataFromGpu(); void processOnGpu(); void setBsplineOffset(float bspline_offset); diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu index 5695f4c1..1e2a04b1 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.cu +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -460,7 +460,8 @@ void runFourthStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCu * - copy it back to CPU * - returns all the structure */ -LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct) { +template +LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct) { cudaStream_t aStream = nullptr; @@ -526,7 +527,7 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRPara p_map.downloadPCTfromGPU(pct); - LinearAccessCudaStructs lac; + LinearAccessCudaStructs lac; lac.y_vec.swap(y_vec); lac.xz_end_vec.swap(xz_end_vec); lac.level_xz_vec.swap(level_xz_vec); @@ -534,6 +535,13 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRPara return lac; } +// explicit instantiation of handled types +template LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct); +template LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct); +template LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct); +template LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct); + + void computeLinearStructureCuda(uint16_t *y_vec_cuda, uint64_t *xz_end_vec_cuda, const uint64_t 
*level_xz_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, GenInfoGpuAccess &giga, const APRParameters &apr_parameters, uint64_t counter_total, cudaStream_t aStream) { const uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2; diff --git a/src/data_structures/APR/access/LinearAccessCuda.hpp b/src/data_structures/APR/access/LinearAccessCuda.hpp index 69ce4ebe..fb45130d 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.hpp +++ b/src/data_structures/APR/access/LinearAccessCuda.hpp @@ -6,16 +6,27 @@ #include "data_structures/APR/GenInfo.hpp" #include "algorithm/ParticleCellTreeCuda.cuh" -typedef struct { +template +struct LinearAccessCudaStructs { VectorData y_vec; VectorData xz_end_vec; VectorData level_xz_vec; -} LinearAccessCudaStructs; + + // temporarily added + VectorData parts; +}; + +// explicit instantiation of handled types +template class LinearAccessCudaStructs; +template class LinearAccessCudaStructs; +template class LinearAccessCudaStructs; +template class LinearAccessCudaStructs; #include "data_structures/APR/access/GenInfoGpuAccess.cuh" // This is for testing purposes only -LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct); +template +LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct); void computeLinearStructureCuda(uint16_t *y_vec_cuda, uint64_t *xz_end_vec_cuda, const uint64_t *level_xz_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, GenInfoGpuAccess &giga, const APRParameters &apr_parameters, uint64_t counter_total, cudaStream_t aStream); diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index ad7d486e..9500545e 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -263,7 +263,7 @@ namespace { getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); auto pct = computeOvpcCuda(local_scale_temp_GPU, giGpu); - auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pct); + auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pct); timer.stop_timer(); // Compare GPU vs CPU - expect exactly same result diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp index eb91e7bd..2c7026c2 100644 --- a/test/LinearAccessCudaTest.cpp +++ b/test/LinearAccessCudaTest.cpp @@ -272,7 +272,7 @@ TEST(LinearAccessCudaTest, optimizationForSmallLevels) { par.neighborhood_optimization = true; // --- Method under test - auto linearAccess = initializeLinearStructureCuda(gi, par, pct); + auto linearAccess = initializeLinearStructureCuda(gi, par, pct); // ---- Verify output std::vector expected_y_vec = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; // all 'y' particles for each xz @@ -312,7 +312,7 @@ TEST(LinearAccessCudaTest, optimizationForSmallLevelsVScpu) { // --- Method under test linearAccess.initialize_linear_structure(par, pct); - auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pctGpu); + auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pctGpu); EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(linearAccessGpu.xz_end_vec, linearAccess.xz_end_vec), 0); @@ -359,7 +359,7 @@ TEST(LinearAccessCudaTest, testGPUvsCPUforDifferentSizes) { t.stop_timer(); t.start_timer("_________________________ GPU"); - auto 
linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pctGpu); + auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pctGpu); t.stop_timer(); From 8415a528db1778b33f7a4fff5215d861648f91c4 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 10 Nov 2025 12:15:19 +0100 Subject: [PATCH 77/80] Reverting CUDA-pipeline related changes --- examples/Example_get_apr.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/examples/Example_get_apr.cpp b/examples/Example_get_apr.cpp index 7bf61887..bfbaf31d 100644 --- a/examples/Example_get_apr.cpp +++ b/examples/Example_get_apr.cpp @@ -81,15 +81,6 @@ int runAPR(cmdLineOptions options) { ParticleData particle_intensities; particle_intensities.sample_image(apr, input_img); // sample your particles from your image //Below is IO and outputting of the Implied Resolution Function through the Particle Cell level. - std::cout << apr.linearAccess.y_vec.size() << " particles in APR" << std::endl; - std::cout << particle_intensities.size() << " intensities in CPU in APR" << std::endl; - std::cout << aprConverter.parts.size() << " intensities in GPU in APR" << std::endl; - - for (int i = 0 ; i < particle_intensities.size(); ++i) { - if (particle_intensities[i] != aprConverter.parts[i]) { - std::cout << "Mismatch at " << i << " CPU: " << particle_intensities[i] << " GPU: " << aprConverter.parts[i] << std::endl; - } - } //output std::string save_loc = options.output_dir; From 629f80668a806362405798fb0911cb80f3c5e4cb Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 10 Nov 2025 17:08:15 +0100 Subject: [PATCH 78/80] Initial draft of Example_get_multiapr --- examples/CMakeLists.txt | 1 + examples/Example_get_multiapr.cpp | 203 ++++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 examples/Example_get_multiapr.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 5837784a..e4d3ca99 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -15,6 +15,7 @@ buildTarget(Example_apr_filter) buildTarget(Example_apr_deconvolution) buildTarget(Example_random_access) buildTarget(Example_lazy_access) +buildTarget(Example_get_multiapr) #buildTarget(Example_reconstruct_patch) #The way this is working is going to be re-designed. buildTarget(Example_apr_tree) diff --git a/examples/Example_get_multiapr.cpp b/examples/Example_get_multiapr.cpp new file mode 100644 index 00000000..15dfd217 --- /dev/null +++ b/examples/Example_get_multiapr.cpp @@ -0,0 +1,203 @@ +const char* usage = R"( +Form the APR form images: Takes an uint16_t input tiff images and forms the APRs and saves it as hdf5. +The hdf5 output of this program can be used with the other apr examples, and also viewed with HDFView. 
+ +Usage: +====== +Example_get_multiapr -d input_directory [-od output_direcotry] + +Additional settings (High Level): +================================= +-I_th intensity_threshold (will ignore areas of image below this threshold, useful for removing camera artifacts or auto-fluorescence) +-sigma_th lower threshold for the local intensity scale +-grad_th ignore areas in the image where the gradient magnitude is lower than this value + +Advanced (Direct) Settings: +=========================== +-lambda lambda_value (directly set the value of the gradient smoothing parameter lambda (reasonable range 0.1-10, default: 3) +-rel_error rel_error_value (Reasonable ranges are from .08-.15), Default: 0.1 +-neighborhood_optimization_off turns off the neighborhood optimization (This results in boundary Particle Cells also being increased in resolution after the Pulling Scheme step) +)"; + +#include +#include +#include "ConfigAPR.h" +#include "io/APRFile.hpp" +#include "data_structures/APR/particles/ParticleData.hpp" +#include "data_structures/APR/APR.hpp" +#include "algorithm/APRConverter.hpp" + + +struct cmdLineOptions { + std::string directory = ""; + std::string output_dir = ""; + + float lambda = 3.0; + float Ip_th = 0; + float grad_th = 1; + float sigma_th = 0; + float rel_error = 0.1; + + bool neighborhood_optimization = true; +}; + +bool command_option_exists(const char **begin, const char **end, const std::string &option) +{ + return std::find(begin, end, option) != end; +} + +const char* get_command_option(const char **begin, const char **end, const std::string &option) +{ + if (const char** itr = std::find(begin, end, option); itr != end && ++itr != end) { + return *itr; + } + return nullptr; +} + +void printUsage() { + std::cerr << "APR version " << ConfigAPR::APR_VERSION << std::endl < aprConverter; + + // read in the command line options into the parameters file + aprConverter.par.input_dir = options.directory; + aprConverter.par.output_dir = options.output_dir; + + aprConverter.par.lambda = options.lambda; + aprConverter.par.Ip_th = options.Ip_th; + aprConverter.par.grad_th = options.grad_th; + aprConverter.par.sigma_th = options.sigma_th; + aprConverter.par.rel_error = options.rel_error; + + aprConverter.par.neighborhood_optimization = options.neighborhood_optimization; + + + + // TODO: read here all input files instead of options.input + PixelData input_img = TiffUtils::getMesh(options.directory + "TODO"); + + //Gets the APR + if(APR apr; aprConverter.get_apr(apr, input_img)){ + + ParticleData particle_intensities; + particle_intensities.sample_image(apr, input_img); // sample your particles from your image + +#ifdef APR_USE_CUDA + //Below is IO and outputting of the Implied Resolution Function through the Particle Cell level. 
+ std::cout << apr.linearAccess.y_vec.size() << " particles in APR" << std::endl; + std::cout << particle_intensities.size() << " intensities in CPU in APR" << std::endl; + std::cout << aprConverter.parts.size() << " intensities in GPU in APR" << std::endl; + + for (int i = 0 ; i < particle_intensities.size(); ++i) { + if (particle_intensities[i] != aprConverter.parts[i]) { + std::cout << "Mismatch at " << i << " CPU: " << particle_intensities[i] << " GPU: " << aprConverter.parts[i] << std::endl; + } + } +#endif + + + //output + std::string save_loc = options.output_dir; + // TODO Change file_name to currently processed input file and add ".apr" + std::string file_name = "TODO_fileName"; + + APRTimer timer; + + timer.verbose_flag = true; + + std::cout << std::endl; + float original_pixel_image_size = 2.0f * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2) / 1000000.0f; + std::cout << "Original image size: " << original_pixel_image_size << " MB" << std::endl; + + timer.start_timer("writing output"); + + std::cout << "Writing the APR to hdf5..." << std::endl; + + //write the APR to hdf5 file + APRFile aprFile; + + aprFile.open(save_loc + file_name + ".apr"); + + aprFile.write_apr(apr, 0, "t", false); + aprFile.write_particles("particles",particle_intensities); + + float apr_file_size = aprFile.current_file_size_MB(); + + timer.stop_timer(); + + float computational_ratio = 1.0f * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2) / (1.0f * apr.total_number_particles()); + + std::cout << std::endl; + std::cout << "Computational Ratio (Pixels/Particles): " << computational_ratio << std::endl; + std::cout << "Lossy Compression Ratio: " << original_pixel_image_size/apr_file_size << std::endl; + std::cout << std::endl; + } else { + std::cout << "Oops, something went wrong. APR not computed :(." << std::endl; + } + return 0; +} + + +int main(const int argc, const char **argv) { + const cmdLineOptions options = read_command_line_options(argc, argv); + const auto result = runAPR(options); + + return result; +} From 3a11834fa5b88fa9e191dd7aa80d290d01aee9fd Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 26 Nov 2025 16:59:16 +0100 Subject: [PATCH 79/80] Added Example_get_multiapr for CUDA parallel processing --- examples/CMakeLists.txt | 6 +- examples/Example_get_multiapr.cpp | 180 ++++++++++++++++--------- src/algorithm/APRConverter.hpp | 70 +++------- src/algorithm/ComputeGradientCuda.cu | 9 +- src/data_structures/Mesh/PixelData.hpp | 13 +- test/FullPipelineCudaTest.cpp | 1 + test/MeshDataTest.cpp | 14 ++ 7 files changed, 179 insertions(+), 114 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e4d3ca99..20422af9 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -15,7 +15,11 @@ buildTarget(Example_apr_filter) buildTarget(Example_apr_deconvolution) buildTarget(Example_random_access) buildTarget(Example_lazy_access) -buildTarget(Example_get_multiapr) + +#APR GPU Tests +if(APR_USE_CUDA) + buildTarget(Example_get_multiapr) +endif() #buildTarget(Example_reconstruct_patch) #The way this is working is going to be re-designed. buildTarget(Example_apr_tree) diff --git a/examples/Example_get_multiapr.cpp b/examples/Example_get_multiapr.cpp index 15dfd217..1bd2ef98 100644 --- a/examples/Example_get_multiapr.cpp +++ b/examples/Example_get_multiapr.cpp @@ -1,10 +1,10 @@ const char* usage = R"( -Form the APR form images: Takes an uint16_t input tiff images and forms the APRs and saves it as hdf5. 
+Converts images to APR format: Takes input directory with uint16_t input tiff images and generates the APRs and saves it as hdf5. The hdf5 output of this program can be used with the other apr examples, and also viewed with HDFView. Usage: ====== -Example_get_multiapr -d input_directory [-od output_direcotry] +Example_get_multiapr -d input_directory [-od output_directory] Additional settings (High Level): ================================= @@ -19,8 +19,12 @@ Advanced (Direct) Settings: -neighborhood_optimization_off turns off the neighborhood optimization (This results in boundary Particle Cells also being increased in resolution after the Pulling Scheme step) )"; -#include + #include +#include +#include +#include +#include #include "ConfigAPR.h" #include "io/APRFile.hpp" #include "data_structures/APR/particles/ParticleData.hpp" @@ -29,8 +33,8 @@ Advanced (Direct) Settings: struct cmdLineOptions { - std::string directory = ""; - std::string output_dir = ""; + std::string directory; + std::string output_dir; float lambda = 3.0; float Ip_th = 0; @@ -55,7 +59,7 @@ const char* get_command_option(const char **begin, const char **end, const std:: } void printUsage() { - std::cerr << "APR version " << ConfigAPR::APR_VERSION << std::endl < tif_files; + + try { + for (const auto& entry : fs::directory_iterator(directory_path)) { + if (entry.is_regular_file()) { + auto ext = entry.path().extension().string(); + if (ext == ".tif" || ext == ".tiff" || ext == ".TIF" || ext == ".TIFF") { + tif_files.push_back(entry.path()); + } + } + } + } catch (const fs::filesystem_error& e) { + std::cerr << "Filesystem error: " << e.what() << '\n'; + exit(2); + } + + return tif_files; +} + int runAPR(const cmdLineOptions &options) { - APRConverter aprConverter; + using ImgType = uint16_t; + using ImgContainer = PixelData; + + APRConverter aprConverter; // read in the command line options into the parameters file aprConverter.par.input_dir = options.directory; @@ -128,69 +157,100 @@ int runAPR(const cmdLineOptions &options) { aprConverter.par.neighborhood_optimization = options.neighborhood_optimization; - - // TODO: read here all input files instead of options.input - PixelData input_img = TiffUtils::getMesh(options.directory + "TODO"); - - //Gets the APR - if(APR apr; aprConverter.get_apr(apr, input_img)){ - - ParticleData particle_intensities; - particle_intensities.sample_image(apr, input_img); // sample your particles from your image - -#ifdef APR_USE_CUDA - //Below is IO and outputting of the Implied Resolution Function through the Particle Cell level. 
- std::cout << apr.linearAccess.y_vec.size() << " particles in APR" << std::endl; - std::cout << particle_intensities.size() << " intensities in CPU in APR" << std::endl; - std::cout << aprConverter.parts.size() << " intensities in GPU in APR" << std::endl; - - for (int i = 0 ; i < particle_intensities.size(); ++i) { - if (particle_intensities[i] != aprConverter.parts[i]) { - std::cout << "Mismatch at " << i << " CPU: " << particle_intensities[i] << " GPU: " << aprConverter.parts[i] << std::endl; - } + auto tifFiles = getTiffFilesFromDir(options.directory); + std::vector> input_images; + std::vector input_images_raw; + std::vector> APRs; + std::vector APRs_raw; + std::vector>> partIntensities; + std::vector *> partIntensities_raw; + + // Load all images from input directory, check if they have same resolution + // Also create APR and intensities objects to be filled by pipeline later + int firstOne = true; + PixelDataDim sizeOfInput; + for (const auto &file : tifFiles) { + // Read a file and store it, also keep a vector of raw pointers to read images since this is needed by APRConverter + input_images.push_back(std::make_unique(TiffUtils::getMesh(file))); + input_images_raw.push_back(input_images.back().get()); + if (firstOne) { + firstOne = false; + sizeOfInput = input_images.back().get()->getDimension(); + } + else if (input_images.back().get()->getDimension() != sizeOfInput) { + std::cerr << "Input images must have the same dimension." << std::endl; + exit(2); } -#endif - - //output - std::string save_loc = options.output_dir; - // TODO Change file_name to currently processed input file and add ".apr" - std::string file_name = "TODO_fileName"; + // We need as many APR objects as input images, and also raw pointer for APRConverter + APRs.push_back(std::make_unique(APR{})); + APRs_raw.push_back(APRs.back().get()); - APRTimer timer; + // And same for particle intensities... + partIntensities.push_back(std::make_unique>(VectorData{})); + partIntensities_raw.push_back(partIntensities.back().get()); + } - timer.verbose_flag = true; + std::cout << std::endl; + APRTimer timer(true); + timer.start_timer("GPU pipeline (mem allocation, processing, sampling) "); + if (aprConverter.get_apr_cuda_multistreams(APRs_raw, input_images_raw, partIntensities_raw)) { + timer.stop_timer(); + size_t numOfImages = input_images_raw.size(); std::cout << std::endl; - float original_pixel_image_size = 2.0f * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2) / 1000000.0f; - std::cout << "Original image size: " << original_pixel_image_size << " MB" << std::endl; - - timer.start_timer("writing output"); - - std::cout << "Writing the APR to hdf5..." 
<< std::endl; - - //write the APR to hdf5 file - APRFile aprFile; - - aprFile.open(save_loc + file_name + ".apr"); - - aprFile.write_apr(apr, 0, "t", false); - aprFile.write_particles("particles",particle_intensities); - - float apr_file_size = aprFile.current_file_size_MB(); - timer.stop_timer(); + for (size_t i = 0; i < numOfImages; i++) { + std::cout << "Postprocessing " << i+1 << "/" << numOfImages << " image...\n"; + auto &apr = *APRs[i].get(); // currently process APR + auto &particle_intensities = *partIntensities[i].get(); // intensities sampled for current APR + + + // ------------ TODO: remove me later, this is quick test for Cpu vs Gpu before real test is written + // std::cout << apr.linearAccess.y_vec.size() << " particles in APR" << std::endl; + // std::cout << particle_intensities.size() << " intensities in CPU in APR" << std::endl; + // if (apr.linearAccess.y_vec.size() != particle_intensities.size()) {std::cerr << "CPU vs GPU number of particles differ!" << std::endl;} + ParticleData particle_intensities_cpu; + particle_intensities_cpu.sample_image(apr, *input_images[i].get()); // sample your particles from your image + for (size_t j = 0 ; j < particle_intensities.size(); ++j) { + if (particle_intensities_cpu[j] != particle_intensities[j]) { + std::cout << "Mismatch at " << j << " CPU: " << particle_intensities_cpu[j] << " GPU: " << particle_intensities[j] << std::endl; + } + } + // --------------------------------------------------------------------------------------------------- + + // Output name is like base of input filename + extension ".apr" + auto outputDir = std::filesystem::path(options.output_dir); + const std::filesystem::path& p(tifFiles[i]); + std::string outpuFileName = p.stem().string() + ".apr"; + + //write the APR to hdf5 file + timer.start_timer("writing output"); + APRFile aprFile; + aprFile.open(outputDir / outpuFileName); + aprFile.write_apr(apr, 0, "t", false); + ParticleData pd; + pd.data = std::move(particle_intensities); + aprFile.write_particles("particles",pd); + timer.stop_timer(); + + // Print some output statistics + float aprImageSizeInMB = aprFile.current_file_size_MB(); + double originalImageSizeInMB = sizeof(ImgType) * static_cast(apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / 1'000'000.0; + + std::cout << "Computational Ratio (Pixels/Particles): " << apr.computational_ratio() << std::endl; + std::cout << "Original / APR image size: " << originalImageSizeInMB << " / " << aprImageSizeInMB <<" MB" << std::endl; + std::cout << "Lossy Compression Ratio: " << originalImageSizeInMB/aprImageSizeInMB << std::endl; + std::cout << std::endl; + } + } + else { + std::cout << "Oops, something went wrong. APR not computed :(" << std::endl; + } - float computational_ratio = 1.0f * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2) / (1.0f * apr.total_number_particles()); + std::cout << "DONE!\n"; - std::cout << std::endl; - std::cout << "Computational Ratio (Pixels/Particles): " << computational_ratio << std::endl; - std::cout << "Lossy Compression Ratio: " << original_pixel_image_size/apr_file_size << std::endl; - std::cout << std::endl; - } else { - std::cout << "Oops, something went wrong. APR not computed :(." 
<< std::endl; - } return 0; } diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp index 9c1abd76..7f211e53 100644 --- a/src/algorithm/APRConverter.hpp +++ b/src/algorithm/APRConverter.hpp @@ -66,9 +66,6 @@ class APRConverter { APRTimer computation_timer; APRParameters par; - // TODO: this is temporary place to put particle intensity data. It shoud be think over how to move it from GPU - // but for now and tests this is the best place. - VectorData parts; template bool get_apr(APR &aAPR, PixelData &input_image); @@ -80,7 +77,7 @@ class APRConverter { template bool get_apr_cuda(APR &aAPR, PixelData &input_image); template - bool get_apr_cuda_multistreams(APR &aAPR, const std::vector *> &input_images, int numOfStreams = 3); + bool get_apr_cuda_multistreams(std::vector &aAPRs, std::vector *> &input_images, std::vector *> intensities, int numOfStreams = 3); #endif bool verbose = true; @@ -401,37 +398,11 @@ inline bool APRConverter::get_ds(APR &aAPR) { */ template template inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input_image) { - - if (!initPipelineAPR(aAPR, input_image)) return false; - - total_timer.start_timer("full_pipeline"); - initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num); - - computation_timer.start_timer("init_mem"); - PixelData image_temp(input_image, true /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full-size copy of the image) - - ///////////////////////////////// - /// Pipeline - //////////////////////// - - GpuProcessingTask gpt(image_temp, local_scale_temp, par, aAPR.level_max()); - gpt.processOnGpu(); - auto linearAccessGpu = gpt.getDataFromGpu(); - - aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); - - // generateDatastructures(aAPR) for linearAcceess for CUDA - aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec); - aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec); - aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec); - parts.copy(linearAccessGpu.parts); - aAPR.apr_initialized = true; - - std::cout << "CUDA pipeline finished!\n"; - - total_timer.stop_timer(); - - return true; + // Use CUDA version for multistreams, feed it with just one pixel + std::vector APRs(1, &aAPR); + std::vector *> input_images(1, &input_image); + std::vector *> intensisties(1, nullptr); + return get_apr_cuda_multistreams(APRs, input_images, intensisties, 1); } #endif @@ -446,7 +417,7 @@ inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input * @param numOfStreams - number of streams to use for parallel processing on GPU */ template template -inline bool APRConverter::get_apr_cuda_multistreams(APR &aAPR, const std::vector*> &input_images, int numOfStreams) { +inline bool APRConverter::get_apr_cuda_multistreams(std::vector &aAPRs, std::vector*> &input_images, std::vector *> intensities, int numOfStreams) { int numOfImages = input_images.size(); if (numOfImages == 0) { std::cerr << "No input images provided for APR conversion." 
<< std::endl; @@ -459,13 +430,15 @@ inline bool APRConverter::get_apr_cuda_multistreams(APR &aAPR, const // Use first image to initialize the APR - all other images should have the same dimensions auto input_image = input_images[0]; - // Initialize APR and memory for the pipeline - if (!initPipelineAPR(aAPR, *input_image)) return false; + // Initialize APRs and memory for the pipeline + for (auto apr : aAPRs) { + if (!initPipelineAPR(*apr, *input_image)) return false; + } initPipelineMemory(input_image->y_num, input_image->x_num, input_image->z_num); // Create a temporary image for each stream std::vector> tempImages; - std::cout << "allocating PixelData for " << numOfStreams << " streams" << std::endl; + std::cout << "Allocating memory for " << numOfStreams << " streams." << std::endl; for (int i = 0; i < numOfStreams; ++i) { tempImages.emplace_back(PixelData(*input_image, true /* copy */, true /* pinned memory */)); } @@ -480,7 +453,7 @@ inline bool APRConverter::get_apr_cuda_multistreams(APR &aAPR, const t.start_timer("Creating GPTS"); std::vector> gpts_futures; gpts_futures.resize(numOfStreams); for (int i = 0; i < numOfStreams; ++i) { - gpts.emplace_back(GpuProcessingTask(tempImages[i], local_scale_temp, par, aAPR.level_max())); + gpts.emplace_back(GpuProcessingTask(tempImages[i], local_scale_temp, par, aAPRs[0]->level_max())); } t.stop_timer(); @@ -510,13 +483,12 @@ inline bool APRConverter::get_apr_cuda_multistreams(APR &aAPR, const } // Fill APR data structure with data from GPU - aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); - aAPR.linearAccess.y_vec = std::move(linearAccessGpu.y_vec); - aAPR.linearAccess.xz_end_vec = std::move(linearAccessGpu.xz_end_vec); - aAPR.linearAccess.level_xz_vec = std::move(linearAccessGpu.level_xz_vec); - parts = std::move(linearAccessGpu.parts); - - aAPR.apr_initialized = true; + aAPRs[s]->aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); + aAPRs[s]->linearAccess.y_vec = std::move(linearAccessGpu.y_vec); + aAPRs[s]->linearAccess.xz_end_vec = std::move(linearAccessGpu.xz_end_vec); + aAPRs[s]->linearAccess.level_xz_vec = std::move(linearAccessGpu.level_xz_vec); + aAPRs[s]->apr_initialized = true; + if (intensities[s] != nullptr) *intensities[s] = std::move(linearAccessGpu.parts); } auto allT = t.stop_timer(); @@ -601,9 +573,7 @@ inline bool APRConverter::get_apr(APR &aAPR, PixelData &input_imag #ifndef APR_USE_CUDA return get_apr_cpu(aAPR, input_image); #else - // return get_apr_cuda(aAPR, input_image); - std::vector *> input_images(3*11, &input_image); - return get_apr_cuda_multistreams(aAPR, input_images, 3); + return get_apr_cuda(aAPR, input_image); #endif } diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 010f7fc9..9336c4e4 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -513,13 +513,13 @@ public: boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num}, boundary{nullptr, boundaryLen, iStream}, pctc(iAprInfo, iStream), - y_vec_cuda(nullptr, iAprInfo.getSize()/2, iStream), // TODO: only half capacity + y_vec_cuda(nullptr, iAprInfo.getSize() < 64 ? 
64 : iAprInfo.getSize()/2, iStream), // TODO: only half capacity (64: since for levels <= 2 we compute all particles so we have in worst case at least (2^2)^3(dim) particles) xz_end_vec(true), level_xz_vec(true), y_vec(true), giga(iAprInfo, iStream), parts(true), - parts_cuda(nullptr, iAprInfo.getSize()/2, iStream) // TODO: only half capacity + parts_cuda(nullptr, iAprInfo.getSize() < 64 ? 64 : iAprInfo.getSize()/2, iStream) // TODO: only half capacity (64: since for levels <= 2 we compute all particles so we have in worst case at least (2^2)^3(dim) particles) { splineCudaX = cudax.first; splineCudaY = cuday.first; @@ -660,6 +660,11 @@ public: // Trim buffer to calculated size (initially it is allocated to worst case - same number of particles as pixels in input image) and copy data from GPU y_vec.resize(iAprInfo.total_number_particles); // Copy y_vec from GPU to CPU and synchronize last time - it is needed before we copy data to CPU structures + std::cout << y_vec.size() << "\n"; + std::cout << iAprInfo.total_number_particles << "\n"; + std::cout << iStream << "\n"; + std::cout << y_vec_cuda.getSize() << std::endl; + std::cout << "----------" << std::endl; checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda.get(), iAprInfo.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, iStream)); diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index c1ef8215..e50f690d 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -34,7 +34,7 @@ struct PixelDataDim { size_t x; size_t z; - constexpr PixelDataDim(size_t y, size_t x, size_t z) : y(y), x(x), z(z) {} + constexpr PixelDataDim(size_t y = 0, size_t x = 0, size_t z = 0) : y(y), x(x), z(z) {} size_t size() const { return y * x * z; } size_t maxDimSize() const { return std::max(x, std::max(y, z)); } @@ -520,6 +520,17 @@ public : return {static_cast(y_num), static_cast(x_num), static_cast(z_num)}; } + /** + * Returns downampled dimensions of PixelData + */ + PixelDataDim getDimensionDS() const { + const int z_num_ds = ceil(1.0*z_num/2.0); + const int x_num_ds = ceil(1.0*x_num/2.0); + const int y_num_ds = ceil(1.0*y_num/2.0); + + return {static_cast(y_num_ds), static_cast(x_num_ds), static_cast(z_num_ds)}; + } + /** * Creates copy of this mesh converting each element to new type * @tparam U new type of mesh diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 9500545e..3ffda9a2 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -285,6 +285,7 @@ namespace { constexpr PixelDataDim dim1{4, 4, 3}; constexpr PixelDataDim dim2{1024,512,512}; for (int d = 0; d <= 3; d++) { + std::cout << "#TEST: " << d << "\n"; auto &dim = (d % 2 == 0) ? dim1 : dim2; PixelData input_image = (d / 2 == 0) ? 
getRandInitializedMesh(dim, 13) : getMeshWithBlobInMiddle(dim); diff --git a/test/MeshDataTest.cpp b/test/MeshDataTest.cpp index 20b1bbe3..b4b341c8 100644 --- a/test/MeshDataTest.cpp +++ b/test/MeshDataTest.cpp @@ -92,6 +92,20 @@ namespace { ASSERT_EQ(z.numOfDimensions(), 1); ASSERT_EQ(w.numOfDimensions(), 0); } + { // size provided - test downsampled size + PixelData md(10, 20, 30); + auto ds = md.getDimensionDS(); + ASSERT_EQ(ds.y, 5); + ASSERT_EQ(ds.x, 10); + ASSERT_EQ(ds.z, 15); + } + { // size provided not even numbers - test downsampled size + PixelData md(11, 23, 29); + auto ds = md.getDimensionDS(); + ASSERT_EQ(ds.y, 6); + ASSERT_EQ(ds.x, 12); + ASSERT_EQ(ds.z, 15); + } } TEST_F(VectorDataTest, InitTest) { From c20fac6023096c612d21eba84757876b5b19cb59 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 28 Nov 2025 15:44:24 +0100 Subject: [PATCH 80/80] localScaleTemp is not anymore passed to GpuProcessinTask, CPU memory init in APRConverter for GPU not needed anymore --- src/algorithm/APRConverter.hpp | 5 +-- src/algorithm/ComputeGradientCuda.cu | 57 ++++++++++++--------------- src/algorithm/ComputeGradientCuda.hpp | 2 +- src/algorithm/LocalIntensityScale.cu | 14 +++---- src/algorithm/LocalIntensityScale.cuh | 4 +- src/algorithm/LocalIntensityScale.hpp | 14 +++---- test/FullPipelineCudaTest.cpp | 3 +- 7 files changed, 44 insertions(+), 55 deletions(-) diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp index 7f211e53..437aa8ad 100644 --- a/src/algorithm/APRConverter.hpp +++ b/src/algorithm/APRConverter.hpp @@ -152,7 +152,7 @@ void APRConverter::initPipelineMemory(int y_num,int x_num,int z_num){ float not_needed; std::vector var_win; - iLocalIntensityScale.get_window_alt(not_needed, var_win, par, grad_temp); + iLocalIntensityScale.get_window_alt(not_needed, var_win, par, grad_temp.getDimension()); int padding_y = 2*std::max(var_win[0],var_win[3]); int padding_x = 2*std::max(var_win[1],var_win[4]); @@ -434,7 +434,6 @@ inline bool APRConverter::get_apr_cuda_multistreams(std::vector for (auto apr : aAPRs) { if (!initPipelineAPR(*apr, *input_image)) return false; } - initPipelineMemory(input_image->y_num, input_image->x_num, input_image->z_num); // Create a temporary image for each stream std::vector> tempImages; @@ -453,7 +452,7 @@ inline bool APRConverter::get_apr_cuda_multistreams(std::vector t.start_timer("Creating GPTS"); std::vector> gpts_futures; gpts_futures.resize(numOfStreams); for (int i = 0; i < numOfStreams; ++i) { - gpts.emplace_back(GpuProcessingTask(tempImages[i], local_scale_temp, par, aAPRs[0]->level_max())); + gpts.emplace_back(GpuProcessingTask(tempImages[i], par, aAPRs[0]->level_max())); } t.stop_timer(); diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 9336c4e4..6e849413 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -172,7 +172,7 @@ namespace { } template -void getGradientCuda(const PixelData &image, PixelData &local_scale_temp, +void getGradientCuda(const PixelData &image, ImgType *cudaImage, ImgType *cudaGrad, float *cudalocal_scale_temp, BsplineParamsCuda &px, BsplineParamsCuda &py, BsplineParamsCuda &pz, float *boundary, bool &isErrorDetected, ScopedCudaMemHandler& isErrorDetectedCuda, @@ -196,13 +196,14 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc "try squashing the input image to a narrower range or use APRConverter"); } } - runKernelGradient(cudaImage, cudaGrad, image.getDimension(), 
local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); + auto localScaleTempDim = image.getDimensionDS(); // size of downsampled input image + runKernelGradient(cudaImage, cudaGrad, image.getDimension(), localScaleTempDim, par.dx, par.dy, par.dz, aStream); runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream); if (par.lambda > 0) { - if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, localScaleTempDim.x, localScaleTempDim.y, localScaleTempDim.z, aStream); + if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, localScaleTempDim.x, localScaleTempDim.y, localScaleTempDim.z, aStream); + if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, localScaleTempDim.x, localScaleTempDim.y, localScaleTempDim.z, aStream); } } @@ -429,7 +430,7 @@ class GpuProcessingTask::GpuProcessingTaskImpl { // input data const PixelData &iCpuImage; - PixelData &iCpuLevels; + // PixelData &iCpuLevels; const APRParameters &iParameters; GenInfo iAprInfo; float iBsplineOffset = 0; @@ -438,9 +439,9 @@ class GpuProcessingTask::GpuProcessingTaskImpl { // cuda stuff - memory and stream to be used ScopedCudaMemHandler, JUST_ALLOC> image; ScopedCudaMemHandler, JUST_ALLOC> imageSampling; - ScopedCudaMemHandler, JUST_ALLOC> gradient; - ScopedCudaMemHandler, JUST_ALLOC> local_scale_temp; - ScopedCudaMemHandler, JUST_ALLOC> local_scale_temp2; + ScopedCudaMemHandler gradient; + ScopedCudaMemHandler local_scale_temp; + ScopedCudaMemHandler local_scale_temp2; // bspline stuff @@ -490,18 +491,14 @@ class GpuProcessingTask::GpuProcessingTaskImpl { public: - // TODO: Remove need for passing 'levels' to GpuProcessingTask - // It was used during development to control internal computation like filters, gradient, levels etc. 
but - // once all is done there is no need for it anymore - GpuProcessingTaskImpl(const PixelData &inputImage, PixelData &levels, const APRParameters ¶meters, int maxLevel) : + GpuProcessingTaskImpl(const PixelData &inputImage, const APRParameters ¶meters, int maxLevel) : iCpuImage(inputImage), - iCpuLevels(levels), iStream(cudaStream.get()), image (inputImage, iStream), imageSampling (inputImage, iStream), - gradient (levels, iStream), - local_scale_temp (levels, iStream), - local_scale_temp2 (levels, iStream), + gradient (nullptr, inputImage.getDimensionDS().size(), iStream), + local_scale_temp (nullptr, inputImage.getDimensionDS().size(), iStream), + local_scale_temp2 (nullptr, inputImage.getDimensionDS().size(), iStream), iParameters(parameters), iAprInfo(iCpuImage.getDimension()), iMaxLevel(maxLevel), @@ -531,7 +528,7 @@ public: // In LIS we have: var_win[0,1,2] = maximum 3 var_win[3,4,5] = maximum 6 // so maximum paddSize is 6 6 6 PixelDataDim maxPaddSize(6, 6, 6); - PixelDataDim paddedImageSize = levels.getDimension() + maxPaddSize + maxPaddSize; + PixelDataDim paddedImageSize = inputImage.getDimensionDS() + maxPaddSize + maxPaddSize; lstPadded.initialize(nullptr, paddedImageSize.size(), iStream); lst2Padded.initialize(nullptr, paddedImageSize.size(), iStream); @@ -631,22 +628,23 @@ public: runBsplineOffsetAndCopyOriginal(image.get(), imageSampling.get(), iBsplineOffset /*bspline_offset*/, iCpuImage.getDimension(), iStream); - getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), + getGradientCuda(iCpuImage, image.get(), gradient.get(), local_scale_temp.get(), splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetectedPinned[0], isErrorDetectedCuda, iBsplineOffset, iParameters, iStream); - runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), lstPadded.get(), lst2Padded.get(), iStream); + runLocalIntensityScalePipeline(iCpuImage.getDimensionDS(), iParameters, local_scale_temp.get(), local_scale_temp2.get(), lstPadded.get(), lst2Padded.get(), iStream); // Apply parameters from APRConverter: - runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream); - runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream); - runThresholdOpen(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream); + auto dimOfLevels = iCpuImage.getDimensionDS(); // size of downsampled input image + runThreshold(local_scale_temp2.get(), gradient.get(), dimOfLevels.x, dimOfLevels.y, dimOfLevels.z, iParameters.Ip_th + iBsplineOffset, iStream); + runRescaleAndThreshold(local_scale_temp.get(), dimOfLevels.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream); + runThresholdOpen(gradient.get(), gradient.get(), dimOfLevels.x, dimOfLevels.y, dimOfLevels.z, iParameters.grad_th, iStream); // TODO: automatic parameters are not implemented for GPU pipeline (yet) float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz)); float level_factor = pow(2, iMaxLevel) * min_dim; const float mult_const = level_factor/iParameters.rel_error; - runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream); + runComputeLevels(gradient.get(), local_scale_temp.get(), dimOfLevels.size(), mult_const, iStream); computeOvpcCuda(local_scale_temp.get(), 
pctc, iAprInfo, iStream); computeLinearStructureCuda(y_vec_cuda.get(), xz_end_vec_cuda.get(), level_xz_vec_cuda.get(), pctc, iAprInfo, giga, iParameters, counter_total, iStream); @@ -660,11 +658,6 @@ public: // Trim buffer to calculated size (initially it is allocated to worst case - same number of particles as pixels in input image) and copy data from GPU y_vec.resize(iAprInfo.total_number_particles); // Copy y_vec from GPU to CPU and synchronize last time - it is needed before we copy data to CPU structures - std::cout << y_vec.size() << "\n"; - std::cout << iAprInfo.total_number_particles << "\n"; - std::cout << iStream << "\n"; - std::cout << y_vec_cuda.getSize() << std::endl; - std::cout << "----------" << std::endl; checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda.get(), iAprInfo.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, iStream)); @@ -691,8 +684,8 @@ public: }; template -GpuProcessingTask::GpuProcessingTask(const PixelData &image, PixelData &levels, const APRParameters ¶meters, int maxLevel) -: impl{new GpuProcessingTaskImpl(image, levels, parameters, maxLevel)} { } +GpuProcessingTask::GpuProcessingTask(const PixelData &image, const APRParameters ¶meters, int maxLevel) +: impl{new GpuProcessingTaskImpl(image, parameters, maxLevel)} { } template GpuProcessingTask::~GpuProcessingTask() { } @@ -834,7 +827,7 @@ void getGradient(PixelData &image, PixelData &grad_temp, Pixel bool isErrorDetected = false; { ScopedCudaMemHandler isErrorDetectedCuda(&isErrorDetected, 1, aStream); - getGradientCuda(image, local_scale_temp, cudaImage.get(), cudaGrad.get(), cudalocal_scale_temp.get(), + getGradientCuda(image, cudaImage.get(), cudaGrad.get(), cudalocal_scale_temp.get(), splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetected, isErrorDetectedCuda, bspline_offset, par, aStream); } } diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp index 5a44de0f..a73c7d61 100644 --- a/src/algorithm/ComputeGradientCuda.hpp +++ b/src/algorithm/ComputeGradientCuda.hpp @@ -43,7 +43,7 @@ class GpuProcessingTask { public: - GpuProcessingTask(const PixelData &image, PixelData &levels, const APRParameters ¶meters, int maxLevel); + GpuProcessingTask(const PixelData &image, const APRParameters ¶meters, int maxLevel); ~GpuProcessingTask(); GpuProcessingTask(GpuProcessingTask&&); diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index ec2eee51..9ba6d598 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -485,12 +485,12 @@ void runConstantScale(S *image, PixelDataDim &dim, cudaStream_t aStream) { constantScale<<<1, 1, 0, aStream>>>(image, dim.size()); } -template -void runLocalIntensityScalePipeline(const PixelData &image, const APRParameters &par, S *cudaImage, S *cudaTemp, S *lstPadded, S *lst2Padded, cudaStream_t aStream) { +template +void runLocalIntensityScalePipeline(const PixelDataDim &tempImageDim, const APRParameters &par, S *cudaImage, S *cudaTemp, S *lstPadded, S *lst2Padded, cudaStream_t aStream) { float var_rescale; std::vector var_win; auto lis = LocalIntensityScale(); - lis.get_window_alt(var_rescale, var_win, par, image); + lis.get_window_alt(var_rescale, var_win, par, tempImageDim); size_t win_y = var_win[0]; size_t win_x = var_win[1]; size_t win_z = var_win[2]; @@ -508,14 +508,14 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete constant_scale = true; } - PixelDataDim imageSize = image.getDimension(); + 
PixelDataDim imageSize = tempImageDim; if (!constant_scale) { PixelDataDim paddSize(std::max(win_y, win_y2), std::max(win_x, win_x2), std::max(win_z, win_z2)); PixelDataDim paddedImageSize = imageSize + paddSize + paddSize; // padding on both ends of each dimension S *ci = cudaImage; S *ct = cudaTemp; - PixelDataDim dim = image.getDimension(); + PixelDataDim dim = tempImageDim; if (par.reflect_bc_lis) { runPaddPixels(cudaImage, lstPadded, imageSize, paddedImageSize, paddSize, aStream); @@ -544,7 +544,7 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete } } -template void runLocalIntensityScalePipeline(const PixelData&, const APRParameters&, float*, float*, float*, float*, cudaStream_t); +template void runLocalIntensityScalePipeline(const PixelDataDim &, const APRParameters&, float*, float*, float*, float*, cudaStream_t); @@ -580,6 +580,6 @@ void getLocalIntensityScale(PixelData &image, PixelData &temp, const APRPa lstPadded.initialize(nullptr, paddedImageSize.size(), aStream); lst2Padded.initialize(nullptr, paddedImageSize.size(), aStream); - runLocalIntensityScalePipeline(image, par, cudaImage.get(), cudaTemp.get(), lstPadded.get(), lst2Padded.get(), aStream); + runLocalIntensityScalePipeline(image.getDimension(), par, cudaImage.get(), cudaTemp.get(), lstPadded.get(), lst2Padded.get(), aStream); } template void getLocalIntensityScale(PixelData&, PixelData&, const APRParameters&); diff --git a/src/algorithm/LocalIntensityScale.cuh b/src/algorithm/LocalIntensityScale.cuh index 4a707d58..f7b4829a 100644 --- a/src/algorithm/LocalIntensityScale.cuh +++ b/src/algorithm/LocalIntensityScale.cuh @@ -4,7 +4,7 @@ #include "data_structures/Mesh/PixelData.hpp" #include "algorithm/APRParameters.hpp" -template -void runLocalIntensityScalePipeline(const PixelData &image, const APRParameters &par, S *cudaImage, S *cudaTemp, S *lstPadded, S *lst2Padded, cudaStream_t aStream); +template +void runLocalIntensityScalePipeline(const PixelDataDim &image, const APRParameters &par, S *cudaImage, S *cudaTemp, S *lstPadded, S *lst2Padded, cudaStream_t aStream); #endif \ No newline at end of file diff --git a/src/algorithm/LocalIntensityScale.hpp b/src/algorithm/LocalIntensityScale.hpp index e576efd5..3c3e5f50 100644 --- a/src/algorithm/LocalIntensityScale.hpp +++ b/src/algorithm/LocalIntensityScale.hpp @@ -36,7 +36,7 @@ void get_local_intensity_scale(PixelData &local_scale_temp, PixelData var_win; - get_window_alt(var_rescale, var_win, par, local_scale_temp); + get_window_alt(var_rescale, var_win, par, local_scale_temp.getDimension()); int win_y = var_win[0]; int win_x = var_win[1]; @@ -165,8 +165,7 @@ void get_local_intensity_scale(PixelData &local_scale_temp, PixelData &var_win, const APRParameters &par); - template - void get_window_alt(float& var_rescale, std::vector& var_win, const APRParameters& par, const PixelData& img); + void get_window_alt(float& var_rescale, std::vector& var_win, const APRParameters& par, const PixelDataDim &img); template void rescale_var(PixelData& var,const float var_rescale); @@ -249,8 +248,7 @@ inline void LocalIntensityScale::get_window(float& var_rescale, std::vector * @param par * @param temp_img (image already allocated to correct size to compute the local intensity scale) */ -template -inline void LocalIntensityScale::get_window_alt(float& var_rescale, std::vector& var_win, const APRParameters& par,const PixelData& temp_img){ +inline void LocalIntensityScale::get_window_alt(float& var_rescale, std::vector& var_win, const APRParameters& par,const 
PixelDataDim &temp_img){ const double rescale_store_3D[6] = {12.8214, 26.1256, 40.2795, 23.3692, 36.2061, 27.0385}; const double rescale_store_2D[6] = {13.2421, 28.7069, 52.0385, 24.4272, 34.9565, 21.1891}; @@ -267,7 +265,7 @@ inline void LocalIntensityScale::get_window_alt(float& var_rescale, std::vector< var_win.resize(6,0); - if ( (int) temp_img.y_num > win_val) { + if ( (int) temp_img.y > win_val) { active_y = true; var_win[0] = win_1[psf_ind]; @@ -276,7 +274,7 @@ inline void LocalIntensityScale::get_window_alt(float& var_rescale, std::vector< active_y = false; } - if ((int) temp_img.x_num > win_val) { + if ((int) temp_img.x > win_val) { active_x = true; var_win[1] = win_1[psf_ind]; var_win[4] = win_2[psf_ind]; @@ -284,7 +282,7 @@ inline void LocalIntensityScale::get_window_alt(float& var_rescale, std::vector< active_x = false; } - if ((int) temp_img.z_num > win_val) { + if ((int) temp_img.z > win_val) { active_z = true; var_win[2] = win_1[psf_ind]; var_win[5] = win_2[psf_ind]; diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 3ffda9a2..8ff4cd33 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -303,7 +303,6 @@ namespace { // Initialize GPU data structures to same values as CPU PixelData mGpuImage(input_image, true); - PixelData local_scale_temp_GPU(local_scale_temp, false); // Prepare parameters APRParameters par; @@ -340,7 +339,7 @@ namespace { // Calculate pipeline on GPU timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, maxLevel); + GpuProcessingTask gpt(mGpuImage, par, maxLevel); gpt.processOnGpu(); auto linearAccessGpu = gpt.getDataFromGpu(); giGpu.total_number_particles = linearAccessGpu.y_vec.size();
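
For reference, a minimal driver sketch for the multi-stream pipeline introduced in this series, condensed from Example_get_multiapr.cpp. Template arguments are not visible in the patch text above, so the instantiations used here (APRConverter<uint16_t>, PixelData<uint16_t>, VectorData<uint16_t>), the header paths and the file names are assumptions; only the shape of the get_apr_cuda_multistreams() call (vectors of raw APR / image / intensity pointers plus a stream count) follows the hunks.

    // A minimal sketch (not taken verbatim from the series) of driving the
    // multi-stream CUDA pipeline, condensed from Example_get_multiapr.cpp.
    // Instantiations, header paths and file names are assumptions; only the
    // shape of the get_apr_cuda_multistreams() call follows the hunks above.
    #include <iostream>
    #include <memory>
    #include <string>
    #include <vector>
    #include "algorithm/APRConverter.hpp"
    #include "data_structures/APR/APR.hpp"
    #include "io/TiffUtils.hpp"

    int main() {
        using ImgType = uint16_t;
        std::vector<std::string> files = {"a.tif", "b.tif", "c.tif"};   // illustrative names

        std::vector<std::unique_ptr<PixelData<ImgType>>> images;
        std::vector<PixelData<ImgType>*> images_raw;
        std::vector<std::unique_ptr<APR>> aprs;
        std::vector<APR*> aprs_raw;
        std::vector<std::unique_ptr<VectorData<ImgType>>> parts;
        std::vector<VectorData<ImgType>*> parts_raw;

        // All input images must share the same dimensions, as checked by the example.
        for (const auto &f : files) {
            images.push_back(std::make_unique<PixelData<ImgType>>(TiffUtils::getMesh<ImgType>(f)));
            images_raw.push_back(images.back().get());
            aprs.push_back(std::make_unique<APR>());
            aprs_raw.push_back(aprs.back().get());
            parts.push_back(std::make_unique<VectorData<ImgType>>());
            parts_raw.push_back(parts.back().get());
        }

        APRConverter<ImgType> converter;
        converter.par.lambda = 3.0;          // illustrative parameter value

        // One GpuProcessingTask (and CUDA stream) is created per stream slot; a
        // nullptr intensity slot skips copying the sampled particles back to the host.
        if (!converter.get_apr_cuda_multistreams(aprs_raw, images_raw, parts_raw, /*numOfStreams=*/3)) {
            return 1;
        }

        for (size_t i = 0; i < aprs.size(); ++i)
            std::cout << aprs[i]->total_number_particles() << " particles for image " << i << "\n";
        return 0;
    }

The single-image path is unchanged for callers: after the last patch, get_apr_cuda() wraps the same call with one-element vectors and numOfStreams = 1, and GpuProcessingTask no longer takes a pre-allocated levels buffer, since it now sizes its downsampled scratch memory from image.getDimensionDS().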