AdaptiveParticles · krzysg · Aug 1, 2022 · Aug 1, 2022 · Aug 1, 2022 · Aug 2, 2022
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -6,7 +6,7 @@ project(APR DESCRIPTION "Adaptive Particle Representation library")
 
 message(STATUS "CMAKE VERSION ${CMAKE_VERSION}")
 
-set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 if(POLICY CMP0135)
@@ -171,17 +171,17 @@ if(WIN32)
 
 
 else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 ")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 ")  # NOTE(review): repeats CMAKE_CXX_STANDARD 17 set near the top of this file
 
     if(CMAKE_COMPILER_IS_GNUCC)
-        set(CMAKE_CXX_FLAGS_RELEASE "-O4 -ffast-math")
+        set(CMAKE_CXX_FLAGS_RELEASE "-O4")
         set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g  -Wall -pedantic")
         set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Bdynamic")
         if(NOT WIN32)
             set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ldl -lz")
         endif()
     elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-        set(CMAKE_CXX_FLAGS_RELEASE "-O3 -ffast-math")
+        set(CMAKE_CXX_FLAGS_RELEASE "-O3")
         set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g  -Wall -pedantic")
         set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lz")
     endif()
@@ -209,15 +209,29 @@ set_property(TARGET aprObjLib PROPERTY POSITION_INDEPENDENT_CODE ON)
 
 if(APR_USE_CUDA)
     message(STATUS "APR: Building CUDA for APR")
-    set(CMAKE_CUDA_STANDARD 14)
+    # A specific nvcc can be selected at configure time with -DCMAKE_CUDA_COMPILER=<path to nvcc>
+    set(CMAKE_CUDA_STANDARD 17)
+    if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")  # the 'native' architecture value needs CMake 3.24+
+        set(CMAKE_CUDA_ARCHITECTURES native)
+    endif()
+    set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+    set(CMAKE_POSITION_INDEPENDENT_CODE ON)
     set(CMAKE_CUDA_RUNTIME_LIBRARY "Static")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --default-stream per-thread -Xptxas -v -DAPR_USE_CUDA")
-    set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math") # -lineinfo for profiling
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=false --default-stream per-thread -Wno-deprecated-gpu-targets -Xptxas -v -DAPR_USE_CUDA")
+
+    set(CMAKE_CUDA_FLAGS_RELEASE "-O3") # -lineinfo for profiling
     set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g -G")
     if(APR_BENCHMARK)
         set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DAPR_BENCHMARK")
     endif()
     enable_language(CUDA)
+
+    if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER 13.0.0)
+        # CUDA releases newer than 13.0.0 changed the default ELF visibility and linkage of __global__ functions
+        # and device variables; switch those changes off to stay compatible with older CUDA versions
+        set(CMAKE_CUDA_FLAGS "--device-entity-has-hidden-visibility=false -static-global-template-stub=false  ${CMAKE_CUDA_FLAGS}")
+    endif ()
+
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DAPR_USE_CUDA")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DAPR_USE_CUDA")
     set(APR_CUDA_SOURCE_FILES
@@ -226,6 +240,7 @@ if(APR_USE_CUDA)
             src/algorithm/LocalIntensityScale.cu
             src/algorithm/OVPC.cu
             src/data_structures/APR/access/GPUAccess.cu
+            src/data_structures/APR/access/LinearAccessCuda.cu
             src/numerics/miscCuda.cu
             src/numerics/APRDownsampleGPU.cu
             src/numerics/PixelNumericsGPU.cu
@@ -241,7 +256,9 @@ if(APR_BUILD_STATIC_LIB)
     # generate static library used as a intermediate step in generating fat lib
     set(STATIC_TARGET_NAME staticLib)
     add_library(${STATIC_TARGET_NAME} STATIC $<TARGET_OBJECTS:aprObjLib> ${APR_CUDA_SOURCE_FILES})
-    target_compile_features(${STATIC_TARGET_NAME} PUBLIC cxx_std_14)
+    set_property(TARGET ${STATIC_TARGET_NAME} PROPERTY CUDA_SEPARABLE_COMPILATION ON)  # NOTE(review): reset to OFF by the set_target_properties() call a few lines below - confirm which value is intended
+    set_property(TARGET ${STATIC_TARGET_NAME} PROPERTY CUDA_ARCHITECTURES OFF)
+    target_compile_features(${STATIC_TARGET_NAME} PUBLIC cxx_std_17)
     set_target_properties(${STATIC_TARGET_NAME} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME})
     set_target_properties(${STATIC_TARGET_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION OFF)
     target_include_directories(${STATIC_TARGET_NAME} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src> $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}>)
@@ -262,7 +279,8 @@ if(APR_BUILD_SHARED_LIB)
 # generate fat shared library
     set(SHARED_TARGET_NAME sharedLib)
     add_library(${SHARED_TARGET_NAME} SHARED $<TARGET_OBJECTS:aprObjLib> ${APR_CUDA_SOURCE_FILES})
-
+    set_property(TARGET ${SHARED_TARGET_NAME} PROPERTY CUDA_ARCHITECTURES OFF)  # OFF: CMake adds no CUDA architecture flags for this target
+    set_property(TARGET ${SHARED_TARGET_NAME} PROPERTY CUDA_SEPARABLE_COMPILATION ON)  # same setting as the CMAKE_CUDA_SEPARABLE_COMPILATION default above
     target_include_directories(${SHARED_TARGET_NAME} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src> $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}>)
     set_target_properties(${SHARED_TARGET_NAME} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME})
     set_target_properties(${SHARED_TARGET_NAME} PROPERTIES LIBRARY_OUTPUT_NAME ${LIBRARY_NAME})

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -1,5 +1,6 @@
 macro(buildTarget TARGET)
     add_executable(${TARGET} ${TARGET}.cpp)
+    set_property(TARGET ${TARGET} PROPERTY CUDA_SEPARABLE_COMPILATION ON)  # set on every example built through this macro
     target_link_libraries(${TARGET} ${HDF5_LIBRARIES} ${TIFF_LIBRARIES} ${APR_BUILD_LIBRARY} Threads::Threads ${OPENMP_LINK})
 endmacro(buildTarget)
 
@@ -15,6 +16,11 @@ buildTarget(Example_apr_deconvolution)
 buildTarget(Example_random_access)
 buildTarget(Example_lazy_access)
 
+# GPU-only example, built only when APR_USE_CUDA is enabled
+if(APR_USE_CUDA)
+    buildTarget(Example_get_multiapr)
+endif()
+
 #buildTarget(Example_reconstruct_patch)  #The way this is working is going to be re-designed.
 buildTarget(Example_apr_tree)
 

diff --git a/examples/Example_get_apr.cpp b/examples/Example_get_apr.cpp
@@ -48,7 +48,7 @@ int runAPR(cmdLineOptions options) {
     //the apr datastructure
     APR apr;
 
-    APRConverter<float> aprConverter;
+    APRConverter<uint16_t> aprConverter;
 
     //read in the command line options into the parameters file
     aprConverter.par.Ip_th = options.Ip_th;

diff --git a/examples/Example_get_apr.h b/examples/Example_get_apr.h
@@ -30,7 +30,7 @@ struct cmdLineOptions{
     bool auto_parameters = false;
 
     float Ip_th = 0;
-    float lambda = -1;
+    float lambda = 3.0;
     float sigma_th = 0;
     float rel_error = 0.1;
     float grad_th = 1;

diff --git a/examples/Example_get_multiapr.cpp b/examples/Example_get_multiapr.cpp
@@ -0,0 +1,263 @@
+const char* usage = R"(
+Converts images to APR format: Takes input directory with uint16_t input tiff images and generates the APRs and saves it as hdf5.
+The hdf5 output of this program can be used with the other apr examples, and also viewed with HDFView.
+
+Usage:
+======
+Example_get_multiapr -d input_directory [-od output_directory]
+
+Additional settings (High Level):
+=================================
+-I_th       intensity_threshold (will ignore areas of image below this threshold, useful for removing camera artifacts or auto-fluorescence)
+-sigma_th   lower threshold for the local intensity scale
+-grad_th    ignore areas in the image where the gradient magnitude is lower than this value
+
+Advanced (Direct) Settings:
+===========================
+-lambda lambda_value (directly set the value of the gradient smoothing parameter lambda (reasonable range 0.1-10, default: 3))
+-rel_error rel_error_value (Reasonable ranges are from .08-.15), Default: 0.1
+-neighborhood_optimization_off turns off the neighborhood optimization (This results in boundary Particle Cells also being increased in resolution after the Pulling Scheme step)
+)"; // help text printed by printUsage()
+
+
+#include <iostream>
+#include <filesystem>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include "ConfigAPR.h"
+#include "io/APRFile.hpp"
+#include "data_structures/APR/particles/ParticleData.hpp"
+#include "data_structures/APR/APR.hpp"
+#include "algorithm/APRConverter.hpp"
+
+
+// Settings collected from the command line; the initializers are the defaults used when an option is absent.
+struct cmdLineOptions {
+    std::string directory;      // -d:  directory holding the input TIFF images (required)
+    std::string output_dir;     // -od: destination directory for the generated files
+
+    float lambda{3.0f};         // -lambda
+    float Ip_th{0.0f};          // -I_th
+    float grad_th{1.0f};        // -grad_th
+    float sigma_th{0.0f};       // -sigma_th
+    float rel_error{0.1f};      // -rel_error
+    bool neighborhood_optimization{true};   // cleared by -neighborhood_optimization_off
+};
+
+// True when some argument in [begin, end) compares equal to `option`.
+bool command_option_exists(const char **begin, const char **end, const std::string &option) {
+    return std::any_of(begin, end, [&option](const char *arg) { return option == arg; });
+}
+
+// Returns the argument that follows `option` in [begin, end), or nullptr when the option is absent or last.
+const char* get_command_option(const char **begin, const char **end, const std::string &option)
+{
+    const char **match = std::find(begin, end, option);
+    const bool has_value = match != end && match + 1 != end;
+    return has_value ? match[1] : nullptr;
+}
+
+// Prints the library version followed by the usage text to stderr, then terminates with exit code 1.
+void printUsage() {
+    std::cerr << "APR version " << ConfigAPR::APR_VERSION << std::endl << usage << std::endl;
+    exit(1);
+}
+
+// Parses the command line into cmdLineOptions. Prints the usage text (exit code 1) when no arguments are given
+// and exits with code 2 when -d is missing or when an option that takes a value is the last argument.
+cmdLineOptions read_command_line_options(const int argc, const char **argv) {
+    cmdLineOptions options;
+    const char **args_end = argv + argc;
+
+    // --------- Print usage if no args provided
+    if (argc == 1) printUsage();
+
+    // Value following `name`, or nullptr when `name` is absent. A trailing `name` without a value is rejected
+    // here: get_command_option() returns nullptr for it and std::string(nullptr) is undefined behaviour.
+    const auto value_of = [&](const char *name) -> const char * {
+        const char *value = get_command_option(argv, args_end, name);
+        if (value == nullptr && command_option_exists(argv, args_end, name)) {
+            std::cerr << "Option " << name << " requires a value" << std::endl;
+            exit(2);
+        }
+        return value;
+    };
+    // Overwrites `target` only when the option is present, so defaults from cmdLineOptions are kept otherwise
+    const auto read_float = [&](const char *name, float &target) {
+        if (const char *value = value_of(name)) target = std::stof(value);
+    };
+
+    // --------- Read params
+
+    // Input Directory
+    if (const char *input_dir = value_of("-d")) {
+        options.directory = input_dir;
+    } else {
+        std::cout << "Input directory required" << std::endl;
+        exit(2);
+    }
+
+    // Output Directory (defaults to the input directory)
+    const char *output_dir = value_of("-od");
+    options.output_dir = (output_dir != nullptr) ? std::string(output_dir) : options.directory;
+
+    read_float("-lambda", options.lambda);
+    read_float("-I_th", options.Ip_th);
+    read_float("-grad_th", options.grad_th);
+    read_float("-sigma_th", options.sigma_th);
+    read_float("-rel_error", options.rel_error);
+
+    if (command_option_exists(argv, args_end, "-neighborhood_optimization_off")) {
+        options.neighborhood_optimization = false;
+    }
+
+    return options;
+}
+
+
+// Collects regular files with a .tif/.tiff/.TIF/.TIFF extension directly inside directory_path (non-recursive);
+// on a filesystem error the message is printed to stderr and the process exits with code 2.
+auto getTiffFilesFromDir(const std::string &directory_path) {
+    namespace fs = std::filesystem;
+    const auto hasTiffExtension = [](const fs::path &file) {
+        const std::string suffix = file.extension().string();
+        return suffix == ".tif" || suffix == ".tiff" || suffix == ".TIF" || suffix == ".TIFF";
+    };
+
+    std::vector<fs::path> found;
+    try {
+        for (const fs::directory_entry &item : fs::directory_iterator(directory_path)) {
+            if (!item.is_regular_file() || !hasTiffExtension(item.path())) continue;
+            found.push_back(item.path());
+        }
+    } catch (const fs::filesystem_error &err) {
+        std::cerr << "Filesystem error: " << err.what() << '\n';
+        exit(2);
+    }
+    return found;
+}
+
+// Converts every TIFF image in options.directory to an APR via APRConverter::get_apr_cuda_multistreams and writes
+// "<input stem>.apr" into options.output_dir. Returns 0 on success, 1 on pipeline failure, 2 on unusable input.
+int runAPR(const cmdLineOptions &options) {
+    using ImgType = uint16_t;
+    using ImgContainer = PixelData<ImgType>;
+    APRConverter<ImgType> aprConverter;
+
+    // read in the command line options into the parameters file
+    aprConverter.par.input_dir = options.directory;
+    aprConverter.par.output_dir = options.output_dir;
+    aprConverter.par.lambda = options.lambda;
+    aprConverter.par.Ip_th = options.Ip_th;
+    aprConverter.par.grad_th = options.grad_th;
+    aprConverter.par.sigma_th = options.sigma_th;
+    aprConverter.par.rel_error = options.rel_error;
+    aprConverter.par.neighborhood_optimization = options.neighborhood_optimization;
+
+    const auto tifFiles = getTiffFilesFromDir(options.directory);
+    if (tifFiles.empty()) {
+        std::cerr << "No TIFF images found in " << options.directory << std::endl;
+        return 2; // nothing to convert, so do not start the GPU pipeline with empty input
+    }
+
+    // Owning containers, each paired with a vector of raw pointers (the form passed to APRConverter)
+    std::vector<std::unique_ptr<ImgContainer>> input_images;
+    std::vector<ImgContainer *> input_images_raw;
+    std::vector<std::unique_ptr<APR>> APRs;
+    std::vector<APR*> APRs_raw;
+    std::vector<std::unique_ptr<VectorData<ImgType>>> partIntensities;
+    std::vector<VectorData<ImgType> *> partIntensities_raw;
+
+    // Load all images, check that they share one dimension, create the APR/intensity objects filled in later
+    bool firstOne = true;
+    PixelDataDim sizeOfInput;
+    for (const auto &file : tifFiles) {
+        // Read a file and store it, also keep a vector of raw pointers to read images since this is needed by APRConverter
+        input_images.push_back(std::make_unique<ImgContainer>(TiffUtils::getMesh<ImgType>(file)));
+        input_images_raw.push_back(input_images.back().get());
+        if (firstOne) {
+            firstOne = false;
+            sizeOfInput = input_images.back()->getDimension();
+        }
+        else if (input_images.back()->getDimension() != sizeOfInput) {
+            std::cerr << "Input images must have the same dimension." << std::endl;
+            return 2;
+        }
+
+        // We need as many APR objects as input images, and also raw pointer for APRConverter
+        APRs.push_back(std::make_unique<APR>());
+        APRs_raw.push_back(APRs.back().get());
+
+        // And same for particle intensities...
+        partIntensities.push_back(std::make_unique<VectorData<ImgType>>());
+        partIntensities_raw.push_back(partIntensities.back().get());
+    }
+
+    std::cout << std::endl;
+
+    APRTimer timer(true);
+    timer.start_timer("GPU pipeline (mem allocation, processing, sampling) ");
+    if (!aprConverter.get_apr_cuda_multistreams(APRs_raw, input_images_raw, partIntensities_raw)) {
+        std::cout << "Oops, something went wrong. APR not computed :(" << std::endl;
+        return 1; // report the failure through the exit status as well
+    }
+    timer.stop_timer();
+    std::cout << std::endl;
+
+    const size_t numOfImages = input_images_raw.size();
+    const auto outputDir = std::filesystem::path(options.output_dir);
+    for (size_t i = 0; i < numOfImages; i++) {
+        std::cout << "Postprocessing " << i+1 << "/" << numOfImages << " image...\n";
+        auto &apr = *APRs[i]; // APR currently being processed
+        auto &particle_intensities = *partIntensities[i]; // intensities sampled for current APR
+
+        // ------------ TODO: remove me later, this is quick test for Cpu vs Gpu before real test is written
+        ParticleData<ImgType> particle_intensities_cpu;
+        particle_intensities_cpu.sample_image(apr, *input_images[i]); // sample the particles on the CPU
+        const size_t numCpu = particle_intensities_cpu.data.size();
+        const size_t numGpu = particle_intensities.size();
+        if (numCpu != numGpu) std::cerr << "CPU vs GPU number of particles differ!" << std::endl;
+        // Compare only the common range so that a size mismatch cannot cause out-of-bounds reads
+        for (size_t j = 0; j < std::min(numCpu, numGpu); ++j) {
+            if (particle_intensities_cpu[j] != particle_intensities[j]) {
+                std::cout << "Mismatch at " << j << " CPU: " << particle_intensities_cpu[j] << " GPU: " << particle_intensities[j] << std::endl;
+            }
+        }
+        // ---------------------------------------------------------------------------------------------------
+
+        // Output name is like base of input filename + extension ".apr"
+        const std::string outputFileName = tifFiles[i].stem().string() + ".apr";
+
+        //write the APR to hdf5 file
+        timer.start_timer("writing output");
+        APRFile aprFile;
+        aprFile.open(outputDir / outputFileName);
+        aprFile.write_apr(apr, 0, "t", false);
+        ParticleData<ImgType> pd;
+        pd.data = std::move(particle_intensities);
+        aprFile.write_particles("particles",pd);
+        timer.stop_timer();
+
+        // Print some output statistics
+        const float aprImageSizeInMB = aprFile.current_file_size_MB();
+        const double numVoxels = static_cast<double>(apr.org_dims(0)) * apr.org_dims(1) * apr.org_dims(2); // double math, no integer overflow
+        const double originalImageSizeInMB = sizeof(ImgType) * numVoxels / 1'000'000.0;
+        std::cout << "Computational Ratio (Pixels/Particles): " << apr.computational_ratio() << std::endl;
+        std::cout << "Original / APR image size:              " << originalImageSizeInMB << " / " << aprImageSizeInMB <<" MB" << std::endl;
+        std::cout << "Lossy Compression Ratio:                " << originalImageSizeInMB/aprImageSizeInMB << std::endl;
+        std::cout << std::endl;
+    }
+
+    std::cout << "DONE!\n";
+    return 0;
+}
+
+
+// Entry point: parses the command line options and runs the conversion.
+// The process exit status is the value returned by runAPR.
+int main(const int argc, const char **argv) {
+    const cmdLineOptions parsedOptions = read_command_line_options(argc, argv);
+    return runAPR(parsedOptions);
+}