From 2deff2b01ebbdcfea566e8c2c02b4340c55039a2 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Thu, 28 Nov 2024 16:00:06 -0500
Subject: [PATCH 01/39] move place_delay files to a directory

---
 vpr/src/place/{ => timing/delay_model}/place_delay_model.cpp | 0
 vpr/src/place/{ => timing/delay_model}/place_delay_model.h   | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename vpr/src/place/{ => timing/delay_model}/place_delay_model.cpp (100%)
 rename vpr/src/place/{ => timing/delay_model}/place_delay_model.h (100%)
diff --git a/vpr/src/place/place_delay_model.cpp b/vpr/src/place/timing/delay_model/place_delay_model.cpp
similarity index 100%
rename from vpr/src/place/place_delay_model.cpp
rename to vpr/src/place/timing/delay_model/place_delay_model.cpp
diff --git a/vpr/src/place/place_delay_model.h b/vpr/src/place/timing/delay_model/place_delay_model.h
similarity index 100%
rename from vpr/src/place/place_delay_model.h
rename to vpr/src/place/timing/delay_model/place_delay_model.h

From be8519b26ad6a27af7fc2d6e33d4a6dc5b290b7b Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Thu, 28 Nov 2024 16:00:35 -0500
Subject: [PATCH 02/39] remove unused structs from timing_place_lookup.cpp

---
 vpr/src/place/timing_place_lookup.cpp | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/vpr/src/place/timing_place_lookup.cpp b/vpr/src/place/timing_place_lookup.cpp
index 86dc396e2b8..fa6a9acb0bb 100644
--- a/vpr/src/place/timing_place_lookup.cpp
+++ b/vpr/src/place/timing_place_lookup.cpp
@@ -40,22 +40,6 @@ constexpr float UNINITIALIZED_DELTA = -1;                                  //Ind
 constexpr float EMPTY_DELTA = -2;                                          //Indicates delta delay from/to an EMPTY block
 constexpr float IMPOSSIBLE_DELTA = std::numeric_limits<float>::infinity(); //Indicates there is no valid delta delay
 
-struct t_profile_loc {
-    t_profile_loc(int x, int y, std::vector<vtr::Point<int>> delta_values)
-        : root(x, y)
-        , deltas(delta_values) {}
-
-    vtr::Point<int> root;
-    std::vector<vtr::Point<int>> deltas;
-};
-
-struct t_profile_info {
-    std::vector<t_profile_loc> locations;
-
-    int max_delta_x;
-    int max_delta_y;
-};
-
 /*** Function Prototypes *****/
 static t_chan_width setup_chan_width(const t_router_opts& router_opts,
                                      t_chan_width_dist chan_width_dist);

From 47dec5c35aa48be5a5234860b0f40809338cc3b7 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Thu, 28 Nov 2024 16:17:19 -0500
Subject: [PATCH 03/39] create two files for each placement delay model

---
 .../timing/delay_model/delta_delay_model.cpp  |  34 ++
 .../timing/delay_model/delta_delay_model.h    |  47 +++
 .../delay_model/override_delay_model.cpp      | 262 +++++++++++++++
 .../timing/delay_model/override_delay_model.h | 112 +++++++
 .../timing/delay_model/place_delay_model.cpp  | 309 +-----------------
 .../timing/delay_model/place_delay_model.h    | 180 +---------
 .../timing/delay_model/simple_delay_model.cpp |  45 +++
 .../timing/delay_model/simple_delay_model.h   |  39 +++
 vpr/src/place/timing_place_lookup.cpp         |  56 +---
 9 files changed, 554 insertions(+), 530 deletions(-)
 create mode 100644 vpr/src/place/timing/delay_model/delta_delay_model.cpp
 create mode 100644 vpr/src/place/timing/delay_model/delta_delay_model.h
 create mode 100644 vpr/src/place/timing/delay_model/override_delay_model.cpp
 create mode 100644 vpr/src/place/timing/delay_model/override_delay_model.h
 create mode 100644 vpr/src/place/timing/delay_model/simple_delay_model.cpp
 create mode 100644 vpr/src/place/timing/delay_model/simple_delay_model.h

diff --git a/vpr/src/place/timing/delay_model/delta_delay_model.cpp b/vpr/src/place/timing/delay_model/delta_delay_model.cpp
new file mode 100644
index 00000000000..55bb0104316
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/delta_delay_model.cpp
@@ -0,0 +1,34 @@
+
+#include "delta_delay_model.h"
+
+float DeltaDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pin*/,
+                             const t_physical_tile_loc& to_loc, int /*to_pin*/) const {
+    int delta_x = std::abs(from_loc.x - to_loc.x);
+    int delta_y = std::abs(from_loc.y - to_loc.y);
+
+    return delays_[from_loc.layer_num][to_loc.layer_num][delta_x][delta_y];
+}
+
+void DeltaDelayModel::dump_echo(std::string filepath) const {
+    FILE* f = vtr::fopen(filepath.c_str(), "w");
+    fprintf(f, "         ");
+    for (size_t from_layer_num = 0; from_layer_num < delays_.dim_size(0); ++from_layer_num) {
+        for (size_t to_layer_num = 0; to_layer_num < delays_.dim_size(1); ++to_layer_num) {
+            fprintf(f, " %9zu", from_layer_num);
+            fprintf(f, "\n");
+            for (size_t dx = 0; dx < delays_.dim_size(2); ++dx) {
+                fprintf(f, " %9zu", dx);
+            }
+            fprintf(f, "\n");
+            for (size_t dy = 0; dy < delays_.dim_size(3); ++dy) {
+                fprintf(f, "%9zu", dy);
+                for (size_t dx = 0; dx < delays_.dim_size(2); ++dx) {
+                    fprintf(f, " %9.2e", delays_[from_layer_num][to_layer_num][dx][dy]);
+                }
+                fprintf(f, "\n");
+            }
+        }
+    }
+    vtr::fclose(f);
+}
+
diff --git a/vpr/src/place/timing/delay_model/delta_delay_model.h b/vpr/src/place/timing/delay_model/delta_delay_model.h
new file mode 100644
index 00000000000..c3ae0d83cf7
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/delta_delay_model.h
@@ -0,0 +1,47 @@
+
+#pragma once
+
+#include "place_delay_model.h"
+
+/**
+ * @class DeltaDelayModel
+ *
+ * @brief A simple delay model based on the distance (delta) between block locations.
+ */
+class DeltaDelayModel : public PlaceDelayModel {
+  public:
+    DeltaDelayModel(float min_cross_layer_delay,
+                    bool is_flat)
+        : cross_layer_delay_(min_cross_layer_delay)
+        , is_flat_(is_flat) {}
+
+    DeltaDelayModel(float min_cross_layer_delay,
+                    vtr::NdMatrix<float, 4> delta_delays,
+                    bool is_flat)
+        : delays_(std::move(delta_delays))
+        , cross_layer_delay_(min_cross_layer_delay)
+        , is_flat_(is_flat) {}
+
+    void compute(RouterDelayProfiler& router,
+                 const t_placer_opts& placer_opts,
+                 const t_router_opts& router_opts,
+                 int longest_length) override;
+
+    float delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const override;
+
+    void dump_echo(std::string filepath) const override;
+
+    void read(const std::string& file) override;
+    void write(const std::string& file) const override;
+
+    const vtr::NdMatrix<float, 4>& delays() const {
+        return delays_;
+    }
+
+  private:
+    vtr::NdMatrix<float, 4> delays_; // [from_layer][to_layer][delta_x][delta_y]
+    float cross_layer_delay_;
+
+    /// Indicates whether the router is a two-stage or run-flat
+    bool is_flat_;
+};
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/override_delay_model.cpp b/vpr/src/place/timing/delay_model/override_delay_model.cpp
new file mode 100644
index 00000000000..ceb8245511b
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/override_delay_model.cpp
@@ -0,0 +1,262 @@
+
+#include "override_delay_model.h"
+
+#ifdef VTR_ENABLE_CAPNPROTO
+#    include "capnp/serialize.h"
+#    include "place_delay_model.capnp.h"
+#    include "ndmatrix_serdes.h"
+#    include "mmap_file.h"
+#    include "serdes_utils.h"
+#endif  // VTR_ENABLE_CAPNPROTO
+
+const DeltaDelayModel* OverrideDelayModel::base_delay_model() const {
+    return base_delay_model_.get();
+}
+
+float OverrideDelayModel::delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const {
+    // First check whether an override delay value exists for this connection
+    const auto& device_ctx = g_vpr_ctx.device();
+    const auto& grid = device_ctx.grid;
+
+    t_physical_tile_type_ptr from_type_ptr = grid.get_physical_type(from_loc);
+    t_physical_tile_type_ptr to_type_ptr = grid.get_physical_type(to_loc);
+
+    t_override override_key;
+    override_key.from_type = from_type_ptr->index;
+    override_key.from_class = from_type_ptr->pin_class[from_pin];
+    override_key.to_type = to_type_ptr->index;
+    override_key.to_class = to_type_ptr->pin_class[to_pin];
+
+    //Delay overrides may be different for +/- delta so do not use
+    //an absolute delta for the look-up
+    override_key.delta_x = to_loc.x - from_loc.x;
+    override_key.delta_y = to_loc.y - from_loc.y;
+
+    float delay_val = std::numeric_limits<float>::quiet_NaN();
+    auto override_iter = delay_overrides_.find(override_key);
+    if (override_iter != delay_overrides_.end()) {
+        //Found an override
+        delay_val = override_iter->second;
+    } else {
+        //Fall back to the base delay model if no override was found
+        delay_val = base_delay_model_->delay(from_loc, from_pin, to_loc, to_pin);
+    }
+
+    return delay_val;
+}
+
+void OverrideDelayModel::set_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y, float delay_val) {
+    t_override override_key;
+    override_key.from_type = from_type;
+    override_key.from_class = from_class;
+    override_key.to_type = to_type;
+    override_key.to_class = to_class;
+    override_key.delta_x = delta_x;
+    override_key.delta_y = delta_y;
+
+    auto res = delay_overrides_.insert(std::make_pair(override_key, delay_val));
+    if (!res.second) {                 //Key already exists
+        res.first->second = delay_val; //Overwrite existing delay
+    }
+}
+
+void OverrideDelayModel::dump_echo(std::string filepath) const {
+    base_delay_model_->dump_echo(filepath);
+
+    FILE* f = vtr::fopen(filepath.c_str(), "a");
+
+    fprintf(f, "\n");
+    fprintf(f, "# Delay Overrides\n");
+    auto& device_ctx = g_vpr_ctx.device();
+    for (auto kv : delay_overrides_) {
+        auto override_key = kv.first;
+        float delay_val = kv.second;
+        fprintf(f, "from_type: %s to_type: %s from_pin_class: %d to_pin_class: %d delta_x: %d delta_y: %d -> delay: %g\n",
+                device_ctx.physical_tile_types[override_key.from_type].name.c_str(),
+                device_ctx.physical_tile_types[override_key.to_type].name.c_str(),
+                override_key.from_class,
+                override_key.to_class,
+                override_key.delta_x,
+                override_key.delta_y,
+                delay_val);
+    }
+
+    vtr::fclose(f);
+}
+
+float OverrideDelayModel::get_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y) const {
+    t_override key;
+    key.from_type = from_type;
+    key.from_class = from_class;
+    key.to_type = to_type;
+    key.to_class = to_class;
+    key.delta_x = delta_x;
+    key.delta_y = delta_y;
+
+    auto iter = delay_overrides_.find(key);
+    if (iter == delay_overrides_.end()) {
+        VPR_THROW(VPR_ERROR_PLACE, "Key not found.");
+    }
+    return iter->second;
+}
+
+void OverrideDelayModel::set_base_delay_model(std::unique_ptr<DeltaDelayModel> base_delay_model_obj) {
+    base_delay_model_ = std::move(base_delay_model_obj);
+}
+
+/**
+ * Serialization below targets capnproto. The code must still compile when
+ * VTR_ENABLE_CAPNPROTO=OFF, so in that case read()/write() are stubs that only call VPR_THROW.
+ */
+#ifndef VTR_ENABLE_CAPNPROTO
+
+#    define DISABLE_ERROR                                \
+        "is disabled because VTR_ENABLE_CAPNPROTO=OFF. " \
+        "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable."
+
+void DeltaDelayModel::read(const std::string& /*file*/) {
+    VPR_THROW(VPR_ERROR_PLACE, "DeltaDelayModel::read " DISABLE_ERROR);
+}
+
+void DeltaDelayModel::write(const std::string& /*file*/) const {
+    VPR_THROW(VPR_ERROR_PLACE, "DeltaDelayModel::write " DISABLE_ERROR);
+}
+
+void OverrideDelayModel::read(const std::string& /*file*/) {
+    VPR_THROW(VPR_ERROR_PLACE, "OverrideDelayModel::read " DISABLE_ERROR);
+}
+
+void OverrideDelayModel::write(const std::string& /*file*/) const {
+    VPR_THROW(VPR_ERROR_PLACE, "OverrideDelayModel::write " DISABLE_ERROR);
+}
+
+#else /* VTR_ENABLE_CAPNPROTO */
+
+static void ToFloat(float* out, const VprFloatEntry::Reader& in) {
+    // Getting a scalar field is always "get<field name>()".
+    *out = in.getValue();
+}
+
+static void FromFloat(VprFloatEntry::Builder* out, const float& in) {
+    // Setting a scalar field is always "set<field name>(value)".
+    out->setValue(in);
+}
+
+void DeltaDelayModel::read(const std::string& file) {
+    // MmapFile object creates an mmap of the specified path, and will munmap
+    // when the object leaves scope.
+    MmapFile f(file);
+
+    /* Increase reader limit to 1G words to allow for large files. */
+    ::capnp::ReaderOptions opts = default_large_capnp_opts();
+
+    // FlatArrayMessageReader is used to read the message from the data array
+    // provided by MmapFile.
+    ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
+
+    // When reading capnproto files the Reader object to use is named
+    // <schema name>::Reader.
+    //
+    // Initially this object is an empty VprDeltaDelayModel.
+    VprDeltaDelayModel::Reader model;
+
+    // The reader.getRoot performs a cast from the generic capnproto to fit
+    // with the specified schema.
+    //
+    // Note that capnproto does not validate that the incoming data matches the
+    // schema.  If this property is required, some form of check would be
+    // required.
+    model = reader.getRoot<VprDeltaDelayModel>();
+
+    // ToNdMatrix is a generic function for converting a Matrix capnproto
+    // to a vtr::NdMatrix.
+    //
+    // The user must supply the matrix dimension (4 in this case), the source
+    // capnproto type (VprFloatEntry),
+    // target C++ type (float), and a function to convert from the source capnproto
+    // type to the target C++ type (ToFloat).
+    //
+    // The second argument should be of type Matrix<X>::Reader where X is the
+    // capnproto element type.
+    ToNdMatrix<4, VprFloatEntry, float>(&delays_, model.getDelays(), ToFloat);
+}
+
+void DeltaDelayModel::write(const std::string& file) const {
+    // MallocMessageBuilder object is the generic capnproto message builder,
+    // using malloc for buffer allocation.
+    ::capnp::MallocMessageBuilder builder;
+
+    // initRoot<X> returns a X::Builder object that can be used to set the
+    // fields in the message.
+    auto model = builder.initRoot<VprDeltaDelayModel>();
+
+    // FromNdMatrix is a generic function for converting a vtr::NdMatrix to a
+    // Matrix message.  It is the mirror function of ToNdMatrix described in
+    // read above.
+    auto delay_values = model.getDelays();
+    FromNdMatrix<4, VprFloatEntry, float>(&delay_values, delays_, FromFloat);
+
+    // writeMessageToFile writes message to the specified file.
+    writeMessageToFile(file, &builder);
+}
+
+void OverrideDelayModel::read(const std::string& file) {
+    MmapFile f(file);
+
+    /* Increase reader limit to 1G words to allow for large files. */
+    ::capnp::ReaderOptions opts = default_large_capnp_opts();
+    ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
+
+    vtr::NdMatrix<float, 4> delays;
+    auto model = reader.getRoot<VprOverrideDelayModel>();
+    ToNdMatrix<4, VprFloatEntry, float>(&delays, model.getDelays(), ToFloat);
+    // `delays` is not used again below, so move it into the base model instead of copying it.
+    base_delay_model_ = std::make_unique<DeltaDelayModel>(cross_layer_delay_, std::move(delays), is_flat_);
+
+    // Reading non-scalar capnproto fields is roughly equivalent to using
+    // a std::vector of the field type.  Actual type is capnp::List<X>::Reader.
+    auto overrides = model.getDelayOverrides();
+    std::vector<std::pair<t_override, float> > overrides_arr(overrides.size());
+    for (size_t i = 0; i < overrides.size(); ++i) {
+        const auto& elem = overrides[i];
+        overrides_arr[i].first.from_type = elem.getFromType();
+        overrides_arr[i].first.to_type = elem.getToType();
+        overrides_arr[i].first.from_class = elem.getFromClass();
+        overrides_arr[i].first.to_class = elem.getToClass();
+        overrides_arr[i].first.delta_x = elem.getDeltaX();
+        overrides_arr[i].first.delta_y = elem.getDeltaY();
+
+        overrides_arr[i].second = elem.getDelay();
+    }
+
+    delay_overrides_ = vtr::make_flat_map2(std::move(overrides_arr));
+}
+
+void OverrideDelayModel::write(const std::string& file) const {
+    ::capnp::MallocMessageBuilder builder;
+    auto model = builder.initRoot<VprOverrideDelayModel>();
+
+    auto delays = model.getDelays();
+    FromNdMatrix<4, VprFloatEntry, float>(&delays, base_delay_model_->delays(), FromFloat);
+
+    // Non-scalar capnproto fields should be first initialized with
+    // init<field  name>(count), and then accessed from the returned
+    // std::vector-like Builder object (specifically capnp::List<X>::Builder).
+    auto overrides = model.initDelayOverrides(delay_overrides_.size());
+    auto dst_iter = overrides.begin();
+    for (const auto& src : delay_overrides_) {
+        auto elem = *dst_iter++;
+        elem.setFromType(src.first.from_type);
+        elem.setToType(src.first.to_type);
+        elem.setFromClass(src.first.from_class);
+        elem.setToClass(src.first.to_class);
+        elem.setDeltaX(src.first.delta_x);
+        elem.setDeltaY(src.first.delta_y);
+
+        elem.setDelay(src.second);
+    }
+
+    writeMessageToFile(file, &builder);
+}
+
+#endif
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/override_delay_model.h b/vpr/src/place/timing/delay_model/override_delay_model.h
new file mode 100644
index 00000000000..23f6d01d709
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/override_delay_model.h
@@ -0,0 +1,112 @@
+
+#pragma once
+
+#include "place_delay_model.h"
+#include "delta_delay_model.h"
+
+class OverrideDelayModel : public PlaceDelayModel {
+  public:
+    OverrideDelayModel(float min_cross_layer_delay,
+                       bool is_flat)
+        : cross_layer_delay_(min_cross_layer_delay)
+        , is_flat_(is_flat) {}
+
+    void compute(RouterDelayProfiler& route_profiler,
+                 const t_placer_opts& placer_opts,
+                 const t_router_opts& router_opts,
+                 int longest_length) override;
+
+    /**
+     * @brief Returns the delay from (from_loc, from_pin) to (to_loc, to_pin): the override value if one is
+     * registered for this connection, otherwise the delay reported by the base delay model.
+     */
+    float delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const override;
+
+    void dump_echo(std::string filepath) const override;
+
+    void read(const std::string& file) override;
+    void write(const std::string& file) const override;
+
+  public: // Accessors and mutators
+    void set_base_delay_model(std::unique_ptr<DeltaDelayModel> base_delay_model);
+    const DeltaDelayModel* base_delay_model() const;
+    float get_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y) const;
+    void set_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y, float delay);
+
+  private:
+    std::unique_ptr<DeltaDelayModel> base_delay_model_;
+    /// Minimum delay of cross-layer connections
+    float cross_layer_delay_;
+
+    /// Indicates whether the router is a two-stage or run-flat
+    bool is_flat_;
+
+    void compute_override_delay_model(RouterDelayProfiler& router,
+                                      const t_router_opts& router_opts);
+
+    /**
+     * @brief Structure that allows delays to be queried from the delay model.
+     *
+     * Delay is calculated given the origin physical tile, the origin
+     * pin, the destination physical tile, and the destination pin.
+     * This structure encapsulates all of this information.
+     *
+     *   @param from_type, to_type
+     *              Physical tile index (for easy array access)
+     *   @param from_class, to_class
+     *              The class that the pins belong to.
+     *   @param delta_x, delta_y
+     *              The horizontal and vertical displacement
+     *              between two physical tiles.
+     */
+    struct t_override {
+        short from_type;
+        short to_type;
+        short from_class;
+        short to_class;
+        short delta_x;
+        short delta_y;
+
+        /**
+         * @brief Comparison operator designed for performance.
+         *
+         * Operator< is important since t_override serves as the key into the
+         * map structure delay_overrides_. A default comparison operator would
+         * not be inlined by the compiler.
+         *
+         * A combination of ALWAYS_INLINE attribute and std::lexicographical_compare
+         * is required for operator< to be inlined by compiler. Proper inlining of
+         * the function reduces place time by around 5%.
+         *
+         * For more information: https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1225
+         */
+        friend ALWAYS_INLINE bool operator<(const t_override& lhs, const t_override& rhs) {
+            const short* left = reinterpret_cast<const short*>(&lhs);
+            const short* right = reinterpret_cast<const short*>(&rhs);
+            constexpr size_t NUM_T_OVERRIDE_MEMBERS = sizeof(t_override) / sizeof(short);
+            return std::lexicographical_compare(left, left + NUM_T_OVERRIDE_MEMBERS, right, right + NUM_T_OVERRIDE_MEMBERS);
+        }
+    };
+
+    /**
+     * @brief Map data structure that returns delay values according to
+     *        specific delay model queries.
+     *
+     * Delay model queries are provided by the t_override structure, which
+     * encapsulates the information regarding the origin and the destination.
+     */
+    vtr::flat_map2<t_override, float> delay_overrides_;
+
+    /**
+     * operator< treats memory layout of t_override as an array of short.
+     * This requires all members of t_override are shorts and there is no
+     * padding between members of t_override.
+     */
+    static_assert(sizeof(t_override) == sizeof(t_override::from_type) + sizeof(t_override::to_type) + sizeof(t_override::from_class) + sizeof(t_override::to_class) + sizeof(t_override::delta_x) + sizeof(t_override::delta_y), "Expect t_override to have a memory layout equivalent to an array of short (no padding)");
+    static_assert(sizeof(t_override::from_type) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::to_type) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::from_class) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::to_class) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::delta_x) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::delta_y) == sizeof(short), "Expect all t_override data members to be shorts");
+};
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/place_delay_model.cpp b/vpr/src/place/timing/delay_model/place_delay_model.cpp
index 4f626a5817f..a91547a7e5e 100644
--- a/vpr/src/place/timing/delay_model/place_delay_model.cpp
+++ b/vpr/src/place/timing/delay_model/place_delay_model.cpp
@@ -4,319 +4,16 @@
  *        routines related to the placer delay model.
  */
 
-#include <queue>
 #include "place_delay_model.h"
+
+#include <queue>
+
 #include "globals.h"
 #include "router_lookahead_map.h"
-#include "rr_graph2.h"
-
 #include "timing_place_lookup.h"
 #include "placer_state.h"
-
-#include "vtr_log.h"
-#include "vtr_math.h"
 #include "vpr_error.h"
 
-#ifdef VTR_ENABLE_CAPNPROTO
-#    include "capnp/serialize.h"
-#    include "place_delay_model.capnp.h"
-#    include "ndmatrix_serdes.h"
-#    include "mmap_file.h"
-#    include "serdes_utils.h"
-#endif /* VTR_ENABLE_CAPNPROTO */
-
-///@brief DeltaDelayModel methods.
-float DeltaDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const {
-    int delta_x = std::abs(from_loc.x - to_loc.x);
-    int delta_y = std::abs(from_loc.y - to_loc.y);
-
-    return delays_[from_loc.layer_num][to_loc.layer_num][delta_x][delta_y];
-}
-
-void DeltaDelayModel::dump_echo(std::string filepath) const {
-    FILE* f = vtr::fopen(filepath.c_str(), "w");
-    fprintf(f, "         ");
-    for (size_t from_layer_num = 0; from_layer_num < delays_.dim_size(0); ++from_layer_num) {
-        for (size_t to_layer_num = 0; to_layer_num < delays_.dim_size(1); ++to_layer_num) {
-            fprintf(f, " %9zu", from_layer_num);
-            fprintf(f, "\n");
-            for (size_t dx = 0; dx < delays_.dim_size(2); ++dx) {
-                fprintf(f, " %9zu", dx);
-            }
-            fprintf(f, "\n");
-            for (size_t dy = 0; dy < delays_.dim_size(3); ++dy) {
-                fprintf(f, "%9zu", dy);
-                for (size_t dx = 0; dx < delays_.dim_size(2); ++dx) {
-                    fprintf(f, " %9.2e", delays_[from_layer_num][to_layer_num][dx][dy]);
-                }
-                fprintf(f, "\n");
-            }
-        }
-    }
-    vtr::fclose(f);
-}
-
-const DeltaDelayModel* OverrideDelayModel::base_delay_model() const {
-    return base_delay_model_.get();
-}
-
-///@brief OverrideDelayModel methods.
-float OverrideDelayModel::delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const {
-    //First check to if there is an override delay value
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& grid = device_ctx.grid;
-
-    t_physical_tile_type_ptr from_type_ptr = grid.get_physical_type(from_loc);
-    t_physical_tile_type_ptr to_type_ptr = grid.get_physical_type(to_loc);
-
-    t_override override_key;
-    override_key.from_type = from_type_ptr->index;
-    override_key.from_class = from_type_ptr->pin_class[from_pin];
-    override_key.to_type = to_type_ptr->index;
-    override_key.to_class = to_type_ptr->pin_class[to_pin];
-
-    //Delay overrides may be different for +/- delta so do not use
-    //an absolute delta for the look-up
-    override_key.delta_x = to_loc.x - from_loc.x;
-    override_key.delta_y = to_loc.y - from_loc.y;
-
-    float delay_val = std::numeric_limits<float>::quiet_NaN();
-    auto override_iter = delay_overrides_.find(override_key);
-    if (override_iter != delay_overrides_.end()) {
-        //Found an override
-        delay_val = override_iter->second;
-    } else {
-        //Fall back to the base delay model if no override was found
-        delay_val = base_delay_model_->delay(from_loc, from_pin, to_loc, to_pin);
-    }
-
-    return delay_val;
-}
-
-void OverrideDelayModel::set_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y, float delay_val) {
-    t_override override_key;
-    override_key.from_type = from_type;
-    override_key.from_class = from_class;
-    override_key.to_type = to_type;
-    override_key.to_class = to_class;
-    override_key.delta_x = delta_x;
-    override_key.delta_y = delta_y;
-
-    auto res = delay_overrides_.insert(std::make_pair(override_key, delay_val));
-    if (!res.second) {                 //Key already exists
-        res.first->second = delay_val; //Overwrite existing delay
-    }
-}
-
-void OverrideDelayModel::dump_echo(std::string filepath) const {
-    base_delay_model_->dump_echo(filepath);
-
-    FILE* f = vtr::fopen(filepath.c_str(), "a");
-
-    fprintf(f, "\n");
-    fprintf(f, "# Delay Overrides\n");
-    auto& device_ctx = g_vpr_ctx.device();
-    for (auto kv : delay_overrides_) {
-        auto override_key = kv.first;
-        float delay_val = kv.second;
-        fprintf(f, "from_type: %s to_type: %s from_pin_class: %d to_pin_class: %d delta_x: %d delta_y: %d -> delay: %g\n",
-                device_ctx.physical_tile_types[override_key.from_type].name.c_str(),
-                device_ctx.physical_tile_types[override_key.to_type].name.c_str(),
-                override_key.from_class,
-                override_key.to_class,
-                override_key.delta_x,
-                override_key.delta_y,
-                delay_val);
-    }
-
-    vtr::fclose(f);
-}
-
-float OverrideDelayModel::get_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y) const {
-    t_override key;
-    key.from_type = from_type;
-    key.from_class = from_class;
-    key.to_type = to_type;
-    key.to_class = to_class;
-    key.delta_x = delta_x;
-    key.delta_y = delta_y;
-
-    auto iter = delay_overrides_.find(key);
-    if (iter == delay_overrides_.end()) {
-        VPR_THROW(VPR_ERROR_PLACE, "Key not found.");
-    }
-    return iter->second;
-}
-
-void OverrideDelayModel::set_base_delay_model(std::unique_ptr<DeltaDelayModel> base_delay_model_obj) {
-    base_delay_model_ = std::move(base_delay_model_obj);
-}
-
-float SimpleDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const {
-    int delta_x = std::abs(from_loc.x - to_loc.x);
-    int delta_y = std::abs(from_loc.y - to_loc.y);
-
-    int from_tile_idx = g_vpr_ctx.device().grid.get_physical_type(from_loc)->index;
-    return delays_[from_tile_idx][from_loc.layer_num][to_loc.layer_num][delta_x][delta_y];
-}
-
-/**
- * When writing capnp targetted serialization, always allow compilation when
- * VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception instead.
- */
-#ifndef VTR_ENABLE_CAPNPROTO
-
-#    define DISABLE_ERROR                              \
-        "is disable because VTR_ENABLE_CAPNPROTO=OFF." \
-        "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable."
-
-void DeltaDelayModel::read(const std::string& /*file*/) {
-    VPR_THROW(VPR_ERROR_PLACE, "DeltaDelayModel::read " DISABLE_ERROR);
-}
-
-void DeltaDelayModel::write(const std::string& /*file*/) const {
-    VPR_THROW(VPR_ERROR_PLACE, "DeltaDelayModel::write " DISABLE_ERROR);
-}
-
-void OverrideDelayModel::read(const std::string& /*file*/) {
-    VPR_THROW(VPR_ERROR_PLACE, "OverrideDelayModel::read " DISABLE_ERROR);
-}
-
-void OverrideDelayModel::write(const std::string& /*file*/) const {
-    VPR_THROW(VPR_ERROR_PLACE, "OverrideDelayModel::write " DISABLE_ERROR);
-}
-
-#else /* VTR_ENABLE_CAPNPROTO */
-
-static void ToFloat(float* out, const VprFloatEntry::Reader& in) {
-    // Getting a scalar field is always "get<field name>()".
-    *out = in.getValue();
-}
-
-static void FromFloat(VprFloatEntry::Builder* out, const float& in) {
-    // Setting a scalar field is always "set<field name>(value)".
-    out->setValue(in);
-}
-
-void DeltaDelayModel::read(const std::string& file) {
-    // MmapFile object creates an mmap of the specified path, and will munmap
-    // when the object leaves scope.
-    MmapFile f(file);
-
-    /* Increase reader limit to 1G words to allow for large files. */
-    ::capnp::ReaderOptions opts = default_large_capnp_opts();
-
-    // FlatArrayMessageReader is used to read the message from the data array
-    // provided by MmapFile.
-    ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
-
-    // When reading capnproto files the Reader object to use is named
-    // <schema name>::Reader.
-    //
-    // Initially this object is an empty VprDeltaDelayModel.
-    VprDeltaDelayModel::Reader model;
-
-    // The reader.getRoot performs a cast from the generic capnproto to fit
-    // with the specified schema.
-    //
-    // Note that capnproto does not validate that the incoming data matches the
-    // schema.  If this property is required, some form of check would be
-    // required.
-    model = reader.getRoot<VprDeltaDelayModel>();
-
-    // ToNdMatrix is a generic function for converting a Matrix capnproto
-    // to a vtr::NdMatrix.
-    //
-    // The use must supply the matrix dimension (2 in this case), the source
-    // capnproto type (VprFloatEntry),
-    // target C++ type (flat), and a function to convert from the source capnproto
-    // type to the target C++ type (ToFloat).
-    //
-    // The second argument should be of type Matrix<X>::Reader where X is the
-    // capnproto element type.
-    ToNdMatrix<4, VprFloatEntry, float>(&delays_, model.getDelays(), ToFloat);
-}
-
-void DeltaDelayModel::write(const std::string& file) const {
-    // MallocMessageBuilder object is the generate capnproto message builder,
-    // using malloc for buffer allocation.
-    ::capnp::MallocMessageBuilder builder;
-
-    // initRoot<X> returns a X::Builder object that can be used to set the
-    // fields in the message.
-    auto model = builder.initRoot<VprDeltaDelayModel>();
-
-    // FromNdMatrix is a generic function for converting a vtr::NdMatrix to a
-    // Matrix message.  It is the mirror function of ToNdMatrix described in
-    // read above.
-    auto delay_values = model.getDelays();
-    FromNdMatrix<4, VprFloatEntry, float>(&delay_values, delays_, FromFloat);
-
-    // writeMessageToFile writes message to the specified file.
-    writeMessageToFile(file, &builder);
-}
-
-void OverrideDelayModel::read(const std::string& file) {
-    MmapFile f(file);
-
-    /* Increase reader limit to 1G words to allow for large files. */
-    ::capnp::ReaderOptions opts = default_large_capnp_opts();
-    ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
-
-    vtr::NdMatrix<float, 4> delays;
-    auto model = reader.getRoot<VprOverrideDelayModel>();
-    ToNdMatrix<4, VprFloatEntry, float>(&delays, model.getDelays(), ToFloat);
-
-    base_delay_model_ = std::make_unique<DeltaDelayModel>(cross_layer_delay_, delays, is_flat_);
-
-    // Reading non-scalar capnproto fields is roughly equivilant to using
-    // a std::vector of the field type.  Actual type is capnp::List<X>::Reader.
-    auto overrides = model.getDelayOverrides();
-    std::vector<std::pair<t_override, float> > overrides_arr(overrides.size());
-    for (size_t i = 0; i < overrides.size(); ++i) {
-        const auto& elem = overrides[i];
-        overrides_arr[i].first.from_type = elem.getFromType();
-        overrides_arr[i].first.to_type = elem.getToType();
-        overrides_arr[i].first.from_class = elem.getFromClass();
-        overrides_arr[i].first.to_class = elem.getToClass();
-        overrides_arr[i].first.delta_x = elem.getDeltaX();
-        overrides_arr[i].first.delta_y = elem.getDeltaY();
-
-        overrides_arr[i].second = elem.getDelay();
-    }
-
-    delay_overrides_ = vtr::make_flat_map2(std::move(overrides_arr));
-}
-
-void OverrideDelayModel::write(const std::string& file) const {
-    ::capnp::MallocMessageBuilder builder;
-    auto model = builder.initRoot<VprOverrideDelayModel>();
-
-    auto delays = model.getDelays();
-    FromNdMatrix<4, VprFloatEntry, float>(&delays, base_delay_model_->delays(), FromFloat);
-
-    // Non-scalar capnproto fields should be first initialized with
-    // init<field  name>(count), and then accessed from the returned
-    // std::vector-like Builder object (specifically capnp::List<X>::Builder).
-    auto overrides = model.initDelayOverrides(delay_overrides_.size());
-    auto dst_iter = overrides.begin();
-    for (const auto& src : delay_overrides_) {
-        auto elem = *dst_iter++;
-        elem.setFromType(src.first.from_type);
-        elem.setToType(src.first.to_type);
-        elem.setFromClass(src.first.from_class);
-        elem.setToClass(src.first.to_class);
-        elem.setDeltaX(src.first.delta_x);
-        elem.setDeltaY(src.first.delta_y);
-
-        elem.setDelay(src.second);
-    }
-
-    writeMessageToFile(file, &builder);
-}
-
-#endif
-
 ///@brief Initialize the placer delay model.
 std::unique_ptr<PlaceDelayModel> alloc_lookups_and_delay_model(const Netlist<>& net_list,
                                                                t_chan_width_dist chan_width_dist,
diff --git a/vpr/src/place/timing/delay_model/place_delay_model.h b/vpr/src/place/timing/delay_model/place_delay_model.h
index 0aa01385e6e..e361f8cc197 100644
--- a/vpr/src/place/timing/delay_model/place_delay_model.h
+++ b/vpr/src/place/timing/delay_model/place_delay_model.h
@@ -5,6 +5,7 @@
  */
 
 #pragma once
+
 #include "vtr_ndmatrix.h"
 #include "vtr_flat_map.h"
 #include "vpr_types.h"
@@ -54,11 +55,10 @@ class PlaceDelayModel {
     virtual ~PlaceDelayModel() = default;
 
     ///@brief Computes place delay model.
-    virtual void compute(
-        RouterDelayProfiler& route_profiler,
-        const t_placer_opts& placer_opts,
-        const t_router_opts& router_opts,
-        int longest_length)
+    virtual void compute(RouterDelayProfiler& route_profiler,
+                         const t_placer_opts& placer_opts,
+                         const t_router_opts& router_opts,
+                         int longest_length)
         = 0;
 
     /**
@@ -86,175 +86,5 @@ class PlaceDelayModel {
     virtual void read(const std::string& file) = 0;
 };
 
-///@brief A simple delay model based on the distance (delta) between block locations.
-class DeltaDelayModel : public PlaceDelayModel {
-  public:
-    DeltaDelayModel(float min_cross_layer_delay,
-                    bool is_flat)
-        : cross_layer_delay_(min_cross_layer_delay)
-        , is_flat_(is_flat) {}
-    DeltaDelayModel(float min_cross_layer_delay,
-                    vtr::NdMatrix<float, 4> delta_delays,
-                    bool is_flat)
-        : delays_(std::move(delta_delays))
-        , cross_layer_delay_(min_cross_layer_delay)
-        , is_flat_(is_flat) {}
-
-    void compute(
-        RouterDelayProfiler& router,
-        const t_placer_opts& placer_opts,
-        const t_router_opts& router_opts,
-        int longest_length) override;
-    float delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const override;
-    void dump_echo(std::string filepath) const override;
-
-    void read(const std::string& file) override;
-    void write(const std::string& file) const override;
-    const vtr::NdMatrix<float, 4>& delays() const {
-        return delays_;
-    }
-
-  private:
-    vtr::NdMatrix<float, 4> delays_; // [0..num_layers-1][0..max_dx][0..max_dy]
-    float cross_layer_delay_;
-    /**
-     * @brief Indicates whether the router is a two-stage or run-flat
-     */
-    bool is_flat_;
-};
-
-class OverrideDelayModel : public PlaceDelayModel {
-  public:
-    OverrideDelayModel(float min_cross_layer_delay,
-                       bool is_flat)
-        : cross_layer_delay_(min_cross_layer_delay)
-        , is_flat_(is_flat) {}
-    void compute(
-        RouterDelayProfiler& route_profiler,
-        const t_placer_opts& placer_opts,
-        const t_router_opts& router_opts,
-        int longest_length) override;
-    // returns delay from the specified (x,y) to the specified (x,y) with both endpoints on layer_num and the
-    // specified from and to pins
-    float delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const override;
-    void dump_echo(std::string filepath) const override;
-
-    void read(const std::string& file) override;
-    void write(const std::string& file) const override;
-
-  public: //Mutators
-    void set_base_delay_model(std::unique_ptr<DeltaDelayModel> base_delay_model);
-    const DeltaDelayModel* base_delay_model() const;
-    float get_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y) const;
-    void set_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y, float delay);
-
-  private:
-    std::unique_ptr<DeltaDelayModel> base_delay_model_;
-    /**
-     * @brief Minimum delay of cross-layer connections
-     */
-    float cross_layer_delay_;
-    /**
-     * @brief Indicates whether the router is a two-stage or run-flat
-     */
-    bool is_flat_;
 
-    void compute_override_delay_model(RouterDelayProfiler& router,
-                                      const t_router_opts& router_opts);
 
-    /**
-     * @brief Structure that allows delays to be queried from the delay model.
-     *
-     * Delay is calculated given the origin physical tile, the origin
-     * pin, the destination physical tile, and the destination pin.
-     * This structure encapsulates all these information.
-     *
-     *   @param from_type, to_type
-     *              Physical tile index (for easy array access)
-     *   @param from_class, to_class
-     *              The class that the pins belongs to.
-     *   @param to_x, to_y
-     *              The horizontal and vertical displacement
-     *              between two physical tiles.
-     */
-    struct t_override {
-        short from_type;
-        short to_type;
-        short from_class;
-        short to_class;
-        short delta_x;
-        short delta_y;
-
-        /**
-         * @brief Comparison operator designed for performance.
-         *
-         * Operator< is important since t_override serves as the key into the
-         * map structure delay_overrides_. A default comparison operator would
-         * not be inlined by the compiler.
-         *
-         * A combination of ALWAYS_INLINE attribute and std::lexicographical_compare
-         * is required for operator< to be inlined by compiler. Proper inlining of
-         * the function reduces place time by around 5%.
-         *
-         * For more information: https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1225
-         */
-        friend ALWAYS_INLINE bool operator<(const t_override& lhs, const t_override& rhs) {
-            const short* left = reinterpret_cast<const short*>(&lhs);
-            const short* right = reinterpret_cast<const short*>(&rhs);
-            constexpr size_t NUM_T_OVERRIDE_MEMBERS = sizeof(t_override) / sizeof(short);
-            return std::lexicographical_compare(left, left + NUM_T_OVERRIDE_MEMBERS, right, right + NUM_T_OVERRIDE_MEMBERS);
-        }
-    };
-
-    /**
-     * @brief Map data structure that returns delay values according to
-     *        specific delay model queries.
-     *
-     * Delay model queries are provided by the t_override structure, which
-     * encapsulates the information regarding the origin and the destination.
-     */
-    vtr::flat_map2<t_override, float> delay_overrides_;
-
-    /**
-     * operator< treats memory layout of t_override as an array of short.
-     * This requires all members of t_override are shorts and there is no
-     * padding between members of t_override.
-     */
-    static_assert(sizeof(t_override) == sizeof(t_override::from_type) + sizeof(t_override::to_type) + sizeof(t_override::from_class) + sizeof(t_override::to_class) + sizeof(t_override::delta_x) + sizeof(t_override::delta_y), "Expect t_override to have a memory layout equivalent to an array of short (no padding)");
-    static_assert(sizeof(t_override::from_type) == sizeof(short), "Expect all t_override data members to be shorts");
-    static_assert(sizeof(t_override::to_type) == sizeof(short), "Expect all t_override data members to be shorts");
-    static_assert(sizeof(t_override::from_class) == sizeof(short), "Expect all t_override data members to be shorts");
-    static_assert(sizeof(t_override::to_class) == sizeof(short), "Expect all t_override data members to be shorts");
-    static_assert(sizeof(t_override::delta_x) == sizeof(short), "Expect all t_override data members to be shorts");
-    static_assert(sizeof(t_override::delta_y) == sizeof(short), "Expect all t_override data members to be shorts");
-};
-
-///@brief A simple delay model based on the information stored in router lookahead
-///  This is in contrast to other placement delay models that get the cost of getting from one location to another by running the router
-class SimpleDelayModel : public PlaceDelayModel {
-  public:
-    SimpleDelayModel() {}
-
-    void compute(
-        RouterDelayProfiler& router,
-        const t_placer_opts& placer_opts,
-        const t_router_opts& router_opts,
-        int longest_length) override;
-    float delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const override;
-    void dump_echo(std::string /*filepath*/) const override {}
-
-    void read(const std::string& /*file*/) override {}
-    void write(const std::string& /*file*/) const override {}
-
-  private:
-    /**
-     * @brief The matrix to store the minimum delay between different points on different layers.
-     *
-     *The matrix used to store delay information is a 5D matrix. This data structure stores the minimum delay for each tile type on each layer to other layers
-     *for each dx and dy. We decided to separate the delay for each physical type on each die to accommodate cases where the connectivity of a physical type differs
-     *on each layer. Additionally, instead of using d_layer, we distinguish between the destination layer to handle scenarios where connectivity between layers
-     *is not uniform. For example, if the number of inter-layer connections between layer 1 and 2 differs from the number of connections between layer 0 and 1.
-     *One might argue that this variability could also occur for dx and dy. However, we are operating under the assumption that the FPGA fabric architecture is regular.
-     */
-    vtr::NdMatrix<float, 5> delays_; // [0..num_physical_type-1][0..num_layers-1][0..num_layers-1][0..max_dx][0..max_dy]
-};
diff --git a/vpr/src/place/timing/delay_model/simple_delay_model.cpp b/vpr/src/place/timing/delay_model/simple_delay_model.cpp
new file mode 100644
index 00000000000..0031d9eb1fe
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/simple_delay_model.cpp
@@ -0,0 +1,45 @@
+
+#include "simple_delay_model.h"
+
+
+/// Populates delays_ by asking the profiler for the minimum delay of every
+/// (tile type, source layer, sink layer, dx, dy) combination; the option arguments are unused.
+void SimpleDelayModel::compute(RouterDelayProfiler& route_profiler,
+                               const t_placer_opts& /*placer_opts*/,
+                               const t_router_opts& /*router_opts*/,
+                               int /*longest_length*/) {
+    const auto& device_ctx = g_vpr_ctx.device();
+    const auto& device_grid = device_ctx.grid;
+
+    const size_t tile_type_count = device_ctx.physical_tile_types.size();
+    const size_t layer_count = device_grid.get_num_layers();
+    const size_t grid_width = device_grid.width();
+    const size_t grid_height = device_grid.height();
+
+    // Table layout: [physical tile type][source layer][sink layer][dx][dy],
+    // with dx ranging over the grid width and dy over the grid height.
+    delays_ = vtr::NdMatrix<float, 5>({tile_type_count, layer_count, layer_count, grid_width, grid_height});
+
+    // Visit every cell of the table exactly once; the loop indices are handed
+    // to the profiler in the same order in which they index the table.
+    for (size_t type_idx = 0; type_idx < tile_type_count; ++type_idx) {
+        for (size_t src_layer = 0; src_layer < layer_count; ++src_layer) {
+            for (size_t dst_layer = 0; dst_layer < layer_count; ++dst_layer) {
+                for (size_t dx = 0; dx < grid_width; ++dx) {
+                    for (size_t dy = 0; dy < grid_height; ++dy) {
+                        delays_[type_idx][src_layer][dst_layer][dx][dy]
+                            = route_profiler.get_min_delay(type_idx, src_layer, dst_layer, dx, dy);
+                    }
+                }
+            }
+        }
+    }
+}
+
+/// Looks up the delays_ entry for the source tile type, the two layers, and the absolute x/y offsets.
+float SimpleDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const {
+    const int src_type_idx = g_vpr_ctx.device().grid.get_physical_type(from_loc)->index;
+    const int x_dist = std::abs(from_loc.x - to_loc.x);
+    const int y_dist = std::abs(from_loc.y - to_loc.y);
+    return delays_[src_type_idx][from_loc.layer_num][to_loc.layer_num][x_dist][y_dist];
+}
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/simple_delay_model.h b/vpr/src/place/timing/delay_model/simple_delay_model.h
new file mode 100644
index 00000000000..f5a856688cd
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/simple_delay_model.h
@@ -0,0 +1,39 @@
+
+#pragma once
+
+#include "place_delay_model.h"
+
+/**
+ * @class SimpleDelayModel
+ * @brief Placement delay model whose delays come from the information stored in the router lookahead.
+ * Unlike the other placement delay models, it does not run the router to cost a connection between two locations.
+ */
+class SimpleDelayModel : public PlaceDelayModel {
+  public:
+    SimpleDelayModel() = default;
+
+    /// @brief Fills the delay matrix from router-lookahead information rather than by running the router.
+    void compute(RouterDelayProfiler& router,
+                 const t_placer_opts& placer_opts,
+                 const t_router_opts& router_opts,
+                 int longest_length) override;
+
+    /// @brief Delay between two tile locations; the pin arguments are not used by this model.
+    float delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const override;
+
+    /// @brief Echo dumping and file reading/writing have empty bodies (no-ops) in this model.
+    void dump_echo(std::string /*filepath*/) const override {}
+    void read(const std::string& /*file*/) override {}
+    void write(const std::string& /*file*/) const override {}
+
+  private:
+    /**
+     * @brief 5D matrix of minimum delays, kept per physical tile type and per (source layer, sink layer) pair.
+     *
+     * Delays are stored separately for each physical type on each die because the connectivity of a type may
+     * differ between layers. The sink layer is indexed explicitly (instead of a layer delta) because inter-layer
+     * connectivity need not be uniform, e.g. layers 1 and 2 may have a different number of connections than
+     * layers 0 and 1. The same could be argued for dx and dy, but the FPGA fabric is assumed to be regular.
+     */
+    vtr::NdMatrix<float, 5> delays_; // [0..num_physical_type-1][0..num_layers-1][0..num_layers-1][0..max_dx][0..max_dy]
+};
\ No newline at end of file
diff --git a/vpr/src/place/timing_place_lookup.cpp b/vpr/src/place/timing_place_lookup.cpp
index fa6a9acb0bb..76b06bbc55b 100644
--- a/vpr/src/place/timing_place_lookup.cpp
+++ b/vpr/src/place/timing_place_lookup.cpp
@@ -25,6 +25,9 @@
 #include "route_profiling.h"
 #include "router_delay_profiling.h"
 #include "place_delay_model.h"
+#include "simple_delay_model.h"
+#include "delta_delay_model.h"
+#include "override_delay_model.h"
 
 /*To compute delay between blocks we calculate the delay between */
 /*different nodes in the FPGA.  From this procedure we generate
@@ -123,13 +126,6 @@ static vtr::NdMatrix<float, 4> compute_delta_delay_model(
     int longest_length,
     bool is_flat);
 
-/**
- * @brief Use the information in the router lookahead to fill the delay matrix instead of running the router
- * @param route_profiler
- * @return The delay matrix that contain the minimum cost between two locations
- */
-static vtr::NdMatrix<float, 5> compute_simple_delay_model(RouterDelayProfiler& route_profiler);
-
 static bool find_direct_connect_sample_locations(const t_direct_inf* direct,
                                                  t_physical_tile_type_ptr from_type,
                                                  int from_pin,
@@ -209,11 +205,10 @@ std::unique_ptr<PlaceDelayModel> compute_place_delay_model(const t_placer_opts&
     return place_delay_model;
 }
 
-void DeltaDelayModel::compute(
-    RouterDelayProfiler& route_profiler,
-    const t_placer_opts& placer_opts,
-    const t_router_opts& router_opts,
-    int longest_length) {
+void DeltaDelayModel::compute(RouterDelayProfiler& route_profiler,
+                              const t_placer_opts& placer_opts,
+                              const t_router_opts& router_opts,
+                              int longest_length) {
     delays_ = compute_delta_delay_model(
         route_profiler,
         placer_opts, router_opts, /*measure_directconnect=*/true,
@@ -237,14 +232,6 @@ void OverrideDelayModel::compute(
     compute_override_delay_model(route_profiler, router_opts);
 }
 
-void SimpleDelayModel::compute(
-    RouterDelayProfiler& router,
-    const t_placer_opts& /*placer_opts*/,
-    const t_router_opts& /*router_opts*/,
-    int /*longest_length*/) {
-    delays_ = compute_simple_delay_model(router);
-}
-
 /******* File Accessible Functions **********/
 
 std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type) {
@@ -1028,36 +1015,7 @@ static vtr::NdMatrix<float, 4> compute_delta_delay_model(
     return delta_delays;
 }
 
-static vtr::NdMatrix<float, 5> compute_simple_delay_model(RouterDelayProfiler& route_profiler) {
-    const auto& grid = g_vpr_ctx.device().grid;
-    int num_physical_tile_types = static_cast<int>(g_vpr_ctx.device().physical_tile_types.size());
-    // Initializing the delay matrix to [num_physical_types][num_layers][num_layers][width][height]
-    // The second index related to the layer that the source location is on and the third index is for the sink layer
-    vtr::NdMatrix<float, 5> delta_delays({static_cast<unsigned long>(num_physical_tile_types),
-                                          static_cast<unsigned long>(grid.get_num_layers()),
-                                          static_cast<unsigned long>(grid.get_num_layers()),
-                                          grid.width(),
-                                          grid.height()});
-
-    for (int physical_tile_type_idx = 0; physical_tile_type_idx < num_physical_tile_types; ++physical_tile_type_idx) {
-        for (int from_layer = 0; from_layer < grid.get_num_layers(); ++from_layer) {
-            for (int to_layer = 0; to_layer < grid.get_num_layers(); ++to_layer) {
-                for (int dx = 0; dx < static_cast<int>(grid.width()); ++dx) {
-                    for (int dy = 0; dy < static_cast<int>(grid.height()); ++dy) {
-                        float min_delay = route_profiler.get_min_delay(physical_tile_type_idx,
-                                                                       from_layer,
-                                                                       to_layer,
-                                                                       dx,
-                                                                       dy);
-                        delta_delays[physical_tile_type_idx][from_layer][to_layer][dx][dy] = min_delay;
-                    }
-                }
-            }
-        }
-    }
 
-    return delta_delays;
-}
 
 //Finds a src_rr and sink_rr appropriate for measuring the delay of the current direct specification
 static bool find_direct_connect_sample_locations(const t_direct_inf* direct,

From bf02e65dfa68bfa9e04f77a9aae37436a30af585 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Fri, 29 Nov 2024 12:23:13 -0500
Subject: [PATCH 04/39] add compute_delta_delays_utils files

---
 .../compute_delta_delays_utils.cpp            |  889 ++++++++++++++
 .../delay_model/compute_delta_delays_utils.h  |   27 +
 .../timing/delay_model/delta_delay_model.cpp  |   14 +
 .../delay_model/override_delay_model.cpp      |   95 ++
 vpr/src/place/timing_place_lookup.cpp         | 1024 +----------------
 5 files changed, 1026 insertions(+), 1023 deletions(-)
 create mode 100644 vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
 create mode 100644 vpr/src/place/timing/delay_model/compute_delta_delays_utils.h

diff --git a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
new file mode 100644
index 00000000000..78855a251b6
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
@@ -0,0 +1,889 @@
+
+#include "compute_delta_delays_utils.h"
+
+#include "vtr_time.h"
+#include "vtr_math.h"
+#include "physical_types.h"
+#include "globals.h"
+#include "timing_place_lookup.h"
+
+/// Indicates the delta delay value has not been calculated
+static constexpr float UNINITIALIZED_DELTA = -1;
+/// Indicates delta delay from/to an EMPTY block
+static constexpr float EMPTY_DELTA = -2;
+/// Indicates there is no valid delta delay
+static constexpr float IMPOSSIBLE_DELTA = std::numeric_limits<float>::infinity();
+
+static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_profiler,
+                                                    const t_placer_opts& placer_opts,
+                                                    const t_router_opts& router_opts,
+                                                    bool measure_directconnect,
+                                                    size_t longest_length,
+                                                    bool is_flat);
+
+static void fix_uninitialized_coordinates(vtr::NdMatrix<float, 4>& delta_delays);
+
+static void fix_empty_coordinates(vtr::NdMatrix<float, 4>& delta_delays);
+
+static void fill_impossible_coordinates(vtr::NdMatrix<float, 4>& delta_delays);
+
+static bool verify_delta_delays(const vtr::NdMatrix<float, 4>& delta_delays);
+
+static void generic_compute_matrix_iterative_astar(RouterDelayProfiler& route_profiler,
+                                                   vtr::Matrix<std::vector<float>>& matrix,
+                                                   int from_layer_num,
+                                                   int to_layer_num,
+                                                   int source_x,
+                                                   int source_y,
+                                                   int start_x,
+                                                   int start_y,
+                                                   int end_x,
+                                                   int end_y,
+                                                   const t_router_opts& router_opts,
+                                                   bool measure_directconnect,
+                                                   const std::set<std::string>& allowed_types,
+                                                   bool /*is_flat*/);
+
+static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& route_profiler,
+                                                      vtr::Matrix<std::vector<float>>& matrix,
+                                                      int from_layer_num,
+                                                      int to_layer_num,
+                                                      int source_x,
+                                                      int source_y,
+                                                      int start_x,
+                                                      int start_y,
+                                                      int end_x,
+                                                      int end_y,
+                                                      const t_router_opts& router_opts,
+                                                      bool measure_directconnect,
+                                                      const std::set<std::string>& allowed_types,
+                                                      bool is_flat);
+
+static float route_connection_delay(RouterDelayProfiler& route_profiler,
+                                    int from_layer_num,
+                                    int to_layer_num,
+                                    int source_x_loc,
+                                    int source_y_loc,
+                                    int sink_x_loc,
+                                    int sink_y_loc,
+                                    const t_router_opts& router_opts,
+                                    bool measure_directconnect);
+
+float delay_reduce(std::vector<float>& delays, e_reducer reducer);
+
+static void add_delay_to_matrix(vtr::Matrix<std::vector<float>>* matrix,
+                                int delta_x,
+                                int delta_y,
+                                float delay);
+
+static float find_neighboring_average(vtr::NdMatrix<float, 4>& matrix,
+                                      int from_layer,
+                                      t_physical_tile_loc to_tile_loc,
+                                      int max_distance);
+
+/***************************************************************************************/
+
+static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_profiler,
+                                                    const t_placer_opts& placer_opts,
+                                                    const t_router_opts& router_opts,
+                                                    bool measure_directconnect,
+                                                    size_t longest_length,
+                                                    bool is_flat) {
+    /* Samples delays from several source locations and reduces them per (dx, dy) delta.
+     * To avoid edge effects we also place the source at least 'longest_length' away
+     * from the device edge and route from there for all possible delta values < dimension */
+
+    const auto& device_ctx = g_vpr_ctx.device();
+    const auto& grid = device_ctx.grid;
+
+    const size_t num_layers = grid.get_num_layers();
+    const size_t device_width = grid.width();
+    const size_t device_height = grid.height();
+
+    vtr::NdMatrix<float, 4> delta_delays({num_layers, num_layers, device_width, device_height});
+
+    for (int from_layer_num = 0; from_layer_num < (int)num_layers; from_layer_num++) {
+        for (int to_layer_num = 0; to_layer_num < (int)num_layers; to_layer_num++) {
+            vtr::NdMatrix<std::vector<float>, 2> sampled_delta_delays({device_width, device_height});
+
+            size_t mid_x = vtr::nint(device_width / 2);
+            size_t mid_y = vtr::nint(device_height / 2);
+
+            size_t low_x = std::min(longest_length, mid_x);
+            size_t low_y = std::min(longest_length, mid_y);
+            size_t high_x = mid_x;
+            size_t high_y = mid_y;
+            if (longest_length <= device_width) {
+                high_x = std::max(device_width - longest_length, mid_x);
+            }
+            if (longest_length <= device_height) {
+                high_y = std::max(device_height - longest_length, mid_y); // y bound must use the height
+            }
+
+            std::set<std::string> allowed_types;
+            if (!placer_opts.allowed_tiles_for_delay_model.empty()) {
+                auto allowed_types_vector = vtr::split(placer_opts.allowed_tiles_for_delay_model, ",");
+                for (const auto& type : allowed_types_vector) {
+                    allowed_types.insert(type);
+                }
+            }
+
+            //   +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+            //   +                 |                       |               +
+            //   +        A        |           B           |       C       +
+            //   +                 |                       |               +
+            //   +-----------------\-----------------------.---------------+
+            //   +                 |                       |               +
+            //   +                 |                       |               +
+            //   +                 |                       |               +
+            //   +                 |                       |               +
+            //   +        D        |           E           |       F       +
+            //   +                 |                       |               +
+            //   +                 |                       |               +
+            //   +                 |                       |               +
+            //   +                 |                       |               +
+            //   +-----------------*-----------------------/---------------+
+            //   +                 |                       |               +
+            //   +        G        |           H           |       I       +
+            //   +                 |                       |               +
+            //   +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+            //
+            //   * = (low_x, low_y)
+            //   . = (high_x, high_y)
+            //   / = (high_x, low_y)
+            //   \ = (low_x, high_y)
+            //   + = device edge
+
+            //Find the lowest y location on the left edge with a non-empty block
+            int y = 0;
+            int x = 0;
+            t_physical_tile_type_ptr src_type = nullptr;
+            for (x = 0; x < (int)grid.width(); ++x) {
+                for (y = 0; y < (int)grid.height(); ++y) {
+                    auto type = grid.get_physical_type({x, y, from_layer_num});
+
+                    if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
+                        if (!allowed_types.empty() && allowed_types.find(type->name) == allowed_types.end()) {
+                            continue;
+                        }
+                        src_type = type;
+                        break;
+                    }
+                }
+                if (src_type) {
+                    break;
+                }
+            }
+            VTR_ASSERT(src_type != nullptr);
+
+            auto generic_compute_matrix = (placer_opts.place_delta_delay_matrix_calculation_method == e_place_delta_delay_algorithm::ASTAR_ROUTE) ? generic_compute_matrix_iterative_astar : generic_compute_matrix_dijkstra_expansion;
+
+#ifdef VERBOSE
+            VTR_LOG("Computing from lower left edge (%d,%d):\n", x, y);
+#endif
+            generic_compute_matrix(route_profiler, sampled_delta_delays,
+                                   from_layer_num, to_layer_num,
+                                   x, y,
+                                   x, y,
+                                   grid.width() - 1, grid.height() - 1,
+                                   router_opts,
+                                   measure_directconnect, allowed_types,
+                                   is_flat);
+
+            //Find the lowest x location on the bottom edge with a non-empty block
+            src_type = nullptr;
+            for (y = 0; y < (int)grid.height(); ++y) {
+                for (x = 0; x < (int)grid.width(); ++x) {
+                    auto type = grid.get_physical_type({x, y, from_layer_num});
+
+                    if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
+                        if (!allowed_types.empty() && allowed_types.find(type->name) == allowed_types.end()) {
+                            continue;
+                        }
+                        src_type = type;
+                        break;
+                    }
+                }
+                if (src_type) {
+                    break;
+                }
+            }
+            VTR_ASSERT(src_type != nullptr);
+#ifdef VERBOSE
+            VTR_LOG("Computing from left bottom edge (%d,%d):\n", x, y);
+#endif
+            generic_compute_matrix(route_profiler, sampled_delta_delays,
+                                   from_layer_num, to_layer_num,
+                                   x, y,
+                                   x, y,
+                                   grid.width() - 1, grid.height() - 1,
+                                   router_opts,
+                                   measure_directconnect, allowed_types,
+                                   is_flat);
+
+            //Since the other delta delay values may have suffered from edge effects,
+            //we recalculate deltas within regions B, C, E, F
+#ifdef VERBOSE
+            VTR_LOG("Computing from low/low:\n");
+#endif
+            generic_compute_matrix(route_profiler, sampled_delta_delays,
+                                   from_layer_num, to_layer_num,
+                                   low_x, low_y,
+                                   low_x, low_y,
+                                   grid.width() - 1, grid.height() - 1,
+                                   router_opts,
+                                   measure_directconnect, allowed_types,
+                                   is_flat);
+
+            //Since the other delta delay values may have suffered from edge effects,
+            //we recalculate deltas within regions D, E, G, H
+#ifdef VERBOSE
+            VTR_LOG("Computing from high/high:\n");
+#endif
+            generic_compute_matrix(route_profiler, sampled_delta_delays,
+                                   from_layer_num, to_layer_num,
+                                   high_x, high_y,
+                                   0, 0,
+                                   high_x, high_y,
+                                   router_opts,
+                                   measure_directconnect, allowed_types,
+                                   is_flat);
+
+            //Since the other delta delay values may have suffered from edge effects,
+            //we recalculate deltas within regions A, B, D, E
+#ifdef VERBOSE
+            VTR_LOG("Computing from high/low:\n");
+#endif
+            generic_compute_matrix(route_profiler, sampled_delta_delays,
+                                   from_layer_num, to_layer_num,
+                                   high_x, low_y,
+                                   0, low_y,
+                                   high_x, grid.height() - 1,
+                                   router_opts,
+                                   measure_directconnect, allowed_types,
+                                   is_flat);
+
+            //Since the other delta delay values may have suffered from edge effects,
+            //we recalculate deltas within regions E, F, H, I
+#ifdef VERBOSE
+            VTR_LOG("Computing from low/high:\n");
+#endif
+            generic_compute_matrix(route_profiler, sampled_delta_delays,
+                                   from_layer_num, to_layer_num,
+                                   low_x, high_y,
+                                   low_x, 0,
+                                   grid.width() - 1, high_y,
+                                   router_opts,
+                                   measure_directconnect, allowed_types,
+                                   is_flat);
+            for (size_t dx = 0; dx < sampled_delta_delays.dim_size(0); ++dx) {
+                for (size_t dy = 0; dy < sampled_delta_delays.dim_size(1); ++dy) {
+                    delta_delays[from_layer_num][to_layer_num][dx][dy] = delay_reduce(sampled_delta_delays[dx][dy], placer_opts.delay_model_reducer);
+                }
+            }
+        }
+    }
+
+    return delta_delays;
+}
+
+static void fix_uninitialized_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
+    // Mark any delta that is still UNINITIALIZED_DELTA as IMPOSSIBLE_DELTA (no averaging is done here)
+
+    for (size_t from_layer_num = 0; from_layer_num < delta_delays.dim_size(0); ++from_layer_num) {
+        for (size_t to_layer_num = 0; to_layer_num < delta_delays.dim_size(1); ++to_layer_num) {
+            for (size_t delta_x = 0; delta_x < delta_delays.dim_size(2); ++delta_x) {
+                for (size_t delta_y = 0; delta_y < delta_delays.dim_size(3); ++delta_y) {
+                    if (delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] == UNINITIALIZED_DELTA) {
+                        delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] = IMPOSSIBLE_DELTA;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void fix_empty_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
+    // Set any empty deltas to the average of their neighbours
+    //
+    // Empty coordinates may occur if the sampling location happens to not have
+    // a connection at that location.  However a more thorough sampling likely
+    // would return a result, so we fill in the empty holes with a small
+    // neighbour average.
+    constexpr int kMaxAverageDistance = 2;
+    for (int from_layer = 0; from_layer < (int)delta_delays.dim_size(0); ++from_layer) {
+        for (int to_layer = 0; to_layer < (int)delta_delays.dim_size(1); ++to_layer) {
+            for (int delta_x = 0; delta_x < (int)delta_delays.dim_size(2); ++delta_x) {
+                for (int delta_y = 0; delta_y < (int)delta_delays.dim_size(3); ++delta_y) {
+                    if (delta_delays[from_layer][to_layer][delta_x][delta_y] == EMPTY_DELTA) {
+                        delta_delays[from_layer][to_layer][delta_x][delta_y] =
+                            find_neighboring_average(delta_delays,
+                                                     from_layer,
+                                                     {delta_x, delta_y, to_layer},
+                                                     kMaxAverageDistance);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void fill_impossible_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
+    // Set any impossible deltas to the average of their neighbours
+    //
+    // Impossible coordinates may occur if an IPIN cannot be reached from the
+    // sampling OPIN.  This might occur if the IPIN or OPIN used for sampling
+    // is specialized, and therefore cannot be reached via the pins
+    // sampled.  Leaving this value in the delay matrix will result in invalid
+    // slacks if the delay matrix uses this value.
+    //
+    // A max average distance of 5 is used to provide increased effort in
+    // filling these gaps.  It is more important to have a poor prediction,
+    // than an invalid value and causing a slack assertion.
+    constexpr int kMaxAverageDistance = 5;
+    for (int from_layer_num = 0; from_layer_num < (int)delta_delays.dim_size(0); ++from_layer_num) {
+        for (int to_layer_num = 0; to_layer_num < (int)delta_delays.dim_size(1); ++to_layer_num) {
+            for (int delta_x = 0; delta_x < (int)delta_delays.dim_size(2); ++delta_x) {
+                for (int delta_y = 0; delta_y < (int)delta_delays.dim_size(3); ++delta_y) {
+                    if (delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] == IMPOSSIBLE_DELTA) {
+                        delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] = find_neighboring_average(
+                            delta_delays, from_layer_num, {delta_x, delta_y, to_layer_num}, kMaxAverageDistance);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static bool verify_delta_delays(const vtr::NdMatrix<float, 4>& delta_delays) {
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& grid = device_ctx.grid;
+    // Scan every (from_layer, to_layer, x, y) entry and report an error for any negative delay.
+    for (int from_layer_num = 0; from_layer_num < grid.get_num_layers(); ++from_layer_num) {
+        for (int to_layer_num = 0; to_layer_num < grid.get_num_layers(); ++to_layer_num) {
+            for (size_t x = 0; x < grid.width(); ++x) {
+                for (size_t y = 0; y < grid.height(); ++y) {
+                    float delta_delay = delta_delays[from_layer_num][to_layer_num][x][y];
+
+                    if (delta_delay < 0.) {
+                        VPR_ERROR(VPR_ERROR_PLACE,
+                                  "Found invalid negative delay %g for delta [%d,%d,%zu,%zu]",
+                                  delta_delay, from_layer_num, to_layer_num, x, y);
+                    }
+                }
+            }
+        }
+    }
+
+    return true;
+}
+
+static void generic_compute_matrix_iterative_astar(RouterDelayProfiler& route_profiler,
+                                                   vtr::Matrix<std::vector<float>>& matrix,
+                                                   int from_layer_num,
+                                                   int to_layer_num,
+                                                   int source_x,
+                                                   int source_y,
+                                                   int start_x,
+                                                   int start_y,
+                                                   int end_x,
+                                                   int end_y,
+                                                   const t_router_opts& router_opts,
+                                                   bool measure_directconnect,
+                                                   const std::set<std::string>& allowed_types,
+                                                   bool /*is_flat*/) {
+    //vtr::ScopedStartFinishTimer t(vtr::string_fmt("Profiling from (%d,%d)", source_x, source_y));
+    // Routes from the fixed source to each sink in [start_x, end_x] x [start_y, end_y] and files the delay under (|dx|, |dy|).
+    int delta_x, delta_y;
+    int sink_x, sink_y;
+
+    auto& device_ctx = g_vpr_ctx.device();
+
+    for (sink_x = start_x; sink_x <= end_x; sink_x++) {
+        for (sink_y = start_y; sink_y <= end_y; sink_y++) {
+            delta_x = abs(sink_x - source_x);
+            delta_y = abs(sink_y - source_y);
+
+            t_physical_tile_type_ptr src_type = device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num});
+            t_physical_tile_type_ptr sink_type = device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num});
+
+            bool src_or_target_empty = (src_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE
+                                        || sink_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE);
+            // Note: only the source tile's type name is checked against allowed_types.
+            bool is_allowed_type = allowed_types.empty() || allowed_types.find(src_type->name) != allowed_types.end();
+
+            if (src_or_target_empty || !is_allowed_type) {
+                if (matrix[delta_x][delta_y].empty()) {
+                    //Only set empty target if we don't already have a valid delta delay
+                    matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
+#ifdef VERBOSE
+                    VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
+                            "EMPTY",
+                            delta_x, delta_y,
+                            source_x, source_y,
+                            sink_x, sink_y);
+#endif
+                }
+            } else {
+                //Valid start/end
+
+                float delay = route_connection_delay(route_profiler,
+                                                     from_layer_num,
+                                                     to_layer_num,
+                                                     source_x,
+                                                     source_y,
+                                                     sink_x,
+                                                     sink_y,
+                                                     router_opts,
+                                                     measure_directconnect);
+
+#ifdef VERBOSE
+                VTR_LOG("Computed delay: %12g delta: %d,%d (src: %d,%d sink: %d,%d)\n",
+                        delay,
+                        delta_x, delta_y,
+                        source_x, source_y,
+                        sink_x, sink_y);
+#endif
+                if (matrix[delta_x][delta_y].size() == 1 && matrix[delta_x][delta_y][0] == EMPTY_DELTA) {
+                    //Overwrite empty delta
+                    matrix[delta_x][delta_y][0] = delay;
+                } else {
+                    //Collect delta
+                    matrix[delta_x][delta_y].push_back(delay);
+                }
+            }
+        }
+    }
+}
+
+static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& /*route_profiler*/,
+                                                      vtr::Matrix<std::vector<float>>& matrix,
+                                                      int from_layer_num,
+                                                      int to_layer_num,
+                                                      int source_x,
+                                                      int source_y,
+                                                      int start_x,
+                                                      int start_y,
+                                                      int end_x,
+                                                      int end_y,
+                                                      const t_router_opts& router_opts,
+                                                      bool measure_directconnect,
+                                                      const std::set<std::string>& allowed_types,
+                                                      bool is_flat) {
+    auto& device_ctx = g_vpr_ctx.device();
+    // An empty or disallowed source tile cannot be sampled: mark untouched deltas in the window as EMPTY_DELTA.
+    t_physical_tile_type_ptr src_type = device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num});
+    bool is_allowed_type = allowed_types.empty() || allowed_types.find(src_type->name) != allowed_types.end();
+    if (src_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE || !is_allowed_type) {
+        for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
+            for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
+                int delta_x = abs(sink_x - source_x);
+                int delta_y = abs(sink_y - source_y);
+
+                if (matrix[delta_x][delta_y].empty()) {
+                    //Only set empty target if we don't already have a valid delta delay
+                    matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
+#ifdef VERBOSE
+                    VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
+                            "EMPTY",
+                            delta_x, delta_y,
+                            source_x, source_y,
+                            sink_x, sink_y);
+#endif
+                }
+            }
+        }
+
+        return;
+    }
+    // found_matrix records which (dx, dy) deltas have been filled during this call.
+    vtr::Matrix<bool> found_matrix({matrix.dim_size(0), matrix.dim_size(1)}, false);
+
+    auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num}));
+    for (int driver_ptc : best_driver_ptcs) {
+        VTR_ASSERT(driver_ptc != OPEN);
+        RRNodeId source_rr_node = device_ctx.rr_graph.node_lookup().find_node(from_layer_num, source_x, source_y, SOURCE, driver_ptc);
+
+        VTR_ASSERT(source_rr_node != RRNodeId::INVALID());
+        auto delays = calculate_all_path_delays_from_rr_node(source_rr_node, router_opts, is_flat);
+
+        bool path_to_all_sinks = true;
+        for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
+            for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
+                int delta_x = abs(sink_x - source_x);
+                int delta_y = abs(sink_y - source_y);
+
+                if (found_matrix[delta_x][delta_y]) {
+                    continue;
+                }
+
+                t_physical_tile_type_ptr sink_type = device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num});
+                if (sink_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
+                    if (matrix[delta_x][delta_y].empty()) {
+                        //Only set empty target if we don't already have a valid delta delay
+                        matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
+#ifdef VERBOSE
+                        VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
+                                "EMPTY",
+                                delta_x, delta_y,
+                                source_x, source_y,
+                                sink_x, sink_y);
+#endif
+                        found_matrix[delta_x][delta_y] = true;
+                    }
+                } else {
+                    bool found_a_sink = false;
+                    auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num}));
+                    for (int sink_ptc : best_sink_ptcs) {
+                        VTR_ASSERT(sink_ptc != OPEN);
+                        RRNodeId sink_rr_node = device_ctx.rr_graph.node_lookup().find_node(to_layer_num, sink_x, sink_y, SINK, sink_ptc);
+
+                        if (sink_rr_node == RRNodeId::INVALID())
+                            continue;
+
+                        if (!measure_directconnect && directconnect_exists(source_rr_node, sink_rr_node)) {
+                            //Skip if we shouldn't measure direct connects and a direct connect exists
+                            continue;
+                        }
+
+                        if (std::isnan(delays[sink_rr_node])) {
+                            // This sink was not found
+                            continue;
+                        }
+
+#ifdef VERBOSE
+                        VTR_LOG("Computed delay: %12g delta: %d,%d (src: %d,%d sink: %d,%d)\n",
+                                delays[size_t(sink_rr_node)],
+                                delta_x, delta_y,
+                                source_x, source_y,
+                                sink_x, sink_y);
+#endif
+                        found_matrix[delta_x][delta_y] = true;
+
+                        add_delay_to_matrix(&matrix, delta_x, delta_y, delays[sink_rr_node]);
+
+                        found_a_sink = true;
+                        break;
+                    }
+
+                    if (!found_a_sink) {
+                        path_to_all_sinks = false;
+                    }
+                }
+            }
+        }
+
+        if (path_to_all_sinks) {
+            break;
+        }
+    }
+    // Deltas in the window that were never filled get IMPOSSIBLE_DELTA plus a warning.
+    for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
+        for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
+            int delta_x = abs(sink_x - source_x);
+            int delta_y = abs(sink_y - source_y);
+            if (!found_matrix[delta_x][delta_y]) {
+                add_delay_to_matrix(&matrix, delta_x, delta_y, IMPOSSIBLE_DELTA);
+                VTR_LOG_WARN("Unable to route between blocks at (%d,%d,%d) and (%d,%d,%d) to characterize delay (setting to %g)\n",
+                             source_x,
+                             source_y,
+                             from_layer_num,
+                             sink_x,
+                             sink_y,
+                             to_layer_num,
+                             IMPOSSIBLE_DELTA);
+            }
+        }
+    }
+}
+
+static float route_connection_delay(RouterDelayProfiler& route_profiler,
+                                    int from_layer_num,
+                                    int to_layer_num,
+                                    int source_x,
+                                    int source_y,
+                                    int sink_x,
+                                    int sink_y,
+                                    const t_router_opts& router_opts,
+                                    bool measure_directconnect) {
+    //Routes between the source and sink locations, trying driver/sink class pairs until one succeeds, and returns the delay
+
+    float net_delay_value = IMPOSSIBLE_DELTA; /*set to known value for debug purposes */
+
+    auto& device_ctx = g_vpr_ctx.device();
+
+    bool successfully_routed = false;
+
+    //Get the rr nodes to route between
+    auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num}));
+    auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num}));
+
+    for (int driver_ptc : best_driver_ptcs) {
+        VTR_ASSERT(driver_ptc != OPEN);
+        RRNodeId source_rr_node = device_ctx.rr_graph.node_lookup().find_node(from_layer_num, source_x, source_y, SOURCE, driver_ptc);
+
+        VTR_ASSERT(source_rr_node != RRNodeId::INVALID());
+
+        for (int sink_ptc : best_sink_ptcs) {
+            VTR_ASSERT(sink_ptc != OPEN);
+            RRNodeId sink_rr_node = device_ctx.rr_graph.node_lookup().find_node(to_layer_num, sink_x, sink_y, SINK, sink_ptc);
+
+            if (sink_rr_node == RRNodeId::INVALID())
+                continue;
+
+            if (!measure_directconnect && directconnect_exists(source_rr_node, sink_rr_node)) {
+                //Skip if we shouldn't measure direct connects and a direct connect exists
+                continue;
+            }
+
+            {
+                successfully_routed = route_profiler.calculate_delay(
+                    source_rr_node, sink_rr_node,
+                    router_opts,
+                    &net_delay_value);
+            }
+
+            if (successfully_routed) break;
+        }
+        if (successfully_routed) break;
+    }
+
+    if (!successfully_routed) {
+        VTR_LOG_WARN("Unable to route between blocks at (%d,%d,%d) and (%d,%d,%d) to characterize delay (setting to %g)\n",
+                     source_x, source_y, from_layer_num, sink_x, sink_y, to_layer_num, net_delay_value);
+    }
+
+    return net_delay_value;
+}
+
+float delay_reduce(std::vector<float>& delays, e_reducer reducer) {
+    if (delays.empty()) { //no samples to reduce
+        return IMPOSSIBLE_DELTA;
+    } else if (delays.size() == 1) { //a single sample is returned as-is, whatever the reducer
+        return delays[0];
+    }
+
+    VTR_ASSERT(delays.size() > 1);
+
+    float delay; //assigned by every recognized reducer branch below
+
+    if (reducer == e_reducer::MIN) {
+        auto itr = std::min_element(delays.begin(), delays.end());
+        delay = *itr;
+    } else if (reducer == e_reducer::MAX) {
+        auto itr = std::max_element(delays.begin(), delays.end());
+        delay = *itr;
+    } else if (reducer == e_reducer::MEDIAN) {
+        std::stable_sort(delays.begin(), delays.end()); //NOTE: reorders the caller's vector in place
+        delay = vtr::median(delays.begin(), delays.end());
+    } else if (reducer == e_reducer::ARITHMEAN) {
+        delay = vtr::arithmean(delays.begin(), delays.end());
+    } else if (reducer == e_reducer::GEOMEAN) {
+        delay = vtr::geomean(delays.begin(), delays.end());
+    } else {
+        VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Unrecognized delta delay reducer");
+    }
+
+    return delay;
+}
+
+static void add_delay_to_matrix(vtr::Matrix<std::vector<float>>* matrix,
+                                int delta_x,
+                                int delta_y,
+                                float delay) {
+    if ((*matrix)[delta_x][delta_y].size() == 1 && (*matrix)[delta_x][delta_y][0] == EMPTY_DELTA) {
+        //The only entry is the EMPTY_DELTA placeholder: overwrite it with this delay
+        (*matrix)[delta_x][delta_y][0] = delay;
+    } else {
+        //Otherwise append this delay to the entries already collected for this delta
+        (*matrix)[delta_x][delta_y].push_back(delay);
+    }
+}
+
+/* Returns the average of the legal delays stored near (x,y) = (to_tile_loc.x, to_tile_loc.y).
+ * We start with a distance of 1 and average the in-bounds entries whose Manhattan distance
+ * from (x,y) is at most that distance, skipping (x,y) itself and any EMPTY_DELTA or
+ * IMPOSSIBLE_DELTA entries; if none are found we increase the distance and try again.
+ *
+ * If no legal values are found to average over with a range of max_distance,
+ * we return IMPOSSIBLE_DELTA.
+ */
+static float find_neighboring_average(vtr::NdMatrix<float, 4>& matrix,
+                                      int from_layer,
+                                      t_physical_tile_loc to_tile_loc,
+                                      int max_distance) {
+    float sum = 0;
+    int counter = 0;
+    int endx = matrix.end_index(2);
+    int endy = matrix.end_index(3);
+
+    int x = to_tile_loc.x;
+    int y = to_tile_loc.y;
+    int to_layer = to_tile_loc.layer_num;
+
+    for (int distance = 1; distance <= max_distance; ++distance) {
+        for (int delx = x - distance; delx <= x + distance; delx++) {
+            for (int dely = y - distance; dely <= y + distance; dely++) {
+                // Only consider entries within a Manhattan distance of `distance` from (x,y)
+                if (abs(delx - x) + abs(dely - y) > distance) {
+                    continue;
+                }
+
+                //Skip out-of-bounds entries and the (x,y) entry itself
+                if (delx < 0 || dely < 0 || delx >= endx || dely >= endy || (delx == x && dely == y)) {
+                    continue;
+                }
+
+                if (matrix[from_layer][to_layer][delx][dely] == EMPTY_DELTA || matrix[from_layer][to_layer][delx][dely] == IMPOSSIBLE_DELTA) {
+                    continue;
+                }
+                counter++;
+                sum += matrix[from_layer][to_layer][delx][dely];
+            }
+        }
+        if (counter != 0) {
+            return sum / (float)counter;
+        }
+    }
+
+    return IMPOSSIBLE_DELTA;
+}
+
+/***************************************************************************************/
+
+vtr::NdMatrix<float, 4> compute_delta_delay_model(RouterDelayProfiler& route_profiler,
+                                                  const t_placer_opts& placer_opts,
+                                                  const t_router_opts& router_opts,
+                                                  bool measure_directconnect,
+                                                  int longest_length,
+                                                  bool is_flat) {
+    vtr::ScopedStartFinishTimer timer("Computing delta delays");
+    vtr::NdMatrix<float, 4> delta_delays = compute_delta_delays(route_profiler,
+                                                                placer_opts,
+                                                                router_opts,
+                                                                measure_directconnect,
+                                                                longest_length,
+                                                                is_flat);
+
+    fix_uninitialized_coordinates(delta_delays);
+
+    fix_empty_coordinates(delta_delays);
+
+    fill_impossible_coordinates(delta_delays);
+
+    verify_delta_delays(delta_delays);
+
+    return delta_delays;
+}
+
+//Finds a SOURCE/SINK rr node pair appropriate for measuring the delay of the given direct specification; returns false if no instance exists
+bool find_direct_connect_sample_locations(const t_direct_inf* direct,
+                                          t_physical_tile_type_ptr from_type,
+                                          int from_pin,
+                                          int from_pin_class,
+                                          t_physical_tile_type_ptr to_type,
+                                          int to_pin,
+                                          int to_pin_class,
+                                          RRNodeId& out_src_node,
+                                          RRNodeId& out_sink_node) {
+    VTR_ASSERT(from_type != nullptr);
+    VTR_ASSERT(to_type != nullptr);
+
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& grid = device_ctx.grid;
+    const auto& node_lookup = device_ctx.rr_graph.node_lookup();
+
+    //Search the grid for an instance of from/to blocks which satisfies this direct connect's offsets,
+    //and which has the appropriate pins
+    int from_x = -1;
+    int from_y = -1;
+    int from_sub_tile = -1;
+    int to_x = 0, to_y = 0, to_sub_tile = 0;
+    bool found = false;
+    int found_layer_num = -1;
+    //TODO: This function *FOR NOW* assumes that the from/to blocks are on the same die, i.e. have the same layer number
+    for (int layer_num = 0; layer_num < grid.get_num_layers() && !found; ++layer_num) {
+        for (int x = 0; x < (int)grid.width() && !found; ++x) {
+            to_x = x + direct->x_offset;
+            if (to_x < 0 || to_x >= (int)grid.width()) continue;
+
+            for (int y = 0; y < (int)grid.height() && !found; ++y) {
+                if (grid.get_physical_type({x, y, layer_num}) != from_type) continue;
+
+                //Check that the from pin exists at this from location
+                //(with multi-width/height blocks pins may not exist at all locations)
+                bool from_pin_found = false;
+                if (direct->from_side != NUM_2D_SIDES) {
+                    RRNodeId from_pin_rr = node_lookup.find_node(layer_num, x, y, OPIN, from_pin, direct->from_side);
+                    from_pin_found = from_pin_rr.is_valid();
+                } else {
+                    from_pin_found = !(node_lookup.find_nodes_at_all_sides(layer_num, x, y, OPIN, from_pin).empty());
+                }
+                if (!from_pin_found) continue;
+
+                to_y = y + direct->y_offset;
+
+                if (to_y < 0 || to_y >= (int)grid.height()) continue;
+                if (grid.get_physical_type({to_x, to_y, layer_num}) != to_type) continue;
+
+                //Check that the to pin exists at this to location
+                //(with multi-width/height blocks pins may not exist at all locations)
+                bool to_pin_found = false;
+                if (direct->to_side != NUM_2D_SIDES) {
+                    RRNodeId to_pin_rr = node_lookup.find_node(layer_num, to_x, to_y, IPIN, to_pin, direct->to_side);
+                    to_pin_found = (to_pin_rr != RRNodeId::INVALID());
+                } else {
+                    to_pin_found = !(node_lookup.find_nodes_at_all_sides(layer_num, to_x, to_y, IPIN, to_pin).empty());
+                }
+                if (!to_pin_found) continue;
+
+                for (int sub_tile_num = 0; sub_tile_num < from_type->capacity; ++sub_tile_num) {
+                    to_sub_tile = sub_tile_num + direct->sub_tile_offset;
+
+                    if (to_sub_tile < 0 || to_sub_tile >= to_type->capacity) continue;
+
+                    found = true;
+                    found_layer_num = layer_num;
+                    from_x = x;
+                    from_y = y;
+                    from_sub_tile = sub_tile_num;
+
+                    break;
+                }
+            }
+        }
+    }
+
+    if (!found) {
+        return false;
+    }
+
+    //Now have a legal instance of this direct connect
+    VTR_ASSERT(grid.get_physical_type({from_x, from_y, found_layer_num}) == from_type);
+    VTR_ASSERT(from_sub_tile < from_type->capacity);
+
+    VTR_ASSERT(grid.get_physical_type({to_x, to_y, found_layer_num}) == to_type);
+    VTR_ASSERT(to_sub_tile < to_type->capacity);
+
+    VTR_ASSERT(from_x + direct->x_offset == to_x);
+    VTR_ASSERT(from_y + direct->y_offset == to_y);
+    VTR_ASSERT(from_sub_tile + direct->sub_tile_offset == to_sub_tile);
+
+    //
+    //Find a source/sink RR node associated with the pins of the direct
+    //
+
+    {
+        RRNodeId src_rr_candidate = node_lookup.find_node(found_layer_num, from_x, from_y, SOURCE, from_pin_class);
+        VTR_ASSERT(src_rr_candidate);
+        out_src_node = src_rr_candidate;
+    }
+
+    {
+        RRNodeId sink_rr_candidate = node_lookup.find_node(found_layer_num, to_x, to_y, SINK, to_pin_class);
+        VTR_ASSERT(sink_rr_candidate);
+        out_sink_node = sink_rr_candidate;
+    }
+
+    return true;
+}
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.h b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.h
new file mode 100644
index 00000000000..bacff650334
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.h
@@ -0,0 +1,27 @@
+
+#pragma once
+
+#include "vtr_ndmatrix.h"
+#include "physical_types.h"
+#include "rr_graph_fwd.h"
+
+struct t_placer_opts;
+struct t_router_opts;
+class RouterDelayProfiler;
+
+vtr::NdMatrix<float, 4> compute_delta_delay_model(RouterDelayProfiler& route_profiler,
+                                                  const t_placer_opts& placer_opts,
+                                                  const t_router_opts& router_opts,
+                                                  bool measure_directconnect,
+                                                  int longest_length,
+                                                  bool is_flat);
+
+bool find_direct_connect_sample_locations(const t_direct_inf* direct,
+                                          t_physical_tile_type_ptr from_type,
+                                          int from_pin,
+                                          int from_pin_class,
+                                          t_physical_tile_type_ptr to_type,
+                                          int to_pin,
+                                          int to_pin_class,
+                                          RRNodeId& out_src_node,
+                                          RRNodeId& out_sink_node);
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/delta_delay_model.cpp b/vpr/src/place/timing/delay_model/delta_delay_model.cpp
index 55bb0104316..f4e202e7106 100644
--- a/vpr/src/place/timing/delay_model/delta_delay_model.cpp
+++ b/vpr/src/place/timing/delay_model/delta_delay_model.cpp
@@ -1,6 +1,20 @@
 
 #include "delta_delay_model.h"
 
+#include "compute_delta_delays_utils.h"
+
+void DeltaDelayModel::compute(RouterDelayProfiler& route_profiler,
+                              const t_placer_opts& placer_opts,
+                              const t_router_opts& router_opts,
+                              int longest_length) {
+    delays_ = compute_delta_delay_model(route_profiler,
+                                        placer_opts,
+                                        router_opts,
+                                        /*measure_directconnect=*/true,
+                                        longest_length,
+                                        is_flat_);
+}
+
 float DeltaDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pin*/,
                              const t_physical_tile_loc& to_loc, int /*to_pin*/) const {
     int delta_x = std::abs(from_loc.x - to_loc.x);
diff --git a/vpr/src/place/timing/delay_model/override_delay_model.cpp b/vpr/src/place/timing/delay_model/override_delay_model.cpp
index ceb8245511b..33106acb208 100644
--- a/vpr/src/place/timing/delay_model/override_delay_model.cpp
+++ b/vpr/src/place/timing/delay_model/override_delay_model.cpp
@@ -1,6 +1,8 @@
 
 #include "override_delay_model.h"
 
+#include "compute_delta_delays_utils.h"
+
 #ifdef VTR_ENABLE_CAPNPROTO
 #    include "capnp/serialize.h"
 #    include "place_delay_model.capnp.h"
@@ -9,6 +11,99 @@
 #    include "serdes_utils.h"
 #endif  // VTR_ENABLE_CAPNPROTO
 
+void OverrideDelayModel::compute(RouterDelayProfiler& route_profiler,
+                                 const t_placer_opts& placer_opts,
+                                 const t_router_opts& router_opts,
+                                 int longest_length) {
+    auto delays = compute_delta_delay_model(route_profiler,
+                                            placer_opts,
+                                            router_opts,
+                                            /*measure_directconnect=*/false,
+                                            longest_length,
+                                            is_flat_);
+
+    base_delay_model_ = std::make_unique<DeltaDelayModel>(cross_layer_delay_, delays, false);
+
+    compute_override_delay_model(route_profiler, router_opts);
+}
+
+void OverrideDelayModel::compute_override_delay_model(RouterDelayProfiler& route_profiler,
+                                                      const t_router_opts& router_opts) {
+    t_router_opts router_opts2 = router_opts; //local copy, so the caller's options are left unmodified
+    router_opts2.astar_fac = 0.f;
+    router_opts2.astar_offset = 0.f;
+
+    //Look at all the direct connections that exist, and add overrides to delay model
+    auto& device_ctx = g_vpr_ctx.device();
+    for (int idirect = 0; idirect < (int)device_ctx.arch->directs.size(); ++idirect) {
+        const t_direct_inf* direct = &device_ctx.arch->directs[idirect];
+
+        InstPort from_port = parse_inst_port(direct->from_pin);
+        InstPort to_port = parse_inst_port(direct->to_pin);
+
+        t_physical_tile_type_ptr from_type = find_tile_type_by_name(from_port.instance_name(), device_ctx.physical_tile_types);
+        t_physical_tile_type_ptr to_type = find_tile_type_by_name(to_port.instance_name(), device_ctx.physical_tile_types);
+
+        int num_conns = from_port.port_high_index() - from_port.port_low_index() + 1;
+        VTR_ASSERT_MSG(num_conns == to_port.port_high_index() - to_port.port_low_index() + 1, "Directs must have the same size to/from");
+
+        //We now walk through all the connections associated with the current direct specification, measure
+        //their delay and specify that value as an override in the delay model.
+        //
+        //Note that we need to check every connection in the direct to cover the case where the pins are not
+        //equivalent.
+        //
+        //However, if the from/to ports are equivalent we could end up sampling the same RR SOURCE/SINK
+        //paths multiple times (wasting CPU time) -- we avoid this by recording the sampled paths in
+        //sampled_rr_pairs and skipping them if they occur multiple times.
+        int missing_instances = 0;
+        int missing_paths = 0;
+        std::set<std::pair<RRNodeId, RRNodeId>> sampled_rr_pairs;
+        for (int iconn = 0; iconn < num_conns; ++iconn) {
+            //Find the associated pins
+            int from_pin = find_pin(from_type, from_port.port_name(), from_port.port_low_index() + iconn);
+            int to_pin = find_pin(to_type, to_port.port_name(), to_port.port_low_index() + iconn);
+
+            VTR_ASSERT(from_pin != OPEN);
+            VTR_ASSERT(to_pin != OPEN);
+
+            int from_pin_class = find_pin_class(from_type, from_port.port_name(), from_port.port_low_index() + iconn, DRIVER);
+            VTR_ASSERT(from_pin_class != OPEN);
+
+            int to_pin_class = find_pin_class(to_type, to_port.port_name(), to_port.port_low_index() + iconn, RECEIVER);
+            VTR_ASSERT(to_pin_class != OPEN);
+
+            bool found_sample_points;
+            RRNodeId src_rr, sink_rr;
+            found_sample_points = find_direct_connect_sample_locations(direct, from_type, from_pin, from_pin_class, to_type, to_pin, to_pin_class, src_rr, sink_rr);
+
+            if (!found_sample_points) {
+                ++missing_instances;
+                continue;
+            }
+
+            //If some of the source/sink ports are logically equivalent we may have already
+            //sampled the associated source/sink pair and don't need to do so again
+            if (sampled_rr_pairs.count({src_rr, sink_rr})) continue;
+
+            float direct_connect_delay = std::numeric_limits<float>::quiet_NaN(); //placeholder; only used below if calculate_delay() succeeds
+            bool found_routing_path = route_profiler.calculate_delay(src_rr, sink_rr, router_opts2, &direct_connect_delay);
+
+            if (found_routing_path) {
+                set_delay_override(from_type->index, from_pin_class, to_type->index, to_pin_class, direct->x_offset, direct->y_offset, direct_connect_delay);
+            } else {
+                ++missing_paths;
+            }
+
+            //Record that we've sampled this pair of source and sink nodes (whether or not a routing path was found)
+            sampled_rr_pairs.insert({src_rr, sink_rr});
+        }
+
+        VTR_LOGV_WARN(missing_instances > 0, "Found no delta delay for %d bits of inter-block direct connect '%s' (no instances of this direct found)\n", missing_instances, direct->name.c_str());
+        VTR_LOGV_WARN(missing_paths > 0, "Found no delta delay for %d bits of inter-block direct connect '%s' (no routing path found)\n", missing_paths, direct->name.c_str());
+    }
+}
+
 const DeltaDelayModel* OverrideDelayModel::base_delay_model() const {
     return base_delay_model_.get();
 }
diff --git a/vpr/src/place/timing_place_lookup.cpp b/vpr/src/place/timing_place_lookup.cpp
index 76b06bbc55b..21ff6d69cc6 100644
--- a/vpr/src/place/timing_place_lookup.cpp
+++ b/vpr/src/place/timing_place_lookup.cpp
@@ -47,107 +47,8 @@ constexpr float IMPOSSIBLE_DELTA = std::numeric_limits<float>::infinity(); //Ind
 static t_chan_width setup_chan_width(const t_router_opts& router_opts,
                                      t_chan_width_dist chan_width_dist);
 
-static float route_connection_delay(
-    RouterDelayProfiler& route_profiler,
-    int from_layer_num,
-    int to_layer_num,
-    int source_x_loc,
-    int source_y_loc,
-    int sink_x_loc,
-    int sink_y_loc,
-    const t_router_opts& router_opts,
-    bool measure_directconnect);
-
-// Prototype for computing delta delay matrix.
-typedef std::function<void(
-    RouterDelayProfiler&,
-    vtr::Matrix<std::vector<float>>&,
-    int,
-    int,
-    int,
-    int,
-    int,
-    int,
-    int,
-    int,
-    const t_router_opts&,
-    bool,
-    const std::set<std::string>&,
-    bool)>
-    t_compute_delta_delay_matrix;
-
-static void generic_compute_matrix_iterative_astar(
-    RouterDelayProfiler& route_profiler,
-    vtr::Matrix<std::vector<float>>& matrix,
-    int from_layer_num,
-    int to_layer_num,
-    int source_x,
-    int source_y,
-    int start_x,
-    int start_y,
-    int end_x,
-    int end_y,
-    const t_router_opts& router_opts,
-    bool measure_directconnect,
-    const std::set<std::string>& allowed_types,
-    bool /***/);
-
-static void generic_compute_matrix_dijkstra_expansion(
-    RouterDelayProfiler& route_profiler,
-    vtr::Matrix<std::vector<float>>& matrix,
-    int from_layer_num,
-    int to_layer_num,
-    int source_x,
-    int source_y,
-    int start_x,
-    int start_y,
-    int end_x,
-    int end_y,
-    const t_router_opts& router_opts,
-    bool measure_directconnect,
-    const std::set<std::string>& allowed_types,
-    bool is_flat);
-
-static vtr::NdMatrix<float, 4> compute_delta_delays(
-    RouterDelayProfiler& route_profiler,
-    const t_placer_opts& palcer_opts,
-    const t_router_opts& router_opts,
-    bool measure_directconnect,
-    size_t longest_length,
-    bool is_flat);
-
-float delay_reduce(std::vector<float>& delays, e_reducer reducer);
-
-static vtr::NdMatrix<float, 4> compute_delta_delay_model(
-    RouterDelayProfiler& route_profiler,
-    const t_placer_opts& placer_opts,
-    const t_router_opts& router_opts,
-    bool measure_directconnect,
-    int longest_length,
-    bool is_flat);
-
-static bool find_direct_connect_sample_locations(const t_direct_inf* direct,
-                                                 t_physical_tile_type_ptr from_type,
-                                                 int from_pin,
-                                                 int from_pin_class,
-                                                 t_physical_tile_type_ptr to_type,
-                                                 int to_pin,
-                                                 int to_pin_class,
-                                                 RRNodeId& out_src_node,
-                                                 RRNodeId& out_sink_node);
-
-static bool verify_delta_delays(const vtr::NdMatrix<float, 4>& delta_delays);
-
 static int get_longest_segment_length(std::vector<t_segment_inf>& segment_inf);
 
-static void fix_empty_coordinates(vtr::NdMatrix<float, 4>& delta_delays);
-static void fix_uninitialized_coordinates(vtr::NdMatrix<float, 4>& delta_delays);
-
-static float find_neighboring_average(vtr::NdMatrix<float, 4>& matrix,
-                                      int from_layer,
-                                      t_physical_tile_loc to_tile_loc,
-                                      int max_distance);
-
 /******* Globally Accessible Functions **********/
 
 std::unique_ptr<PlaceDelayModel> compute_place_delay_model(const t_placer_opts& placer_opts,
@@ -205,33 +106,6 @@ std::unique_ptr<PlaceDelayModel> compute_place_delay_model(const t_placer_opts&
     return place_delay_model;
 }
 
-void DeltaDelayModel::compute(RouterDelayProfiler& route_profiler,
-                              const t_placer_opts& placer_opts,
-                              const t_router_opts& router_opts,
-                              int longest_length) {
-    delays_ = compute_delta_delay_model(
-        route_profiler,
-        placer_opts, router_opts, /*measure_directconnect=*/true,
-        longest_length,
-        is_flat_);
-}
-
-void OverrideDelayModel::compute(
-    RouterDelayProfiler& route_profiler,
-    const t_placer_opts& placer_opts,
-    const t_router_opts& router_opts,
-    int longest_length) {
-    auto delays = compute_delta_delay_model(
-        route_profiler,
-        placer_opts, router_opts, /*measure_directconnect=*/false,
-        longest_length,
-        is_flat_);
-
-    base_delay_model_ = std::make_unique<DeltaDelayModel>(cross_layer_delay_, delays, false);
-
-    compute_override_delay_model(route_profiler, router_opts);
-}
-
 /******* File Accessible Functions **********/
 
 std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type) {
@@ -334,902 +208,6 @@ static t_chan_width setup_chan_width(const t_router_opts& router_opts,
     return init_chan(width_fac, chan_width_dist, graph_directionality);
 }
 
-static float route_connection_delay(
-    RouterDelayProfiler& route_profiler,
-    int from_layer_num,
-    int to_layer_num,
-    int source_x,
-    int source_y,
-    int sink_x,
-    int sink_y,
-    const t_router_opts& router_opts,
-    bool measure_directconnect) {
-    //Routes between the source and sink locations and calculates the delay
-
-    float net_delay_value = IMPOSSIBLE_DELTA; /*set to known value for debug purposes */
-
-    auto& device_ctx = g_vpr_ctx.device();
-
-    bool successfully_routed = false;
-
-    //Get the rr nodes to route between
-    auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num}));
-    auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num}));
-
-    for (int driver_ptc : best_driver_ptcs) {
-        VTR_ASSERT(driver_ptc != OPEN);
-        RRNodeId source_rr_node = device_ctx.rr_graph.node_lookup().find_node(from_layer_num, source_x, source_y, SOURCE, driver_ptc);
-
-        VTR_ASSERT(source_rr_node != RRNodeId::INVALID());
-
-        for (int sink_ptc : best_sink_ptcs) {
-            VTR_ASSERT(sink_ptc != OPEN);
-            RRNodeId sink_rr_node = device_ctx.rr_graph.node_lookup().find_node(to_layer_num, sink_x, sink_y, SINK, sink_ptc);
-
-            if (sink_rr_node == RRNodeId::INVALID())
-                continue;
-
-            if (!measure_directconnect && directconnect_exists(source_rr_node, sink_rr_node)) {
-                //Skip if we shouldn't measure direct connects and a direct connect exists
-                continue;
-            }
-
-            {
-                successfully_routed = route_profiler.calculate_delay(
-                    source_rr_node, sink_rr_node,
-                    router_opts,
-                    &net_delay_value);
-            }
-
-            if (successfully_routed) break;
-        }
-        if (successfully_routed) break;
-    }
-
-    if (!successfully_routed) {
-        VTR_LOG_WARN("Unable to route between blocks at (%d,%d,%d) and (%d,%d,%d) to characterize delay (setting to %g)\n",
-                     source_x, source_y, from_layer_num, sink_x, sink_y, to_layer_num, net_delay_value);
-    }
-
-    return (net_delay_value);
-}
-
-static void add_delay_to_matrix(
-    vtr::Matrix<std::vector<float>>* matrix,
-    int delta_x,
-    int delta_y,
-    float delay) {
-    if ((*matrix)[delta_x][delta_y].size() == 1 && (*matrix)[delta_x][delta_y][0] == EMPTY_DELTA) {
-        //Overwrite empty delta
-        (*matrix)[delta_x][delta_y][0] = delay;
-    } else {
-        //Collect delta
-        (*matrix)[delta_x][delta_y].push_back(delay);
-    }
-}
-
-static void generic_compute_matrix_dijkstra_expansion(
-    RouterDelayProfiler& /*route_profiler*/,
-    vtr::Matrix<std::vector<float>>& matrix,
-    int from_layer_num,
-    int to_layer_num,
-    int source_x,
-    int source_y,
-    int start_x,
-    int start_y,
-    int end_x,
-    int end_y,
-    const t_router_opts& router_opts,
-    bool measure_directconnect,
-    const std::set<std::string>& allowed_types,
-    bool is_flat) {
-    auto& device_ctx = g_vpr_ctx.device();
-
-    t_physical_tile_type_ptr src_type = device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num});
-    bool is_allowed_type = allowed_types.empty() || allowed_types.find(src_type->name) != allowed_types.end();
-    if (src_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE || !is_allowed_type) {
-        for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
-            for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
-                int delta_x = abs(sink_x - source_x);
-                int delta_y = abs(sink_y - source_y);
-
-                if (matrix[delta_x][delta_y].empty()) {
-                    //Only set empty target if we don't already have a valid delta delay
-                    matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
-#ifdef VERBOSE
-                    VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
-                            "EMPTY",
-                            delta_x, delta_y,
-                            source_x, source_y,
-                            sink_x, sink_y);
-#endif
-                }
-            }
-        }
-
-        return;
-    }
-
-    vtr::Matrix<bool> found_matrix({matrix.dim_size(0), matrix.dim_size(1)}, false);
-
-    auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num}));
-    for (int driver_ptc : best_driver_ptcs) {
-        VTR_ASSERT(driver_ptc != OPEN);
-        RRNodeId source_rr_node = device_ctx.rr_graph.node_lookup().find_node(from_layer_num, source_x, source_y, SOURCE, driver_ptc);
-
-        VTR_ASSERT(source_rr_node != RRNodeId::INVALID());
-        auto delays = calculate_all_path_delays_from_rr_node(source_rr_node, router_opts, is_flat);
-
-        bool path_to_all_sinks = true;
-        for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
-            for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
-                int delta_x = abs(sink_x - source_x);
-                int delta_y = abs(sink_y - source_y);
-
-                if (found_matrix[delta_x][delta_y]) {
-                    continue;
-                }
-
-                t_physical_tile_type_ptr sink_type = device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num});
-                if (sink_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
-                    if (matrix[delta_x][delta_y].empty()) {
-                        //Only set empty target if we don't already have a valid delta delay
-                        matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
-#ifdef VERBOSE
-                        VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
-                                "EMPTY",
-                                delta_x, delta_y,
-                                source_x, source_y,
-                                sink_x, sink_y);
-#endif
-                        found_matrix[delta_x][delta_y] = true;
-                    }
-                } else {
-                    bool found_a_sink = false;
-                    auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num}));
-                    for (int sink_ptc : best_sink_ptcs) {
-                        VTR_ASSERT(sink_ptc != OPEN);
-                        RRNodeId sink_rr_node = device_ctx.rr_graph.node_lookup().find_node(to_layer_num, sink_x, sink_y, SINK, sink_ptc);
-
-                        if (sink_rr_node == RRNodeId::INVALID())
-                            continue;
-
-                        if (!measure_directconnect && directconnect_exists(source_rr_node, sink_rr_node)) {
-                            //Skip if we shouldn't measure direct connects and a direct connect exists
-                            continue;
-                        }
-
-                        if (std::isnan(delays[sink_rr_node])) {
-                            // This sink was not found
-                            continue;
-                        }
-
-#ifdef VERBOSE
-                        VTR_LOG("Computed delay: %12g delta: %d,%d (src: %d,%d sink: %d,%d)\n",
-                                delays[size_t(sink_rr_node)],
-                                delta_x, delta_y,
-                                source_x, source_y,
-                                sink_x, sink_y);
-#endif
-                        found_matrix[delta_x][delta_y] = true;
-
-                        add_delay_to_matrix(&matrix, delta_x, delta_y, delays[sink_rr_node]);
-
-                        found_a_sink = true;
-                        break;
-                    }
-
-                    if (!found_a_sink) {
-                        path_to_all_sinks = false;
-                    }
-                }
-            }
-        }
-
-        if (path_to_all_sinks) {
-            break;
-        }
-    }
-
-    for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
-        for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
-            int delta_x = abs(sink_x - source_x);
-            int delta_y = abs(sink_y - source_y);
-            if (!found_matrix[delta_x][delta_y]) {
-                add_delay_to_matrix(&matrix, delta_x, delta_y, IMPOSSIBLE_DELTA);
-                VTR_LOG_WARN("Unable to route between blocks at (%d,%d,%d) and (%d,%d,%d) to characterize delay (setting to %g)\n",
-                             source_x,
-                             source_y,
-                             from_layer_num,
-                             sink_x, 
-                             sink_y,
-                             to_layer_num,
-                             IMPOSSIBLE_DELTA);
-            }
-        }
-    }
-}
-
-static void generic_compute_matrix_iterative_astar(
-    RouterDelayProfiler& route_profiler,
-    vtr::Matrix<std::vector<float>>& matrix,
-    int from_layer_num,
-    int to_layer_num,
-    int source_x,
-    int source_y,
-    int start_x,
-    int start_y,
-    int end_x,
-    int end_y,
-    const t_router_opts& router_opts,
-    bool measure_directconnect,
-    const std::set<std::string>& allowed_types,
-    bool /***/) {
-    //vtr::ScopedStartFinishTimer t(vtr::string_fmt("Profiling from (%d,%d)", source_x, source_y));
-
-    int delta_x, delta_y;
-    int sink_x, sink_y;
-
-    auto& device_ctx = g_vpr_ctx.device();
-
-    for (sink_x = start_x; sink_x <= end_x; sink_x++) {
-        for (sink_y = start_y; sink_y <= end_y; sink_y++) {
-            delta_x = abs(sink_x - source_x);
-            delta_y = abs(sink_y - source_y);
-
-            t_physical_tile_type_ptr src_type = device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num});
-            t_physical_tile_type_ptr sink_type = device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num});
-
-            bool src_or_target_empty = (src_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE
-                                        || sink_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE);
-
-            bool is_allowed_type = allowed_types.empty() || allowed_types.find(src_type->name) != allowed_types.end();
-
-            if (src_or_target_empty || !is_allowed_type) {
-                if (matrix[delta_x][delta_y].empty()) {
-                    //Only set empty target if we don't already have a valid delta delay
-                    matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
-#ifdef VERBOSE
-                    VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
-                            "EMPTY",
-                            delta_x, delta_y,
-                            source_x, source_y,
-                            sink_x, sink_y);
-#endif
-                }
-            } else {
-                //Valid start/end
-
-                float delay = route_connection_delay(route_profiler,
-                                                     from_layer_num,
-                                                     to_layer_num,
-                                                     source_x,
-                                                     source_y,
-                                                     sink_x,
-                                                     sink_y,
-                                                     router_opts,
-                                                     measure_directconnect);
-
-#ifdef VERBOSE
-                VTR_LOG("Computed delay: %12g delta: %d,%d (src: %d,%d sink: %d,%d)\n",
-                        delay,
-                        delta_x, delta_y,
-                        source_x, source_y,
-                        sink_x, sink_y);
-#endif
-                if (matrix[delta_x][delta_y].size() == 1 && matrix[delta_x][delta_y][0] == EMPTY_DELTA) {
-                    //Overwrite empty delta
-                    matrix[delta_x][delta_y][0] = delay;
-                } else {
-                    //Collect delta
-                    matrix[delta_x][delta_y].push_back(delay);
-                }
-            }
-        }
-    }
-}
-
-static vtr::NdMatrix<float, 4> compute_delta_delays(
-    RouterDelayProfiler& route_profiler,
-    const t_placer_opts& placer_opts,
-    const t_router_opts& router_opts,
-    bool measure_directconnect,
-    size_t longest_length,
-    bool is_flat) {
-    //To avoid edge effects we place the source at least 'longest_length' away
-    //from the device edge
-    //and route from there for all possible delta values < dimension
-
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& grid = device_ctx.grid;
-
-    vtr::NdMatrix<float, 4> delta_delays({static_cast<unsigned long>(grid.get_num_layers()), static_cast<unsigned long>(grid.get_num_layers()), grid.width(), grid.height()});
-
-    for (int from_layer_num = 0; from_layer_num < grid.get_num_layers(); from_layer_num++) {
-        for (int to_layer_num = 0; to_layer_num < grid.get_num_layers(); to_layer_num++) {
-            vtr::NdMatrix<std::vector<float>, 2> sampled_delta_delays({grid.width(), grid.height()});
-
-            size_t mid_x = vtr::nint(grid.width() / 2);
-            size_t mid_y = vtr::nint(grid.height() / 2);
-
-            size_t low_x = std::min(longest_length, mid_x);
-            size_t low_y = std::min(longest_length, mid_y);
-            size_t high_x = mid_x;
-            size_t high_y = mid_y;
-            if (longest_length <= grid.width()) {
-                high_x = std::max(grid.width() - longest_length, mid_x);
-            }
-            if (longest_length <= grid.height()) {
-                high_y = std::max(grid.height() - longest_length, mid_y);
-            }
-
-            std::set<std::string> allowed_types;
-            if (!placer_opts.allowed_tiles_for_delay_model.empty()) {
-                auto allowed_types_vector = vtr::split(placer_opts.allowed_tiles_for_delay_model, ",");
-                for (const auto& type : allowed_types_vector) {
-                    allowed_types.insert(type);
-                }
-            }
-
-            //   +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-            //   +                 |                       |               +
-            //   +        A        |           B           |       C       +
-            //   +                 |                       |               +
-            //   +-----------------\-----------------------.---------------+
-            //   +                 |                       |               +
-            //   +                 |                       |               +
-            //   +                 |                       |               +
-            //   +                 |                       |               +
-            //   +        D        |           E           |       F       +
-            //   +                 |                       |               +
-            //   +                 |                       |               +
-            //   +                 |                       |               +
-            //   +                 |                       |               +
-            //   +-----------------*-----------------------/---------------+
-            //   +                 |                       |               +
-            //   +        G        |           H           |       I       +
-            //   +                 |                       |               +
-            //   +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-            //
-            //   * = (low_x, low_y)
-            //   . = (high_x, high_y)
-            //   / = (high_x, low_y)
-            //   \ = (low_x, high_y)
-            //   + = device edge
-
-            //Find the lowest y location on the left edge with a non-empty block
-            int y = 0;
-            int x = 0;
-            t_physical_tile_type_ptr src_type = nullptr;
-            for (x = 0; x < (int)grid.width(); ++x) {
-                for (y = 0; y < (int)grid.height(); ++y) {
-                    auto type = grid.get_physical_type({x, y, from_layer_num});
-
-                    if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
-                        if (!allowed_types.empty() && allowed_types.find(type->name) == allowed_types.end()) {
-                            continue;
-                        }
-                        src_type = type;
-                        break;
-                    }
-                }
-                if (src_type) {
-                    break;
-                }
-            }
-            VTR_ASSERT(src_type != nullptr);
-
-            t_compute_delta_delay_matrix generic_compute_matrix;
-            switch (placer_opts.place_delta_delay_matrix_calculation_method) {
-                case e_place_delta_delay_algorithm::ASTAR_ROUTE:
-                    generic_compute_matrix = generic_compute_matrix_iterative_astar;
-                    break;
-                case e_place_delta_delay_algorithm::DIJKSTRA_EXPANSION:
-                    generic_compute_matrix = generic_compute_matrix_dijkstra_expansion;
-                    break;
-                default:
-                    VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Unknown place_delta_delay_matrix_calculation_method %d", placer_opts.place_delta_delay_matrix_calculation_method);
-            }
-
-#ifdef VERBOSE
-            VTR_LOG("Computing from lower left edge (%d,%d):\n", x, y);
-#endif
-            generic_compute_matrix(route_profiler, sampled_delta_delays,
-                                   from_layer_num, to_layer_num,
-                                   x, y,
-                                   x, y,
-                                   grid.width() - 1, grid.height() - 1,
-                                   router_opts,
-                                   measure_directconnect, allowed_types,
-                                   is_flat);
-
-            //Find the lowest x location on the bottom edge with a non-empty block
-            src_type = nullptr;
-            for (y = 0; y < (int)grid.height(); ++y) {
-                for (x = 0; x < (int)grid.width(); ++x) {
-                    auto type = grid.get_physical_type({x, y, from_layer_num});
-
-                    if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
-                        if (!allowed_types.empty() && allowed_types.find(type->name) == allowed_types.end()) {
-                            continue;
-                        }
-                        src_type = type;
-                        break;
-                    }
-                }
-                if (src_type) {
-                    break;
-                }
-            }
-            VTR_ASSERT(src_type != nullptr);
-#ifdef VERBOSE
-            VTR_LOG("Computing from left bottom edge (%d,%d):\n", x, y);
-#endif
-            generic_compute_matrix(route_profiler, sampled_delta_delays,
-                                   from_layer_num, to_layer_num,
-                                   x, y,
-                                   x, y,
-                                   grid.width() - 1, grid.height() - 1,
-                                   router_opts,
-                                   measure_directconnect, allowed_types,
-                                   is_flat);
-
-            //Since the other delta delay values may have suffered from edge effects,
-            //we recalculate deltas within regions B, C, E, F
-#ifdef VERBOSE
-            VTR_LOG("Computing from low/low:\n");
-#endif
-            generic_compute_matrix(route_profiler, sampled_delta_delays,
-                                   from_layer_num, to_layer_num,
-                                   low_x, low_y,
-                                   low_x, low_y,
-                                   grid.width() - 1, grid.height() - 1,
-                                   router_opts,
-                                   measure_directconnect, allowed_types,
-                                   is_flat);
-
-            //Since the other delta delay values may have suffered from edge effects,
-            //we recalculate deltas within regions D, E, G, H
-#ifdef VERBOSE
-            VTR_LOG("Computing from high/high:\n");
-#endif
-            generic_compute_matrix(route_profiler, sampled_delta_delays,
-                                   from_layer_num, to_layer_num,
-                                   high_x, high_y,
-                                   0, 0,
-                                   high_x, high_y,
-                                   router_opts,
-                                   measure_directconnect, allowed_types,
-                                   is_flat);
-
-            //Since the other delta delay values may have suffered from edge effects,
-            //we recalculate deltas within regions A, B, D, E
-#ifdef VERBOSE
-            VTR_LOG("Computing from high/low:\n");
-#endif
-            generic_compute_matrix(route_profiler, sampled_delta_delays,
-                                   from_layer_num, to_layer_num,
-                                   high_x, low_y,
-                                   0, low_y,
-                                   high_x, grid.height() - 1,
-                                   router_opts,
-                                   measure_directconnect, allowed_types,
-                                   is_flat);
-
-            //Since the other delta delay values may have suffered from edge effects,
-            //we recalculate deltas within regions E, F, H, I
-#ifdef VERBOSE
-            VTR_LOG("Computing from low/high:\n");
-#endif
-            generic_compute_matrix(route_profiler, sampled_delta_delays,
-                                   from_layer_num, to_layer_num,
-                                   low_x, high_y,
-                                   low_x, 0,
-                                   grid.width() - 1, high_y,
-                                   router_opts,
-                                   measure_directconnect, allowed_types,
-                                   is_flat);
-            for (size_t dx = 0; dx < sampled_delta_delays.dim_size(0); ++dx) {
-                for (size_t dy = 0; dy < sampled_delta_delays.dim_size(1); ++dy) {
-                    delta_delays[from_layer_num][to_layer_num][dx][dy] = delay_reduce(sampled_delta_delays[dx][dy], placer_opts.delay_model_reducer);
-                }
-            }
-        }
-    }
-
-    return delta_delays;
-}
-
-float delay_reduce(std::vector<float>& delays, e_reducer reducer) {
-    if (delays.empty()) {
-        return IMPOSSIBLE_DELTA;
-    } else if (delays.size() == 1) {
-        return delays[0];
-    }
-
-    VTR_ASSERT(delays.size() > 1);
-
-    float delay;
-
-    if (reducer == e_reducer::MIN) {
-        auto itr = std::min_element(delays.begin(), delays.end());
-        delay = *itr;
-    } else if (reducer == e_reducer::MAX) {
-        auto itr = std::max_element(delays.begin(), delays.end());
-        delay = *itr;
-    } else if (reducer == e_reducer::MEDIAN) {
-        std::stable_sort(delays.begin(), delays.end());
-        delay = vtr::median(delays.begin(), delays.end());
-    } else if (reducer == e_reducer::ARITHMEAN) {
-        delay = vtr::arithmean(delays.begin(), delays.end());
-    } else if (reducer == e_reducer::GEOMEAN) {
-        delay = vtr::geomean(delays.begin(), delays.end());
-    } else {
-        VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Unrecognized delta delay reducer");
-    }
-
-    return delay;
-}
-
-/* We return the average placement estimated delay for a routing spanning (x,y).
- * We start with an averaging distance of 1 (i.e. from (x-1,y-1) to (x+1,y+1))
- * and look for legal delay values to average; if some are found we return the
- * average and if none are found we increase the distance to average over.
- *
- * If no legal values are found to average over with a range of max_distance,
- * we return IMPOSSIBLE_DELTA.
- */
-static float find_neighboring_average(
-    vtr::NdMatrix<float, 4>& matrix,
-    int from_layer,
-    t_physical_tile_loc to_tile_loc,
-    int max_distance) {
-    float sum = 0;
-    int counter = 0;
-    int endx = matrix.end_index(2);
-    int endy = matrix.end_index(3);
-
-    int x = to_tile_loc.x;
-    int y = to_tile_loc.y;
-    int to_layer = to_tile_loc.layer_num;
-
-    for (int distance = 1; distance <= max_distance; ++distance) {
-        for (int delx = x - distance; delx <= x + distance; delx++) {
-            for (int dely = y - distance; dely <= y + distance; dely++) {
-                // Check distance constraint
-                if (abs(delx - x) + abs(dely - y) > distance) {
-                    continue;
-                }
-
-                //check out of bounds
-                if (delx < 0 || dely < 0 || delx >= endx || dely >= endy || (delx == x && dely == y)) {
-                    continue;
-                }
-
-                if (matrix[from_layer][to_layer][delx][dely] == EMPTY_DELTA || matrix[from_layer][to_layer][delx][dely] == IMPOSSIBLE_DELTA) {
-                    continue;
-                }
-                counter++;
-                sum += matrix[from_layer][to_layer][delx][dely];
-            }
-        }
-        if (counter != 0) {
-            return sum / (float)counter;
-        }
-    }
-
-    return IMPOSSIBLE_DELTA;
-}
-
-static void fix_empty_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
-    // Set any empty delta's to the average of it's neighbours
-    //
-    // Empty coordinates may occur if the sampling location happens to not have
-    // a connection at that location.  However a more through sampling likely
-    // would return a result, so we fill in the empty holes with a small
-    // neighbour average.
-    constexpr int kMaxAverageDistance = 2;
-    for (int from_layer = 0; from_layer < (int)delta_delays.dim_size(0); ++from_layer) {
-        for (int to_layer = 0; to_layer < (int)delta_delays.dim_size(1); ++to_layer) {
-            for (int delta_x = 0; delta_x < (int)delta_delays.dim_size(2); ++delta_x) {
-                for (int delta_y = 0; delta_y < (int)delta_delays.dim_size(3); ++delta_y) {
-                    if (delta_delays[from_layer][to_layer][delta_x][delta_y] == EMPTY_DELTA) {
-                        delta_delays[from_layer][to_layer][delta_x][delta_y] =
-                            find_neighboring_average(delta_delays,
-                                                     from_layer,
-                                                     {delta_x, delta_y, to_layer},
-                                                     kMaxAverageDistance);
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void fix_uninitialized_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
-    // Set any empty delta's to the average of it's neighbours
-
-    for (size_t from_layer_num = 0; from_layer_num < delta_delays.dim_size(0); ++from_layer_num) {
-        for (size_t to_layer_num = 0; to_layer_num < delta_delays.dim_size(1); ++to_layer_num) {
-            for (size_t delta_x = 0; delta_x < delta_delays.dim_size(2); ++delta_x) {
-                for (size_t delta_y = 0; delta_y < delta_delays.dim_size(3); ++delta_y) {
-                    if (delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] == UNINITIALIZED_DELTA) {
-                        delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] = IMPOSSIBLE_DELTA;
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void fill_impossible_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
-    // Set any impossible delta's to the average of its neighbours
-    //
-    // Impossible coordinates may occur if an IPIN cannot be reached from the
-    // sampling OPIN.  This might occur if the IPIN or OPIN used for sampling
-    // is specialized, and therefore cannot be reached via the by the pins
-    // sampled.  Leaving this value in the delay matrix will result in invalid
-    // slacks if the delay matrix uses this value.
-    //
-    // A max average distance of 5 is used to provide increased effort in
-    // filling these gaps.  It is more important to have a poor predication,
-    // than an invalid value and causing a slack assertion.
-    constexpr int kMaxAverageDistance = 5;
-    for (int from_layer_num = 0; from_layer_num < (int)delta_delays.dim_size(0); ++from_layer_num) {
-        for (int to_layer_num = 0; to_layer_num < (int)delta_delays.dim_size(1); ++to_layer_num) {
-            for (int delta_x = 0; delta_x < (int)delta_delays.dim_size(2); ++delta_x) {
-                for (int delta_y = 0; delta_y < (int)delta_delays.dim_size(3); ++delta_y) {
-                    if (delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] == IMPOSSIBLE_DELTA) {
-                        delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] = find_neighboring_average(
-                            delta_delays, from_layer_num, {delta_x, delta_y, to_layer_num}, kMaxAverageDistance);
-                    }
-                }
-            }
-        }
-    }
-}
-
-static vtr::NdMatrix<float, 4> compute_delta_delay_model(
-    RouterDelayProfiler& route_profiler,
-    const t_placer_opts& placer_opts,
-    const t_router_opts& router_opts,
-    bool measure_directconnect,
-    int longest_length,
-    bool is_flat) {
-    vtr::ScopedStartFinishTimer timer("Computing delta delays");
-    vtr::NdMatrix<float, 4> delta_delays = compute_delta_delays(route_profiler,
-                                                                placer_opts,
-                                                                router_opts,
-                                                                measure_directconnect,
-                                                                longest_length,
-                                                                is_flat);
-
-    fix_uninitialized_coordinates(delta_delays);
-
-    fix_empty_coordinates(delta_delays);
-
-    fill_impossible_coordinates(delta_delays);
-
-    verify_delta_delays(delta_delays);
-
-    return delta_delays;
-}
-
-
-
-//Finds a src_rr and sink_rr appropriate for measuring the delay of the current direct specification
-static bool find_direct_connect_sample_locations(const t_direct_inf* direct,
-                                                 t_physical_tile_type_ptr from_type,
-                                                 int from_pin,
-                                                 int from_pin_class,
-                                                 t_physical_tile_type_ptr to_type,
-                                                 int to_pin,
-                                                 int to_pin_class,
-                                                 RRNodeId& out_src_node,
-                                                 RRNodeId& out_sink_node) {
-    VTR_ASSERT(from_type != nullptr);
-    VTR_ASSERT(to_type != nullptr);
-
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& grid = device_ctx.grid;
-    const auto& node_lookup = device_ctx.rr_graph.node_lookup();
-
-    //Search the grid for an instance of from/to blocks which satisfy this direct connect offsets,
-    //and which has the appropriate pins
-    int from_x = -1;
-    int from_y = -1;
-    int from_sub_tile = -1;
-    int to_x = 0, to_y = 0, to_sub_tile = 0;
-    bool found = false;
-    int found_layer_num = -1;
-    //TODO: Function *FOR NOW* assumes that from/to blocks are at same die and have a same layer nums
-    for (int layer_num = 0; layer_num < grid.get_num_layers() && !found; ++layer_num) {
-        for (int x = 0; x < (int)grid.width() && !found; ++x) {
-            to_x = x + direct->x_offset;
-            if (to_x < 0 || to_x >= (int)grid.width()) continue;
-
-            for (int y = 0; y < (int)grid.height() && !found; ++y) {
-                if (grid.get_physical_type({x, y, layer_num}) != from_type) continue;
-
-                //Check that the from pin exists at this from location
-                //(with multi-width/height blocks pins may not exist at all locations)
-                bool from_pin_found = false;
-                if (direct->from_side != NUM_2D_SIDES) {
-                    RRNodeId from_pin_rr = node_lookup.find_node(layer_num, x, y, OPIN, from_pin, direct->from_side);
-                    from_pin_found = from_pin_rr.is_valid();
-                } else {
-                    from_pin_found = !(node_lookup.find_nodes_at_all_sides(layer_num, x, y, OPIN, from_pin).empty());
-                }
-                if (!from_pin_found) continue;
-
-                to_y = y + direct->y_offset;
-
-                if (to_y < 0 || to_y >= (int)grid.height()) continue;
-                if (grid.get_physical_type({to_x, to_y, layer_num}) != to_type) continue;
-
-                //Check that the from pin exists at this from location
-                //(with multi-width/height blocks pins may not exist at all locations)
-                bool to_pin_found = false;
-                if (direct->to_side != NUM_2D_SIDES) {
-                    RRNodeId to_pin_rr = node_lookup.find_node(layer_num, to_x, to_y, IPIN, to_pin, direct->to_side);
-                    to_pin_found = (to_pin_rr != RRNodeId::INVALID());
-                } else {
-                    to_pin_found = !(node_lookup.find_nodes_at_all_sides(layer_num, to_x, to_y, IPIN, to_pin).empty());
-                }
-                if (!to_pin_found) continue;
-
-                for (int sub_tile_num = 0; sub_tile_num < from_type->capacity; ++sub_tile_num) {
-                    to_sub_tile = sub_tile_num + direct->sub_tile_offset;
-
-                    if (to_sub_tile < 0 || to_sub_tile >= to_type->capacity) continue;
-
-                    found = true;
-                    found_layer_num = layer_num;
-                    from_x = x;
-                    from_y = y;
-                    from_sub_tile = sub_tile_num;
-
-                    break;
-                }
-            }
-        }
-    }
-
-    if (!found) {
-        return false;
-    }
-
-    //Now have a legal instance of this direct connect
-    VTR_ASSERT(grid.get_physical_type({from_x, from_y, found_layer_num}) == from_type);
-    VTR_ASSERT(from_sub_tile < from_type->capacity);
-
-    VTR_ASSERT(grid.get_physical_type({to_x, to_y, found_layer_num}) == to_type);
-    VTR_ASSERT(to_sub_tile < to_type->capacity);
-
-    VTR_ASSERT(from_x + direct->x_offset == to_x);
-    VTR_ASSERT(from_y + direct->y_offset == to_y);
-    VTR_ASSERT(from_sub_tile + direct->sub_tile_offset == to_sub_tile);
-
-    //
-    //Find a source/sink RR node associated with the pins of the direct
-    //
-
-    {
-        RRNodeId src_rr_candidate = node_lookup.find_node(found_layer_num, from_x, from_y, SOURCE, from_pin_class);
-        VTR_ASSERT(src_rr_candidate);
-        out_src_node = src_rr_candidate;
-    }
-
-    {
-        RRNodeId sink_rr_candidate = node_lookup.find_node(found_layer_num, to_x, to_y, SINK, to_pin_class);
-        VTR_ASSERT(sink_rr_candidate);
-        out_sink_node = sink_rr_candidate;
-    }
-
-    return true;
-}
-
-static bool verify_delta_delays(const vtr::NdMatrix<float, 4>& delta_delays) {
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& grid = device_ctx.grid;
-
-    for (int from_layer_num = 0; from_layer_num < grid.get_num_layers(); ++from_layer_num) {
-        for (int to_layer_num = 0; to_layer_num < grid.get_num_layers(); ++to_layer_num) {
-            for (size_t x = 0; x < grid.width(); ++x) {
-                for (size_t y = 0; y < grid.height(); ++y) {
-                    float delta_delay = delta_delays[from_layer_num][to_layer_num][x][y];
-
-                    if (delta_delay < 0.) {
-                        VPR_ERROR(VPR_ERROR_PLACE,
-                                  "Found invaild negative delay %g for delta [%d,%d,%d,%d]",
-                                  delta_delay, from_layer_num, to_layer_num, x, y);
-                    }
-                }
-            }
-        }
-    }
-
-    return true;
-}
-
-void OverrideDelayModel::compute_override_delay_model(
-    RouterDelayProfiler& route_profiler,
-    const t_router_opts& router_opts) {
-    t_router_opts router_opts2 = router_opts;
-    router_opts2.astar_fac = 0.f;
-    router_opts2.astar_offset = 0.f;
-
-    //Look at all the direct connections that exist, and add overrides to delay model
-    auto& device_ctx = g_vpr_ctx.device();
-    for (int idirect = 0; idirect < (int)device_ctx.arch->directs.size(); ++idirect) {
-        const t_direct_inf* direct = &device_ctx.arch->directs[idirect];
-
-        InstPort from_port = parse_inst_port(direct->from_pin);
-        InstPort to_port = parse_inst_port(direct->to_pin);
-
-        t_physical_tile_type_ptr from_type = find_tile_type_by_name(from_port.instance_name(), device_ctx.physical_tile_types);
-        t_physical_tile_type_ptr to_type = find_tile_type_by_name(to_port.instance_name(), device_ctx.physical_tile_types);
-
-        int num_conns = from_port.port_high_index() - from_port.port_low_index() + 1;
-        VTR_ASSERT_MSG(num_conns == to_port.port_high_index() - to_port.port_low_index() + 1, "Directs must have the same size to/from");
-
-        //We now walk through all the connections associated with the current direct specification, measure
-        //their delay and specify that value as an override in the delay model.
-        //
-        //Note that we need to check every connection in the direct to cover the case where the pins are not
-        //equivalent.
-        //
-        //However, if the from/to ports are equivalent we could end up sampling the same RR SOURCE/SINK
-        //paths multiple times (wasting CPU time) -- we avoid this by recording the sampled paths in
-        //sampled_rr_pairs and skipping them if they occur multiple times.
-        int missing_instances = 0;
-        int missing_paths = 0;
-        std::set<std::pair<RRNodeId, RRNodeId>> sampled_rr_pairs;
-        for (int iconn = 0; iconn < num_conns; ++iconn) {
-            //Find the associated pins
-            int from_pin = find_pin(from_type, from_port.port_name(), from_port.port_low_index() + iconn);
-            int to_pin = find_pin(to_type, to_port.port_name(), to_port.port_low_index() + iconn);
-
-            VTR_ASSERT(from_pin != OPEN);
-            VTR_ASSERT(to_pin != OPEN);
-
-            int from_pin_class = find_pin_class(from_type, from_port.port_name(), from_port.port_low_index() + iconn, DRIVER);
-            VTR_ASSERT(from_pin_class != OPEN);
-
-            int to_pin_class = find_pin_class(to_type, to_port.port_name(), to_port.port_low_index() + iconn, RECEIVER);
-            VTR_ASSERT(to_pin_class != OPEN);
-
-            bool found_sample_points;
-            RRNodeId src_rr, sink_rr;
-            found_sample_points = find_direct_connect_sample_locations(direct, from_type, from_pin, from_pin_class, to_type, to_pin, to_pin_class, src_rr, sink_rr);
-
-            if (!found_sample_points) {
-                ++missing_instances;
-                continue;
-            }
-
-            //If some of the source/sink ports are logically equivalent we may have already
-            //sampled the associated source/sink pair and don't need to do so again
-            if (sampled_rr_pairs.count({src_rr, sink_rr})) continue;
-
-            float direct_connect_delay = std::numeric_limits<float>::quiet_NaN();
-            bool found_routing_path = route_profiler.calculate_delay(src_rr, sink_rr, router_opts2, &direct_connect_delay);
-
-            if (found_routing_path) {
-                set_delay_override(from_type->index, from_pin_class, to_type->index, to_pin_class, direct->x_offset, direct->y_offset, direct_connect_delay);
-            } else {
-                ++missing_paths;
-            }
-
-            //Record that we've sampled this pair of source and sink nodes
-            sampled_rr_pairs.insert({src_rr, sink_rr});
-        }
-
-        VTR_LOGV_WARN(missing_instances > 0, "Found no delta delay for %d bits of inter-block direct connect '%s' (no instances of this direct found)\n", missing_instances, direct->name.c_str());
-        VTR_LOGV_WARN(missing_paths > 0, "Found no delta delay for %d bits of inter-block direct connect '%s' (no routing path found)\n", missing_paths, direct->name.c_str());
-    }
-}
-
 bool directconnect_exists(RRNodeId src_rr_node, RRNodeId sink_rr_node) {
     //Returns true if there is a directconnect between the two RR nodes
     //
@@ -1258,4 +236,4 @@ bool directconnect_exists(RRNodeId src_rr_node, RRNodeId sink_rr_node) {
         }
     }
     return false;
-}
+}
\ No newline at end of file

From cddb15210d4ba99f1a07a4ca8f83e779cd14976f Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Fri, 29 Nov 2024 12:49:58 -0500
Subject: [PATCH 05/39] add doxygen comments for delay_reduce,
 add_delay_to_matrix, and find_neighboring_average

---
 .../compute_delta_delays_utils.cpp            | 97 +++++++++++++------
 1 file changed, 68 insertions(+), 29 deletions(-)

diff --git a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
index 78855a251b6..ee7da1b2265 100644
--- a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
+++ b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
@@ -69,13 +69,56 @@ static float route_connection_delay(RouterDelayProfiler& route_profiler,
                                     const t_router_opts& router_opts,
                                     bool measure_directconnect);
 
-float delay_reduce(std::vector<float>& delays, e_reducer reducer);
+/**
+ * @brief Computes a reduced value from a vector of delay values using the specified reduction method.
+ *
+ * @param delays A reference to a vector of delay values. This vector may be modified
+ *               (e.g., sorted) depending on the reducer used.
+ * @param reducer The reduction method to be applied.
+ *
+ * @return The reduced delay value. If the input vector is empty, the function
+ *         returns `IMPOSSIBLE_DELTA`.
+ *
+ * @throws VPR_FATAL_ERROR if the reducer is unrecognized.
+ */
+static float delay_reduce(std::vector<float>& delays, e_reducer reducer);
 
-static void add_delay_to_matrix(vtr::Matrix<std::vector<float>>* matrix,
+/**
+ * @brief Adds a delay value to a 2D matrix of delay vectors.
+ *
+ * Updates the delay vector at position (`delta_x`, `delta_y`) in the matrix.
+ * If the element contains only `EMPTY_DELTA`, it is replaced with the new delay;
+ * otherwise, the delay is appended to the vector.
+ *
+ * @param matrix A 2D matrix of delay vectors.
+ * @param delta_x The x-index in the matrix.
+ * @param delta_y The y-index in the matrix.
+ * @param delay The delay value to add.
+ */
+static void add_delay_to_matrix(vtr::Matrix<std::vector<float>>& matrix,
                                 int delta_x,
                                 int delta_y,
                                 float delay);
 
+/**
+ * @brief Computes the average delay for a routing span.
+ *
+ * This function calculates the average placement delay for a routing span starting from a
+ * given layer and spanning a region defined by delta x and delta y. It iteratively searches
+ * for valid delay values within an expanding neighborhood (starting from a distance of 1)
+ * around the specified delta offsets and layer, until valid values are found or
+ * the maximum search distance (`max_distance`) is reached.
+ *
+ * @param matrix A 4D matrix of delay values indexed by `[from_layer][to_layer][delta_x][delta_y]`.
+ * @param from_layer The starting layer index of the routing span.
+ * @param to_tile_loc A structure holding the delta offsets (`x` and `y`) and the target layer index (`layer_num`).
+ * @param max_distance The maximum neighborhood distance to search for valid delay values.
+ *
+ * @return The average of valid delay values within the search range. If no valid delays
+ *         are found up to the maximum distance, the function returns `IMPOSSIBLE_DELTA`.
+ *
+ * @note The function performs a Manhattan-distance-based neighborhood search around the target location.
+ */
 static float find_neighboring_average(vtr::NdMatrix<float, 4>& matrix,
                                       int from_layer,
                                       t_physical_tile_loc to_tile_loc,
@@ -560,7 +603,7 @@ static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& /*rou
 #endif
                         found_matrix[delta_x][delta_y] = true;
 
-                        add_delay_to_matrix(&matrix, delta_x, delta_y, delays[sink_rr_node]);
+                        add_delay_to_matrix(matrix, delta_x, delta_y, delays[sink_rr_node]);
 
                         found_a_sink = true;
                         break;
@@ -583,7 +626,7 @@ static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& /*rou
             int delta_x = abs(sink_x - source_x);
             int delta_y = abs(sink_y - source_y);
             if (!found_matrix[delta_x][delta_y]) {
-                add_delay_to_matrix(&matrix, delta_x, delta_y, IMPOSSIBLE_DELTA);
+                add_delay_to_matrix(matrix, delta_x, delta_y, IMPOSSIBLE_DELTA);
                 VTR_LOG_WARN("Unable to route between blocks at (%d,%d,%d) and (%d,%d,%d) to characterize delay (setting to %g)\n",
                              source_x,
                              source_y,
@@ -656,10 +699,12 @@ static float route_connection_delay(RouterDelayProfiler& route_profiler,
     return net_delay_value;
 }
 
-float delay_reduce(std::vector<float>& delays, e_reducer reducer) {
+static float delay_reduce(std::vector<float>& delays, e_reducer reducer) {
     if (delays.empty()) {
         return IMPOSSIBLE_DELTA;
-    } else if (delays.size() == 1) {
+    }
+
+    if (delays.size() == 1) {
         return delays[0];
     }
 
@@ -687,39 +732,31 @@ float delay_reduce(std::vector<float>& delays, e_reducer reducer) {
     return delay;
 }
 
-static void add_delay_to_matrix(vtr::Matrix<std::vector<float>>* matrix,
+static void add_delay_to_matrix(vtr::Matrix<std::vector<float>>& matrix,
                                 int delta_x,
                                 int delta_y,
                                 float delay) {
-    if ((*matrix)[delta_x][delta_y].size() == 1 && (*matrix)[delta_x][delta_y][0] == EMPTY_DELTA) {
-        //Overwrite empty delta
-        (*matrix)[delta_x][delta_y][0] = delay;
+    if (matrix[delta_x][delta_y].size() == 1 && matrix[delta_x][delta_y][0] == EMPTY_DELTA) {
+        // Overwrite empty delta
+        matrix[delta_x][delta_y][0] = delay;
     } else {
         //Collect delta
-        (*matrix)[delta_x][delta_y].push_back(delay);
+        matrix[delta_x][delta_y].push_back(delay);
     }
 }
 
-/* We return the average placement estimated delay for a routing spanning (x,y).
- * We start with an averaging distance of 1 (i.e. from (x-1,y-1) to (x+1,y+1))
- * and look for legal delay values to average; if some are found we return the
- * average and if none are found we increase the distance to average over.
- *
- * If no legal values are found to average over with a range of max_distance,
- * we return IMPOSSIBLE_DELTA.
- */
 static float find_neighboring_average(vtr::NdMatrix<float, 4>& matrix,
                                       int from_layer,
                                       t_physical_tile_loc to_tile_loc,
                                       int max_distance) {
-    float sum = 0;
-    int counter = 0;
-    int endx = matrix.end_index(2);
-    int endy = matrix.end_index(3);
+    float sum = 0.f;
+    int num_samples = 0;
+    const int endx = matrix.end_index(2);
+    const int endy = matrix.end_index(3);
 
-    int x = to_tile_loc.x;
-    int y = to_tile_loc.y;
-    int to_layer = to_tile_loc.layer_num;
+    const int x = to_tile_loc.x;
+    const int y = to_tile_loc.y;
+    const int to_layer = to_tile_loc.layer_num;
 
     for (int distance = 1; distance <= max_distance; ++distance) {
         for (int delx = x - distance; delx <= x + distance; delx++) {
@@ -737,12 +774,14 @@ static float find_neighboring_average(vtr::NdMatrix<float, 4>& matrix,
                 if (matrix[from_layer][to_layer][delx][dely] == EMPTY_DELTA || matrix[from_layer][to_layer][delx][dely] == IMPOSSIBLE_DELTA) {
                     continue;
                 }
-                counter++;
+
                 sum += matrix[from_layer][to_layer][delx][dely];
+                num_samples++;
             }
         }
-        if (counter != 0) {
-            return sum / (float)counter;
+
+        if (num_samples != 0) {
+            return sum / (float)num_samples;
         }
     }
 

From 553ff537454aade8148de6ab62e04e4a285521ee Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Fri, 29 Nov 2024 15:50:53 -0500
Subject: [PATCH 06/39] move lines that don't depend on loop vars to outside
 the loop

---
 .../compute_delta_delays_utils.cpp            | 151 ++++++++----------
 1 file changed, 69 insertions(+), 82 deletions(-)

diff --git a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
index ee7da1b2265..42142d428cc 100644
--- a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
+++ b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
@@ -132,9 +132,7 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
                                                     bool measure_directconnect,
                                                     size_t longest_length,
                                                     bool is_flat) {
-    /* To avoid edge effects we place the source at least 'longest_length' away
-     * from the device edge and route from there for all possible delta values < dimension
-     */
+
 
     const auto& device_ctx = g_vpr_ctx.device();
     const auto& grid = device_ctx.grid;
@@ -143,69 +141,64 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
     const size_t device_width = grid.width();
     const size_t device_height = grid.height();
 
+    /* To avoid edge effects we place the source at least 'longest_length' away
+     * from the device edge and route from there for all possible delta values < dimension
+     */
+
+    //   +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+    //   +                 |                       |               +
+    //   +        A        |           B           |       C       +
+    //   +                 |                       |               +
+    //   +-----------------\-----------------------.---------------+
+    //   +                 |                       |               +
+    //   +                 |                       |               +
+    //   +                 |                       |               +
+    //   +                 |                       |               +
+    //   +        D        |           E           |       F       +
+    //   +                 |                       |               +
+    //   +                 |                       |               +
+    //   +                 |                       |               +
+    //   +                 |                       |               +
+    //   +-----------------*-----------------------/---------------+
+    //   +                 |                       |               +
+    //   +        G        |           H           |       I       +
+    //   +                 |                       |               +
+    //   +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+    //
+    //   * = (low_x, low_y)
+    //   . = (high_x, high_y)
+    //   / = (high_x, low_y)
+    //   \ = (low_x, high_y)
+    //   + = device edge
+    const size_t mid_x = vtr::nint(device_width / 2);
+    const size_t mid_y = vtr::nint(device_height / 2);
+    const size_t low_x = std::min(longest_length, mid_x);
+    const size_t low_y = std::min(longest_length, mid_y);
+    const size_t high_x = (longest_length <= device_width)  ? std::max(device_width - longest_length, mid_x) : mid_x;
+    const size_t high_y = (longest_length <= device_height) ? std::max(device_height - longest_length, mid_y) : mid_y;
+
     vtr::NdMatrix<float, 4> delta_delays({num_layers, num_layers, device_width, device_height});
 
+    std::set<std::string> allowed_types;
+    if (!placer_opts.allowed_tiles_for_delay_model.empty()) {
+        auto allowed_types_vector = vtr::split(placer_opts.allowed_tiles_for_delay_model, ",");
+        allowed_types = std::set(allowed_types_vector.begin(), allowed_types_vector.end());
+    }
+
     for (int from_layer_num = 0; from_layer_num < (int)num_layers; from_layer_num++) {
         for (int to_layer_num = 0; to_layer_num < (int)num_layers; to_layer_num++) {
             vtr::NdMatrix<std::vector<float>, 2> sampled_delta_delays({device_width, device_height});
 
-            size_t mid_x = vtr::nint(device_width / 2);
-            size_t mid_y = vtr::nint(device_height / 2);
-
-            size_t low_x = std::min(longest_length, mid_x);
-            size_t low_y = std::min(longest_length, mid_y);
-            size_t high_x = mid_x;
-            size_t high_y = mid_y;
-            if (longest_length <= device_width) {
-                high_x = std::max(device_width - longest_length, mid_x);
-            }
-            if (longest_length <= device_height) {
-                high_y = std::max(device_width - longest_length, mid_y);
-            }
-
-            std::set<std::string> allowed_types;
-            if (!placer_opts.allowed_tiles_for_delay_model.empty()) {
-                auto allowed_types_vector = vtr::split(placer_opts.allowed_tiles_for_delay_model, ",");
-                for (const auto& type : allowed_types_vector) {
-                    allowed_types.insert(type);
-                }
-            }
-
-            //   +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-            //   +                 |                       |               +
-            //   +        A        |           B           |       C       +
-            //   +                 |                       |               +
-            //   +-----------------\-----------------------.---------------+
-            //   +                 |                       |               +
-            //   +                 |                       |               +
-            //   +                 |                       |               +
-            //   +                 |                       |               +
-            //   +        D        |           E           |       F       +
-            //   +                 |                       |               +
-            //   +                 |                       |               +
-            //   +                 |                       |               +
-            //   +                 |                       |               +
-            //   +-----------------*-----------------------/---------------+
-            //   +                 |                       |               +
-            //   +        G        |           H           |       I       +
-            //   +                 |                       |               +
-            //   +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-            //
-            //   * = (low_x, low_y)
-            //   . = (high_x, high_y)
-            //   / = (high_x, low_y)
-            //   \ = (low_x, high_y)
-            //   + = device edge
-
             //Find the lowest y location on the left edge with a non-empty block
             int y = 0;
             int x = 0;
             t_physical_tile_type_ptr src_type = nullptr;
-            for (x = 0; x < (int)grid.width(); ++x) {
-                for (y = 0; y < (int)grid.height(); ++y) {
-                    auto type = grid.get_physical_type({x, y, from_layer_num});
+            for (x = 0; x < (int)device_width; ++x) {
+                for (y = 0; y < (int)device_height; ++y) {
+                    t_physical_tile_type_ptr type = grid.get_physical_type({x, y, from_layer_num});
 
                     if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
+                        // check if the tile type is among the allowed types
                         if (!allowed_types.empty() && allowed_types.find(type->name) == allowed_types.end()) {
                             continue;
                         }
@@ -228,7 +221,7 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
                                    from_layer_num, to_layer_num,
                                    x, y,
                                    x, y,
-                                   grid.width() - 1, grid.height() - 1,
+                                   device_width - 1, device_height - 1,
                                    router_opts,
                                    measure_directconnect, allowed_types,
                                    is_flat);
@@ -237,9 +230,10 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
             src_type = nullptr;
             for (y = 0; y < (int)grid.height(); ++y) {
                 for (x = 0; x < (int)grid.width(); ++x) {
-                    auto type = grid.get_physical_type({x, y, from_layer_num});
+                    t_physical_tile_type_ptr type = grid.get_physical_type({x, y, from_layer_num});
 
                     if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
+                        // check if the tile type is among the allowed types
                         if (!allowed_types.empty() && allowed_types.find(type->name) == allowed_types.end()) {
                             continue;
                         }
@@ -259,7 +253,7 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
                                    from_layer_num, to_layer_num,
                                    x, y,
                                    x, y,
-                                   grid.width() - 1, grid.height() - 1,
+                                   device_width - 1, device_height - 1,
                                    router_opts,
                                    measure_directconnect, allowed_types,
                                    is_flat);
@@ -273,7 +267,7 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
                                    from_layer_num, to_layer_num,
                                    low_x, low_y,
                                    low_x, low_y,
-                                   grid.width() - 1, grid.height() - 1,
+                                   device_width - 1, device_height - 1,
                                    router_opts,
                                    measure_directconnect, allowed_types,
                                    is_flat);
@@ -301,7 +295,7 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
                                    from_layer_num, to_layer_num,
                                    high_x, low_y,
                                    0, low_y,
-                                   high_x, grid.height() - 1,
+                                   high_x, device_height - 1,
                                    router_opts,
                                    measure_directconnect, allowed_types,
                                    is_flat);
@@ -315,7 +309,7 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
                                    from_layer_num, to_layer_num,
                                    low_x, high_y,
                                    low_x, 0,
-                                   grid.width() - 1, high_y,
+                                   device_width - 1, high_y,
                                    router_opts,
                                    measure_directconnect, allowed_types,
                                    is_flat);
@@ -331,8 +325,6 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
 }
 
 static void fix_uninitialized_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
-    // Set any empty delta's to the average of it's neighbours
-
     for (size_t from_layer_num = 0; from_layer_num < delta_delays.dim_size(0); ++from_layer_num) {
         for (size_t to_layer_num = 0; to_layer_num < delta_delays.dim_size(1); ++to_layer_num) {
             for (size_t delta_x = 0; delta_x < delta_delays.dim_size(2); ++delta_x) {
@@ -437,15 +429,12 @@ static void generic_compute_matrix_iterative_astar(RouterDelayProfiler& route_pr
                                                    bool /*is_flat*/) {
     //vtr::ScopedStartFinishTimer t(vtr::string_fmt("Profiling from (%d,%d)", source_x, source_y));
 
-    int delta_x, delta_y;
-    int sink_x, sink_y;
-
-    auto& device_ctx = g_vpr_ctx.device();
+    const auto& device_ctx = g_vpr_ctx.device();
 
-    for (sink_x = start_x; sink_x <= end_x; sink_x++) {
-        for (sink_y = start_y; sink_y <= end_y; sink_y++) {
-            delta_x = abs(sink_x - source_x);
-            delta_y = abs(sink_y - source_y);
+    for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
+        for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
+            const int delta_x = abs(sink_x - source_x);
+            const int delta_y = abs(sink_y - source_y);
 
             t_physical_tile_type_ptr src_type = device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num});
             t_physical_tile_type_ptr sink_type = device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num});
@@ -457,7 +446,7 @@ static void generic_compute_matrix_iterative_astar(RouterDelayProfiler& route_pr
 
             if (src_or_target_empty || !is_allowed_type) {
                 if (matrix[delta_x][delta_y].empty()) {
-                    //Only set empty target if we don't already have a valid delta delay
+                    // Only set empty target if we don't already have a valid delta delay
                     matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
 #ifdef VERBOSE
                     VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
@@ -488,10 +477,10 @@ static void generic_compute_matrix_iterative_astar(RouterDelayProfiler& route_pr
                         sink_x, sink_y);
 #endif
                 if (matrix[delta_x][delta_y].size() == 1 && matrix[delta_x][delta_y][0] == EMPTY_DELTA) {
-                    //Overwrite empty delta
+                    // Overwrite empty delta
                     matrix[delta_x][delta_y][0] = delay;
                 } else {
-                    //Collect delta
+                    // Collect delta
                     matrix[delta_x][delta_y].push_back(delay);
                 }
             }
@@ -653,7 +642,7 @@ static float route_connection_delay(RouterDelayProfiler& route_profiler,
 
     float net_delay_value = IMPOSSIBLE_DELTA; /*set to known value for debug purposes */
 
-    auto& device_ctx = g_vpr_ctx.device();
+    const auto& device_ctx = g_vpr_ctx.device();
 
     bool successfully_routed = false;
 
@@ -675,16 +664,14 @@ static float route_connection_delay(RouterDelayProfiler& route_profiler,
                 continue;
 
             if (!measure_directconnect && directconnect_exists(source_rr_node, sink_rr_node)) {
-                //Skip if we shouldn't measure direct connects and a direct connect exists
+                // Skip if we shouldn't measure direct connects and a direct connect exists
                 continue;
             }
 
-            {
-                successfully_routed = route_profiler.calculate_delay(
-                    source_rr_node, sink_rr_node,
-                    router_opts,
-                    &net_delay_value);
-            }
+            successfully_routed = route_profiler.calculate_delay(source_rr_node,
+                                                                 sink_rr_node,
+                                                                 router_opts,
+                                                                 &net_delay_value);
 
             if (successfully_routed) break;
         }
@@ -740,7 +727,7 @@ static void add_delay_to_matrix(vtr::Matrix<std::vector<float>>& matrix,
         // Overwrite empty delta
         matrix[delta_x][delta_y][0] = delay;
     } else {
-        //Collect delta
+        // Collect delta
         matrix[delta_x][delta_y].push_back(delay);
     }
 }

From 7d4fd0100892dc077745c5ed58b30391acc5e541 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Fri, 29 Nov 2024 15:56:17 -0500
Subject: [PATCH 07/39] remove fix_uninitialized_coordinates

---
 .../compute_delta_delays_utils.cpp            | 31 +++++++------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
index 42142d428cc..d50bfcf1991 100644
--- a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
+++ b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
@@ -21,8 +21,6 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
                                                     size_t longest_length,
                                                     bool is_flat);
 
-static void fix_uninitialized_coordinates(vtr::NdMatrix<float, 4>& delta_delays);
-
 static void fix_empty_coordinates(vtr::NdMatrix<float, 4>& delta_delays);
 
 static void fill_impossible_coordinates(vtr::NdMatrix<float, 4>& delta_delays);
@@ -324,20 +322,6 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
     return delta_delays;
 }
 
-static void fix_uninitialized_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
-    for (size_t from_layer_num = 0; from_layer_num < delta_delays.dim_size(0); ++from_layer_num) {
-        for (size_t to_layer_num = 0; to_layer_num < delta_delays.dim_size(1); ++to_layer_num) {
-            for (size_t delta_x = 0; delta_x < delta_delays.dim_size(2); ++delta_x) {
-                for (size_t delta_y = 0; delta_y < delta_delays.dim_size(3); ++delta_y) {
-                    if (delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] == UNINITIALIZED_DELTA) {
-                        delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] = IMPOSSIBLE_DELTA;
-                    }
-                }
-            }
-        }
-    }
-}
-
 static void fix_empty_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
     // Set any empty delta's to the average of it's neighbours
     //
@@ -391,8 +375,8 @@ static void fill_impossible_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
 }
 
 static bool verify_delta_delays(const vtr::NdMatrix<float, 4>& delta_delays) {
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& grid = device_ctx.grid;
+    const auto& device_ctx = g_vpr_ctx.device();
+    const auto& grid = device_ctx.grid;
 
     for (int from_layer_num = 0; from_layer_num < grid.get_num_layers(); ++from_layer_num) {
         for (int to_layer_num = 0; to_layer_num < grid.get_num_layers(); ++to_layer_num) {
@@ -402,7 +386,7 @@ static bool verify_delta_delays(const vtr::NdMatrix<float, 4>& delta_delays) {
 
                     if (delta_delay < 0.) {
                         VPR_ERROR(VPR_ERROR_PLACE,
-                                  "Found invaild negative delay %g for delta [%d,%d,%d,%d]",
+                                  "Found invalid negative delay %g for delta [%d,%d,%d,%d]",
                                   delta_delay, from_layer_num, to_layer_num, x, y);
                     }
                 }
@@ -791,7 +775,14 @@ vtr::NdMatrix<float, 4> compute_delta_delay_model(RouterDelayProfiler& route_pro
                                                                 longest_length,
                                                                 is_flat);
 
-    fix_uninitialized_coordinates(delta_delays);
+    const size_t num_elements = delta_delays.size();
+
+    // set uninitialized elements to infinity
+    for (size_t i = 0; i < num_elements; i++) {
+        if (delta_delays.get(i) == UNINITIALIZED_DELTA) {
+            delta_delays.get(i) = IMPOSSIBLE_DELTA;
+        }
+    }
 
     fix_empty_coordinates(delta_delays);
 

From 9ce28bfd56c9246e7494c09be23e466add56c1f2 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Fri, 29 Nov 2024 16:17:58 -0500
Subject: [PATCH 08/39] doxygen comments for get_best_classes and
 route_connection_delay

---
 .../compute_delta_delays_utils.cpp            | 60 ++++++++++++-------
 vpr/src/place/timing_place_lookup.cpp         | 18 ++----
 vpr/src/place/timing_place_lookup.h           | 27 +++++++++
 3 files changed, 70 insertions(+), 35 deletions(-)

diff --git a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
index d50bfcf1991..0feaf0cc702 100644
--- a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
+++ b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
@@ -57,13 +57,32 @@ static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& route
                                                       const std::set<std::string>& allowed_types,
                                                       bool is_flat);
 
+/**
+ * @brief Routes between a source and sink location to calculate the delay.
+ *
+ * This function computes the delay of a routed connection between a source and sink node
+ * specified by their coordinates and layers. It iterates over the best driver and sink pin
+ * classes to find a valid routing path and calculates the delay if a path exists.
+ *
+ * @param route_profiler Reference to the `RouterDelayProfiler` responsible for calculating routing delays.
+ * @param source_x The x-coordinate of the source location.
+ * @param source_y The y-coordinate of the source location.
+ * @param source_layer The layer index of the source node.
+ * @param sink_x The x-coordinate of the sink location.
+ * @param sink_y The y-coordinate of the sink location.
+ * @param sink_layer The layer index of the sink node.
+ * @param router_opts Routing options used for delay calculation.
+ * @param measure_directconnect If `true`, includes direct connect delays; otherwise, skips direct connections.
+ *
+ * @return The calculated routing delay. If routing fails, it returns `IMPOSSIBLE_DELTA`.
+ */
 static float route_connection_delay(RouterDelayProfiler& route_profiler,
-                                    int from_layer_num,
-                                    int to_layer_num,
-                                    int source_x_loc,
-                                    int source_y_loc,
-                                    int sink_x_loc,
-                                    int sink_y_loc,
+                                    int source_x,
+                                    int source_y,
+                                    int source_layer,
+                                    int sink_x,
+                                    int sink_y,
+                                    int sink_layer,
                                     const t_router_opts& router_opts,
                                     bool measure_directconnect);
 
@@ -323,10 +342,10 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
 }
 
 static void fix_empty_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
-    // Set any empty delta's to the average of it's neighbours
+    // Set any empty delta's to the average of its neighbours
     //
     // Empty coordinates may occur if the sampling location happens to not have
-    // a connection at that location.  However a more through sampling likely
+    // a connection at that location. However, a more thorough sampling likely
     // would return a result, so we fill in the empty holes with a small
     // neighbour average.
     constexpr int kMaxAverageDistance = 2;
@@ -411,8 +430,6 @@ static void generic_compute_matrix_iterative_astar(RouterDelayProfiler& route_pr
                                                    bool measure_directconnect,
                                                    const std::set<std::string>& allowed_types,
                                                    bool /*is_flat*/) {
-    //vtr::ScopedStartFinishTimer t(vtr::string_fmt("Profiling from (%d,%d)", source_x, source_y));
-
     const auto& device_ctx = g_vpr_ctx.device();
 
     for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
@@ -444,12 +461,12 @@ static void generic_compute_matrix_iterative_astar(RouterDelayProfiler& route_pr
                 //Valid start/end
 
                 float delay = route_connection_delay(route_profiler,
-                                                     from_layer_num,
-                                                     to_layer_num,
                                                      source_x,
                                                      source_y,
+                                                     from_layer_num,
                                                      sink_x,
                                                      sink_y,
+                                                     to_layer_num,
                                                      router_opts,
                                                      measure_directconnect);
 
@@ -614,35 +631,36 @@ static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& /*rou
 }
 
 static float route_connection_delay(RouterDelayProfiler& route_profiler,
-                                    int from_layer_num,
-                                    int to_layer_num,
                                     int source_x,
                                     int source_y,
+                                    int source_layer,
                                     int sink_x,
                                     int sink_y,
+                                    int sink_layer,
                                     const t_router_opts& router_opts,
                                     bool measure_directconnect) {
     //Routes between the source and sink locations and calculates the delay
 
-    float net_delay_value = IMPOSSIBLE_DELTA; /*set to known value for debug purposes */
+    // set to known value for debug purposes
+    float net_delay_value = IMPOSSIBLE_DELTA;
 
     const auto& device_ctx = g_vpr_ctx.device();
 
     bool successfully_routed = false;
 
-    //Get the rr nodes to route between
-    auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num}));
-    auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num}));
+    // Get the rr nodes to route between
+    auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid.get_physical_type({source_x, source_y, source_layer}));
+    auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid.get_physical_type({sink_x, sink_y, sink_layer}));
 
     for (int driver_ptc : best_driver_ptcs) {
         VTR_ASSERT(driver_ptc != OPEN);
-        RRNodeId source_rr_node = device_ctx.rr_graph.node_lookup().find_node(from_layer_num, source_x, source_y, SOURCE, driver_ptc);
+        RRNodeId source_rr_node = device_ctx.rr_graph.node_lookup().find_node(source_layer, source_x, source_y, SOURCE, driver_ptc);
 
         VTR_ASSERT(source_rr_node != RRNodeId::INVALID());
 
         for (int sink_ptc : best_sink_ptcs) {
             VTR_ASSERT(sink_ptc != OPEN);
-            RRNodeId sink_rr_node = device_ctx.rr_graph.node_lookup().find_node(to_layer_num, sink_x, sink_y, SINK, sink_ptc);
+            RRNodeId sink_rr_node = device_ctx.rr_graph.node_lookup().find_node(sink_layer, sink_x, sink_y, SINK, sink_ptc);
 
             if (sink_rr_node == RRNodeId::INVALID())
                 continue;
@@ -664,7 +682,7 @@ static float route_connection_delay(RouterDelayProfiler& route_profiler,
 
     if (!successfully_routed) {
         VTR_LOG_WARN("Unable to route between blocks at (%d,%d,%d) and (%d,%d,%d) to characterize delay (setting to %g)\n",
-                     source_x, source_y, from_layer_num, sink_x, sink_y, to_layer_num, net_delay_value);
+                     source_x, source_y, source_layer, sink_x, sink_y, sink_layer, net_delay_value);
     }
 
     return net_delay_value;
diff --git a/vpr/src/place/timing_place_lookup.cpp b/vpr/src/place/timing_place_lookup.cpp
index 21ff6d69cc6..f2a0a60edb9 100644
--- a/vpr/src/place/timing_place_lookup.cpp
+++ b/vpr/src/place/timing_place_lookup.cpp
@@ -109,22 +109,11 @@ std::unique_ptr<PlaceDelayModel> compute_place_delay_model(const t_placer_opts&
 /******* File Accessible Functions **********/
 
 std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type) {
-    /*
-     * This function tries to identify the best pin classes to hook up
-     * for delay calculation.  The assumption is that we should pick
-     * the pin class with the largest number of pins. This makes
-     * sense, since it ensures we pick commonly used pins, and
-     * removes order dependence on how the pins are specified
-     * in the architecture (except in the case were the two largest pin classes
-     * of a particular pintype have the same number of pins, in which case the
-     * first pin class is used).
-     */
-
     std::vector<int> best_classes;
 
     //Record any non-zero Fc pins
     //
-    //Note that we track non-zero Fc pins, since certain Fc overides
+    //Note that we track non-zero Fc pins, since certain Fc overrides
     //may apply to only a subset of wire types. This ensures we record
     //which pins can potentially connect to global routing.
     std::unordered_set<int> non_zero_fc_pins;
@@ -149,14 +138,15 @@ std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_
                 }
             }
 
-            if (!any_pins_connect_to_general_routing) continue; //Skip if doesn't connect to general routing
+            //Skip if the pin class doesn't connect to general routing
+            if (!any_pins_connect_to_general_routing) continue;
 
             //Record candidate class
             best_classes.push_back(i);
         }
     }
 
-    //Sort classe so largest pin class is first
+    //Sort classes so the largest pin class is first
     auto cmp_class = [&](int lhs, int rhs) {
         return type->class_inf[lhs].num_pins > type->class_inf[rhs].num_pins;
     };
diff --git a/vpr/src/place/timing_place_lookup.h b/vpr/src/place/timing_place_lookup.h
index fba3f470483..24cfc301ce6 100644
--- a/vpr/src/place/timing_place_lookup.h
+++ b/vpr/src/place/timing_place_lookup.h
@@ -11,6 +11,33 @@ std::unique_ptr<PlaceDelayModel> compute_place_delay_model(const t_placer_opts&
                                                            const std::vector<t_direct_inf>& directs,
                                                            bool is_flat);
 
+/**
+ * @brief Identifies the best pin classes for delay calculation based on pin count and connectivity.
+ *
+ * This function selects pin classes of a specified type (`pintype`) from a physical tile type (`type`)
+ * that are suitable for delay calculations. It prioritizes pin classes with the largest number of pins
+ * that connect to general routing, ensuring commonly used pins are chosen for delay profiling.
+ *
+ * @param pintype The type of pins to filter.
+ * @param type Pointer to the physical tile type containing pin and class information.
+ *
+ * @return A vector of indices representing the selected pin classes. The classes are sorted
+ *         in descending order based on the number of pins they contain.
+ *
+ * @details
+ * - A pin class is eligible if its type matches `pintype` and it contains at least one pin
+ *   that connects to general routing (non-zero Fc).
+ * - Non-zero Fc pins are determined by inspecting the tile's `fc_specs`.
+ * - Classes are sorted so that the class with the largest number of pins appears first.
+ *   If multiple classes have the same pin count, their order depends on their initial appearance
+ *   in the architecture file.
+ *
+ * @note
+ * - Pins explicitly marked as ignored in `type->is_ignored_pin` are excluded.
+ * - The function ensures stability in sorting, preserving the input order for classes
+ *   with the same number of pins.
+ */
+
 std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type);
 
 bool directconnect_exists(RRNodeId src_rr_node, RRNodeId sink_rr_node);

From f188b792e4e7ccf3f7c221ea7f6894876be4abb2 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Fri, 29 Nov 2024 16:54:28 -0500
Subject: [PATCH 09/39] remove unused includes and constants from
 timing_place_lookup.cpp

---
 utils/route_diag/src/main.cpp                 |  2 +-
 .../compute_delta_delays_utils.cpp            |  2 +-
 vpr/src/place/timing_place_lookup.cpp         | 24 ++-----------------
 3 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/utils/route_diag/src/main.cpp b/utils/route_diag/src/main.cpp
index debd89c8bd6..8b485916532 100644
--- a/utils/route_diag/src/main.cpp
+++ b/utils/route_diag/src/main.cpp
@@ -238,7 +238,7 @@ static void profile_source(const Netlist<>& net_list,
     VTR_LOG("\n");
 }
 
-static t_chan_width setup_chan_width(t_router_opts router_opts,
+static t_chan_width setup_chan_width(const t_router_opts& router_opts,
         t_chan_width_dist chan_width_dist) {
     /*we give plenty of tracks, this increases routability for the */
     /*lookup table generation */
diff --git a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
index 0feaf0cc702..4630ddfcfb4 100644
--- a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
+++ b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
@@ -503,7 +503,7 @@ static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& /*rou
                                                       bool measure_directconnect,
                                                       const std::set<std::string>& allowed_types,
                                                       bool is_flat) {
-    auto& device_ctx = g_vpr_ctx.device();
+    const auto& device_ctx = g_vpr_ctx.device();
 
     t_physical_tile_type_ptr src_type = device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num});
     bool is_allowed_type = allowed_types.empty() || allowed_types.find(src_type->name) != allowed_types.end();
diff --git a/vpr/src/place/timing_place_lookup.cpp b/vpr/src/place/timing_place_lookup.cpp
index f2a0a60edb9..f086283a3e7 100644
--- a/vpr/src/place/timing_place_lookup.cpp
+++ b/vpr/src/place/timing_place_lookup.cpp
@@ -1,47 +1,27 @@
 
-#include <cmath>
-#include <limits>
+#include "timing_place_lookup.h"
 
 #include "rr_graph_fwd.h"
 #include "vtr_assert.h"
 #include "vtr_ndmatrix.h"
 #include "vtr_log.h"
 #include "vtr_util.h"
-#include "vtr_math.h"
-#include "vtr_memory.h"
+
 #include "vtr_time.h"
-#include "vtr_geometry.h"
 
-#include "arch_util.h"
 #include "vpr_types.h"
 #include "globals.h"
 #include "place_and_route.h"
 #include "route_net.h"
-#include "timing_place_lookup.h"
 #include "read_xml_arch_file.h"
 #include "atom_netlist.h"
 
-// all functions in profiling:: namespace, which are only activated if PROFILE is defined
-#include "route_profiling.h"
 #include "router_delay_profiling.h"
 #include "place_delay_model.h"
 #include "simple_delay_model.h"
 #include "delta_delay_model.h"
 #include "override_delay_model.h"
 
-/*To compute delay between blocks we calculate the delay between */
-/*different nodes in the FPGA.  From this procedure we generate
- * a lookup table which tells us the delay between different locations in*/
-/*the FPGA */
-
-/*the delta arrays are used to contain the best case routing delay */
-/*between different locations on the FPGA. */
-
-//#define VERBOSE
-
-constexpr float UNINITIALIZED_DELTA = -1;                                  //Indicates the delta delay value has not been calculated
-constexpr float EMPTY_DELTA = -2;                                          //Indicates delta delay from/to an EMPTY block
-constexpr float IMPOSSIBLE_DELTA = std::numeric_limits<float>::infinity(); //Indicates there is no valid delta delay
 
 /*** Function Prototypes *****/
 static t_chan_width setup_chan_width(const t_router_opts& router_opts,

From 107738c78adc5c91ce1952d1bf45de133443bd1b Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Sat, 30 Nov 2024 17:20:57 -0500
Subject: [PATCH 10/39] total_num_internal_pins member function for t_sub_tile

---
 libs/libarchfpga/src/arch_util.h             |  6 +++---
 libs/libarchfpga/src/physical_types.cpp      | 18 ++++++++++++++++-
 libs/libarchfpga/src/physical_types.h        |  3 +++
 libs/libarchfpga/src/physical_types_util.cpp | 21 ++++++--------------
 libs/libarchfpga/src/physical_types_util.h   |  6 ++----
 5 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/libs/libarchfpga/src/arch_util.h b/libs/libarchfpga/src/arch_util.h
index c39cf77b94f..fb251bffe10 100644
--- a/libs/libarchfpga/src/arch_util.h
+++ b/libs/libarchfpga/src/arch_util.h
@@ -23,8 +23,8 @@ class InstPort {
 
     InstPort() = default;
     InstPort(const std::string& str);
-    std::string instance_name() const { return instance_.name; }
-    std::string port_name() const { return port_.name; }
+    const std::string& instance_name() const { return instance_.name; }
+    const std::string& port_name() const { return port_.name; }
 
     int instance_low_index() const { return instance_.low_idx; }
     int instance_high_index() const { return instance_.high_idx; }
@@ -40,7 +40,7 @@ class InstPort {
 
   private:
     struct name_index {
-        std::string name = "";
+        std::string name;
         int low_idx = UNSPECIFIED;
         int high_idx = UNSPECIFIED;
     };
diff --git a/libs/libarchfpga/src/physical_types.cpp b/libs/libarchfpga/src/physical_types.cpp
index 3bdabaee2a7..ff9baf87b3f 100644
--- a/libs/libarchfpga/src/physical_types.cpp
+++ b/libs/libarchfpga/src/physical_types.cpp
@@ -220,7 +220,7 @@ std::string t_pb_graph_pin::to_string(const bool full_description) const {
     return pin_string;
 }
 
-/**
+/*
  * t_pb_graph_edge
  */
 
@@ -253,3 +253,19 @@ bool t_pb_graph_edge::belongs_to_pattern(int pattern_index) const {
     // return false otherwise
     return false;
 }
+
+/*
+ * t_sub_tile
+ */
+
+int t_sub_tile::total_num_internal_pins() const {
+    int num_pins = 0;
+
+    for (t_logical_block_type_ptr eq_site : equivalent_sites) {
+        num_pins += (int)eq_site->pin_logical_num_to_pb_pin_mapping.size();
+    }
+
+    num_pins *= capacity.total();
+
+    return num_pins;
+}
diff --git a/libs/libarchfpga/src/physical_types.h b/libs/libarchfpga/src/physical_types.h
index 4d415697554..bf306021d45 100644
--- a/libs/libarchfpga/src/physical_types.h
+++ b/libs/libarchfpga/src/physical_types.h
@@ -796,6 +796,9 @@ struct t_sub_tile {
     int num_phy_pins = 0;
 
     int index = -1;
+
+  public:
+    int total_num_internal_pins() const;
 };
 
 /** A logical pin defines the pin index of a logical block type (i.e. a top level PB type)
diff --git a/libs/libarchfpga/src/physical_types_util.cpp b/libs/libarchfpga/src/physical_types_util.cpp
index 2256f81d66c..f23b2add270 100644
--- a/libs/libarchfpga/src/physical_types_util.cpp
+++ b/libs/libarchfpga/src/physical_types_util.cpp
@@ -154,7 +154,7 @@ static std::tuple<int, int, int, int, int> get_pin_index_for_inst(t_physical_til
         pin_inst_num = (pin_physical_num - pin_offset) % pins_per_inst;
     } else {
         int pin_offset = get_sub_tile_inst_physical_pin_num_offset(type, sub_tile, sub_tile_cap);
-        int pins_per_inst = get_total_num_sub_tile_internal_pins(sub_tile) / sub_tile->capacity.total();
+        int pins_per_inst = sub_tile->total_num_internal_pins() / sub_tile->capacity.total();
         pin_inst_num = (pin_physical_num - pin_offset) % pins_per_inst;
     }
 
@@ -225,7 +225,7 @@ static int get_sub_tile_physical_pin_num_offset(t_physical_tile_type_ptr physica
         if (&tmp_sub_tile == curr_sub_tile)
             break;
         else
-            offset += get_total_num_sub_tile_internal_pins(&tmp_sub_tile);
+            offset += tmp_sub_tile.total_num_internal_pins();
     }
 
     return offset;
@@ -235,7 +235,7 @@ static int get_sub_tile_inst_physical_pin_num_offset(t_physical_tile_type_ptr ph
                                                      const t_sub_tile* curr_sub_tile,
                                                      const int curr_relative_cap) {
     int offset = get_sub_tile_physical_pin_num_offset(physical_tile, curr_sub_tile);
-    int sub_tile_inst_num_pins = get_total_num_sub_tile_internal_pins(curr_sub_tile) / curr_sub_tile->capacity.total();
+    int sub_tile_inst_num_pins = curr_sub_tile->total_num_internal_pins() / curr_sub_tile->capacity.total();
 
     offset += (curr_relative_cap * sub_tile_inst_num_pins);
 
@@ -564,7 +564,7 @@ int get_max_num_pins(t_logical_block_type_ptr logical_block) {
 }
 
 //Returns the pin class associated with the specified pin_index_in_port within the port port_name on type
-int find_pin_class(t_physical_tile_type_ptr type, std::string port_name, int pin_index_in_port, e_pin_type pin_type) {
+int find_pin_class(t_physical_tile_type_ptr type, const std::string& port_name, int pin_index_in_port, e_pin_type pin_type) {
     int iclass = OPEN;
 
     int ipin = find_pin(type, port_name, pin_index_in_port);
@@ -579,7 +579,7 @@ int find_pin_class(t_physical_tile_type_ptr type, std::string port_name, int pin
     return iclass;
 }
 
-int find_pin(t_physical_tile_type_ptr type, std::string port_name, int pin_index_in_port) {
+int find_pin(t_physical_tile_type_ptr type, const std::string& port_name, int pin_index_in_port) {
     int ipin = OPEN;
     int port_base_ipin = 0;
     int num_pins = OPEN;
@@ -1009,7 +1009,7 @@ std::tuple<const t_sub_tile*, int> get_sub_tile_from_pin_physical_num(t_physical
     int pin_offset = total_pin_counts;
 
     for (auto& sub_tile : physical_tile->sub_tiles) {
-        int sub_tile_num_pins = pin_on_tile ? sub_tile.num_phy_pins : get_total_num_sub_tile_internal_pins(&sub_tile);
+        int sub_tile_num_pins = pin_on_tile ? sub_tile.num_phy_pins : sub_tile.total_num_internal_pins();
         total_pin_counts += sub_tile_num_pins;
 
         if (physical_num < total_pin_counts) {
@@ -1347,15 +1347,6 @@ const t_pb_graph_node* get_pb_graph_node_from_pin_physical_num(t_physical_tile_t
     return pb_graph_pin->parent_node;
 }
 
-int get_total_num_sub_tile_internal_pins(const t_sub_tile* sub_tile) {
-    int num_pins = 0;
-    for (auto eq_site : sub_tile->equivalent_sites) {
-        num_pins += (int)eq_site->pin_logical_num_to_pb_pin_mapping.size();
-    }
-    num_pins *= sub_tile->capacity.total();
-    return num_pins;
-}
-
 int get_tile_pin_max_ptc(t_physical_tile_type_ptr tile, bool is_flat) {
     if (is_flat) {
         return tile->num_pins + (int)tile->pin_num_to_pb_pin.size();
diff --git a/libs/libarchfpga/src/physical_types_util.h b/libs/libarchfpga/src/physical_types_util.h
index aa7b2617834..2a7ba563339 100644
--- a/libs/libarchfpga/src/physical_types_util.h
+++ b/libs/libarchfpga/src/physical_types_util.h
@@ -173,10 +173,10 @@ std::vector<std::string> block_type_class_index_to_pin_names(t_physical_tile_typ
 ///@brief Returns the physical tile type matching a given physical tile type name, or nullptr (if not found)
 t_physical_tile_type_ptr find_tile_type_by_name(const std::string& name, const std::vector<t_physical_tile_type>& types);
 
-int find_pin_class(t_physical_tile_type_ptr type, std::string port_name, int pin_index_in_port, e_pin_type pin_type);
+int find_pin_class(t_physical_tile_type_ptr type, const std::string& port_name, int pin_index_in_port, e_pin_type pin_type);
 
 ///@brief Returns the relative pin index within a sub tile that corresponds to the pin within the given port and its index in the port
-int find_pin(t_physical_tile_type_ptr type, std::string port_name, int pin_index_in_port);
+int find_pin(t_physical_tile_type_ptr type, const std::string& port_name, int pin_index_in_port);
 
 ///@brief Returns the maximum number of pins within a logical block
 int get_max_num_pins(t_logical_block_type_ptr logical_block);
@@ -434,8 +434,6 @@ int get_edge_sw_arch_idx(t_physical_tile_type_ptr physical_tile,
 const t_pb_graph_node* get_pb_graph_node_from_pin_physical_num(t_physical_tile_type_ptr physical_type,
                                                                int pin_physical_num);
 
-int get_total_num_sub_tile_internal_pins(const t_sub_tile* sub_tile);
-
 int get_tile_pin_max_ptc(t_physical_tile_type_ptr tile, bool is_flat);
 
 int get_tile_num_internal_pin(t_physical_tile_type_ptr tile);

From b06cceb5e3b4e78cb250d2de7d0309b92b312f02 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Sat, 30 Nov 2024 17:24:09 -0500
Subject: [PATCH 11/39] make get_port_by_name() a member function of t_sub_tile

---
 libs/libarchfpga/src/physical_types.cpp      | 10 ++++++++++
 libs/libarchfpga/src/physical_types.h        |  5 +++++
 libs/libarchfpga/src/physical_types_util.cpp | 10 ----------
 libs/libarchfpga/src/physical_types_util.h   |  5 -----
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/libs/libarchfpga/src/physical_types.cpp b/libs/libarchfpga/src/physical_types.cpp
index ff9baf87b3f..ac830d3a464 100644
--- a/libs/libarchfpga/src/physical_types.cpp
+++ b/libs/libarchfpga/src/physical_types.cpp
@@ -269,3 +269,13 @@ int t_sub_tile::total_num_internal_pins() const {
 
     return num_pins;
 }
+
+const t_physical_tile_port* t_sub_tile::get_port(std::string_view port_name) {
+    for (const t_physical_tile_port& port : ports) {
+        if (port_name == port.name) {
+            return &ports[port.index];
+        }
+    }
+
+    return nullptr;
+}
\ No newline at end of file
diff --git a/libs/libarchfpga/src/physical_types.h b/libs/libarchfpga/src/physical_types.h
index bf306021d45..a46650347f5 100644
--- a/libs/libarchfpga/src/physical_types.h
+++ b/libs/libarchfpga/src/physical_types.h
@@ -799,6 +799,11 @@ struct t_sub_tile {
 
   public:
     int total_num_internal_pins() const;
+
+    /**
+     * @brief Returns the physical tile port given the port name and the corresponding sub tile
+     */
+    const t_physical_tile_port* get_port(std::string_view port_name);
 };
 
 /** A logical pin defines the pin index of a logical block type (i.e. a top level PB type)
diff --git a/libs/libarchfpga/src/physical_types_util.cpp b/libs/libarchfpga/src/physical_types_util.cpp
index f23b2add270..d4fe4127928 100644
--- a/libs/libarchfpga/src/physical_types_util.cpp
+++ b/libs/libarchfpga/src/physical_types_util.cpp
@@ -841,16 +841,6 @@ std::vector<std::string> block_type_class_index_to_pin_names(t_physical_tile_typ
     return pin_names;
 }
 
-const t_physical_tile_port* get_port_by_name(t_sub_tile* sub_tile, const char* port_name) {
-    for (auto port : sub_tile->ports) {
-        if (0 == strcmp(port.name, port_name)) {
-            return &sub_tile->ports[port.index];
-        }
-    }
-
-    return nullptr;
-}
-
 const t_port* get_port_by_name(t_logical_block_type_ptr type, const char* port_name) {
     auto pb_type = type->pb_type;
 
diff --git a/libs/libarchfpga/src/physical_types_util.h b/libs/libarchfpga/src/physical_types_util.h
index 2a7ba563339..b5b28a79f99 100644
--- a/libs/libarchfpga/src/physical_types_util.h
+++ b/libs/libarchfpga/src/physical_types_util.h
@@ -286,11 +286,6 @@ int get_sub_tile_physical_pin(int sub_tile_index,
  */
 t_physical_tile_port find_tile_port_by_name(t_physical_tile_type_ptr type, std::string_view port_name);
 
-/**
- * @brief Returns the physical tile port given the port name and the corresponding sub tile
- */
-const t_physical_tile_port* get_port_by_name(t_sub_tile* sub_tile, const char* port_name);
-
 /**
  * @brief Returns the logical block port given the port name and the corresponding logical block type
  */

From 85dcb10ed5a59cccb8c3cca053e77ee9e34ffa2e Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Sat, 30 Nov 2024 17:30:05 -0500
Subject: [PATCH 12/39] add get_port to t_logical_block_type

---
 libs/libarchfpga/src/physical_types.cpp      | 11 +++++++++++
 libs/libarchfpga/src/physical_types.h        |  6 ++++++
 libs/libarchfpga/src/physical_types_util.cpp | 13 -------------
 libs/libarchfpga/src/physical_types_util.h   |  5 -----
 libs/libarchfpga/src/read_xml_arch_file.cpp  |  2 +-
 5 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/libs/libarchfpga/src/physical_types.cpp b/libs/libarchfpga/src/physical_types.cpp
index ac830d3a464..8b189fd7021 100644
--- a/libs/libarchfpga/src/physical_types.cpp
+++ b/libs/libarchfpga/src/physical_types.cpp
@@ -144,6 +144,17 @@ bool t_logical_block_type::is_empty() const {
     return name == std::string(EMPTY_BLOCK_NAME);
 }
 
+const t_port* t_logical_block_type::get_port(std::string_view port_name) const {
+    for (int i = 0; i < pb_type->num_ports; i++) {
+        const t_port& port = pb_type->ports[i]; // bind by reference: avoids copying the whole t_port on every iteration
+        if (port_name == port.name) {
+            return &pb_type->ports[port.index];
+        }
+    }
+
+    return nullptr;
+}
+
 /**
  * t_pb_graph_node
  */
diff --git a/libs/libarchfpga/src/physical_types.h b/libs/libarchfpga/src/physical_types.h
index a46650347f5..c5f3d39093e 100644
--- a/libs/libarchfpga/src/physical_types.h
+++ b/libs/libarchfpga/src/physical_types.h
@@ -958,6 +958,12 @@ struct t_logical_block_type {
 
     // Is this t_logical_block_type empty?
     bool is_empty() const;
+
+  public:
+    /**
+     * @brief Returns the logical block port given the port name and the corresponding logical block type
+     */
+    const t_port* get_port(std::string_view port_name) const;
 };
 
 /*************************************************************************************************
diff --git a/libs/libarchfpga/src/physical_types_util.cpp b/libs/libarchfpga/src/physical_types_util.cpp
index d4fe4127928..5d4edc65b21 100644
--- a/libs/libarchfpga/src/physical_types_util.cpp
+++ b/libs/libarchfpga/src/physical_types_util.cpp
@@ -841,19 +841,6 @@ std::vector<std::string> block_type_class_index_to_pin_names(t_physical_tile_typ
     return pin_names;
 }
 
-const t_port* get_port_by_name(t_logical_block_type_ptr type, const char* port_name) {
-    auto pb_type = type->pb_type;
-
-    for (int i = 0; i < pb_type->num_ports; i++) {
-        auto port = pb_type->ports[i];
-        if (0 == strcmp(port.name, port_name)) {
-            return &pb_type->ports[port.index];
-        }
-    }
-
-    return nullptr;
-}
-
 const t_physical_tile_port* get_port_by_pin(const t_sub_tile* sub_tile, int pin) {
     for (auto port : sub_tile->ports) {
         if (pin >= port.absolute_first_pin_index && pin < port.absolute_first_pin_index + port.num_pins) {
diff --git a/libs/libarchfpga/src/physical_types_util.h b/libs/libarchfpga/src/physical_types_util.h
index b5b28a79f99..ae9405ef44c 100644
--- a/libs/libarchfpga/src/physical_types_util.h
+++ b/libs/libarchfpga/src/physical_types_util.h
@@ -286,11 +286,6 @@ int get_sub_tile_physical_pin(int sub_tile_index,
  */
 t_physical_tile_port find_tile_port_by_name(t_physical_tile_type_ptr type, std::string_view port_name);
 
-/**
- * @brief Returns the logical block port given the port name and the corresponding logical block type
- */
-const t_port* get_port_by_name(t_logical_block_type_ptr type, const char* port_name);
-
 /**
  * @brief Returns the physical tile port given the pin name and the corresponding sub tile
  */
diff --git a/libs/libarchfpga/src/read_xml_arch_file.cpp b/libs/libarchfpga/src/read_xml_arch_file.cpp
index 3950eb1b15b..46cde415630 100644
--- a/libs/libarchfpga/src/read_xml_arch_file.cpp
+++ b/libs/libarchfpga/src/read_xml_arch_file.cpp
@@ -774,7 +774,7 @@ static std::pair<int, int> ProcessPinString(pugi::xml_node Locations,
                        "No port name is present: %s\n", pin_loc_string);
     }
 
-    auto port = get_port_by_name(type, token.data);
+    auto port = type->get_port(token.data);
     if (port == nullptr) {
         archfpga_throw(loc_data.filename_c_str(), loc_data.line(Locations),
                        "Port %s for %s could not be found: %s\n",

From 296b589e374ef3b49fe6a1ca8f9a219316a9b7b1 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Sat, 30 Nov 2024 17:43:23 -0500
Subject: [PATCH 13/39] add get_port_by_pin() to t_sub_tile and
 t_logical_block_type

---
 libs/libarchfpga/src/arch_check.cpp          |  6 ++---
 libs/libarchfpga/src/physical_types.cpp      | 21 +++++++++++++++++
 libs/libarchfpga/src/physical_types.h        | 10 ++++++++
 libs/libarchfpga/src/physical_types_util.cpp | 24 --------------------
 libs/libarchfpga/src/physical_types_util.h   | 10 --------
 5 files changed, 34 insertions(+), 37 deletions(-)

diff --git a/libs/libarchfpga/src/arch_check.cpp b/libs/libarchfpga/src/arch_check.cpp
index c8fb00299c4..5360d6e4c02 100644
--- a/libs/libarchfpga/src/arch_check.cpp
+++ b/libs/libarchfpga/src/arch_check.cpp
@@ -32,7 +32,7 @@ bool check_model_clocks(t_model* model, const char* file, uint32_t line) {
 bool check_model_combinational_sinks(const t_model* model, const char* file, uint32_t line) {
     //Outputs should have no combinational sinks
     for (t_model_ports* port = model->outputs; port != nullptr; port = port->next) {
-        if (port->combinational_sink_ports.size() != 0) {
+        if (!port->combinational_sink_ports.empty()) {
             archfpga_throw(file, line,
                            "Model '%s' output port '%s' can not have combinational sink ports",
                            model->name, port->name);
@@ -114,9 +114,9 @@ void check_port_direct_mappings(t_physical_tile_type_ptr physical_tile, t_sub_ti
     }
 
     for (auto pin_map : pin_direct_map) {
-        auto block_port = get_port_by_pin(logical_block, pin_map.first.pin);
+        const t_port* block_port = logical_block->get_port_by_pin(pin_map.first.pin);
 
-        auto sub_tile_port = get_port_by_pin(sub_tile, pin_map.second.pin);
+        const t_physical_tile_port* sub_tile_port = sub_tile->get_port_by_pin(pin_map.second.pin);
 
         VTR_ASSERT(block_port != nullptr);
         VTR_ASSERT(sub_tile_port != nullptr);
diff --git a/libs/libarchfpga/src/physical_types.cpp b/libs/libarchfpga/src/physical_types.cpp
index 8b189fd7021..79619d11df4 100644
--- a/libs/libarchfpga/src/physical_types.cpp
+++ b/libs/libarchfpga/src/physical_types.cpp
@@ -155,6 +155,17 @@ const t_port* t_logical_block_type::get_port(std::string_view port_name) const {
     return nullptr;
 }
 
+const t_port* t_logical_block_type::get_port_by_pin(int pin) const {
+    for (int i = 0; i < pb_type->num_ports; i++) {
+        const t_port& port = pb_type->ports[i];
+        if (pin >= port.absolute_first_pin_index && pin < port.absolute_first_pin_index + port.num_pins) {
+            return &pb_type->ports[port.index];
+        }
+    }
+
+    return nullptr;
+}
+
 /**
  * t_pb_graph_node
  */
@@ -288,5 +299,15 @@ const t_physical_tile_port* t_sub_tile::get_port(std::string_view port_name) {
         }
     }
 
+    return nullptr;
+}
+
+const t_physical_tile_port* t_sub_tile::get_port_by_pin(int pin) const {
+    for (const t_physical_tile_port& port : ports) {
+        if (pin >= port.absolute_first_pin_index && pin < port.absolute_first_pin_index + port.num_pins) {
+            return &ports[port.index];
+        }
+    }
+
     return nullptr;
 }
\ No newline at end of file
diff --git a/libs/libarchfpga/src/physical_types.h b/libs/libarchfpga/src/physical_types.h
index c5f3d39093e..a2fc676e305 100644
--- a/libs/libarchfpga/src/physical_types.h
+++ b/libs/libarchfpga/src/physical_types.h
@@ -804,6 +804,11 @@ struct t_sub_tile {
      * @brief Returns the physical tile port given the port name and the corresponding sub tile
      */
     const t_physical_tile_port* get_port(std::string_view port_name);
+
+    /**
+     * @brief Returns the physical tile port whose pin range contains the given pin index in this sub tile, or nullptr if no port contains it
+     */
+    const t_physical_tile_port* get_port_by_pin(int pin) const;
 };
 
 /** A logical pin defines the pin index of a logical block type (i.e. a top level PB type)
@@ -964,6 +969,11 @@ struct t_logical_block_type {
      * @brief Returns the logical block port given the port name and the corresponding logical block type
      */
     const t_port* get_port(std::string_view port_name) const;
+
+    /**
+     * @brief Returns the logical block port whose pin range contains the given pin index in this logical block type, or nullptr if no port contains it
+     */
+    const t_port* get_port_by_pin(int pin) const;
 };
 
 /*************************************************************************************************
diff --git a/libs/libarchfpga/src/physical_types_util.cpp b/libs/libarchfpga/src/physical_types_util.cpp
index 5d4edc65b21..1374a7f7055 100644
--- a/libs/libarchfpga/src/physical_types_util.cpp
+++ b/libs/libarchfpga/src/physical_types_util.cpp
@@ -841,29 +841,6 @@ std::vector<std::string> block_type_class_index_to_pin_names(t_physical_tile_typ
     return pin_names;
 }
 
-const t_physical_tile_port* get_port_by_pin(const t_sub_tile* sub_tile, int pin) {
-    for (auto port : sub_tile->ports) {
-        if (pin >= port.absolute_first_pin_index && pin < port.absolute_first_pin_index + port.num_pins) {
-            return &sub_tile->ports[port.index];
-        }
-    }
-
-    return nullptr;
-}
-
-const t_port* get_port_by_pin(t_logical_block_type_ptr type, int pin) {
-    auto pb_type = type->pb_type;
-
-    for (int i = 0; i < pb_type->num_ports; i++) {
-        auto port = pb_type->ports[i];
-        if (pin >= port.absolute_first_pin_index && pin < port.absolute_first_pin_index + port.num_pins) {
-            return &pb_type->ports[port.index];
-        }
-    }
-
-    return nullptr;
-}
-
 /* Access information related to pin classes */
 
 /** get information given class physical num **/
@@ -1506,4 +1483,3 @@ std::map<int, int> get_sink_choking_points(t_physical_tile_type_ptr physical_til
 
     return choking_point;
 }
-/* */
diff --git a/libs/libarchfpga/src/physical_types_util.h b/libs/libarchfpga/src/physical_types_util.h
index ae9405ef44c..8d2637ef048 100644
--- a/libs/libarchfpga/src/physical_types_util.h
+++ b/libs/libarchfpga/src/physical_types_util.h
@@ -286,16 +286,6 @@ int get_sub_tile_physical_pin(int sub_tile_index,
  */
 t_physical_tile_port find_tile_port_by_name(t_physical_tile_type_ptr type, std::string_view port_name);
 
-/**
- * @brief Returns the physical tile port given the pin name and the corresponding sub tile
- */
-const t_physical_tile_port* get_port_by_pin(const t_sub_tile* sub_tile, int pin);
-
-/**
- * @brief Returns the logical block port given the pin name and the corresponding logical block type
- */
-const t_port* get_port_by_pin(t_logical_block_type_ptr type, int pin);
-
 /************************************ Access to intra-block resources ************************************/
 
 /* Access information related to pin classes */

From c246372717e6f0a927ffbe1f37d371aeacb37532 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Sat, 30 Nov 2024 18:43:37 -0500
Subject: [PATCH 14/39] add PlacementDelayModelCreator class

---
 utils/route_diag/src/main.cpp                 |  50 +----
 vpr/src/base/place_and_route.cpp              |  30 +++
 vpr/src/base/place_and_route.h                |   6 +-
 vpr/src/noc/noc_routing_algorithm_creator.h   |   7 +-
 vpr/src/place/place.cpp                       |  18 +-
 .../PlacementDelayModelCreator.cpp            |  80 +++++++
 .../delay_model/PlacementDelayModelCreator.h  |  31 +++
 .../compute_delta_delays_utils.cpp            |  55 ++++-
 .../delay_model/compute_delta_delays_utils.h  |  31 ++-
 .../timing/delay_model/place_delay_model.cpp  |  24 +-
 .../timing/delay_model/place_delay_model.h    |  10 -
 vpr/src/place/timing_place_lookup.cpp         | 209 ------------------
 vpr/src/place/timing_place_lookup.h           |  45 ----
 vpr/src/route/router_delay_profiling.cpp      |   1 -
 vpr/src/util/vpr_utils.cpp                    |  27 +++
 vpr/src/util/vpr_utils.h                      |  21 +-
 16 files changed, 294 insertions(+), 351 deletions(-)
 create mode 100644 vpr/src/place/timing/delay_model/PlacementDelayModelCreator.cpp
 create mode 100644 vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h
 delete mode 100644 vpr/src/place/timing_place_lookup.cpp
 delete mode 100644 vpr/src/place/timing_place_lookup.h

diff --git a/utils/route_diag/src/main.cpp b/utils/route_diag/src/main.cpp
index 8b485916532..626b845d13a 100644
--- a/utils/route_diag/src/main.cpp
+++ b/utils/route_diag/src/main.cpp
@@ -37,7 +37,7 @@
 #include "route_export.h"
 #include "rr_graph.h"
 #include "rr_graph2.h"
-#include "timing_place_lookup.h"
+#include "compute_delta_delays_utils.h"
 
 struct t_route_util_options {
     /* Router diag tool Options */
@@ -238,36 +238,6 @@ static void profile_source(const Netlist<>& net_list,
     VTR_LOG("\n");
 }
 
-static t_chan_width setup_chan_width(const t_router_opts& router_opts,
-        t_chan_width_dist chan_width_dist) {
-    /*we give plenty of tracks, this increases routability for the */
-    /*lookup table generation */
-
-    t_graph_type graph_directionality;
-    int width_fac;
-
-    if (router_opts.fixed_channel_width == NO_FIXED_CHANNEL_WIDTH) {
-        auto& device_ctx = g_vpr_ctx.device();
-
-        auto type = find_most_common_tile_type(device_ctx.grid);
-
-        width_fac = 4 * type->num_pins;
-        /*this is 2x the value that binary search starts */
-        /*this should be enough to allow most pins to   */
-        /*connect to tracks in the architecture */
-    } else {
-        width_fac = router_opts.fixed_channel_width;
-    }
-
-    if (router_opts.route_type == GLOBAL) {
-        graph_directionality = GRAPH_BIDIR;
-    } else {
-        graph_directionality = GRAPH_UNIDIR;
-    }
-
-    return init_chan(width_fac, chan_width_dist, graph_directionality);
-}
-
 t_route_util_options read_route_util_options(int argc, const char** argv) {
     //Explicitly initialize for zero initialization
     t_route_util_options args = t_route_util_options();
@@ -323,17 +293,15 @@ int main(int argc, const char **argv) {
         const Netlist<>& net_list = is_flat ? (const Netlist<>&)g_vpr_ctx.atom().nlist :
                                             (const Netlist<>&)g_vpr_ctx.clustering().clb_nlist;
 
-        t_chan_width chan_width = setup_chan_width(
-                vpr_setup.RouterOpts,
-                Arch.Chans);
+        t_chan_width chan_width = setup_chan_width(vpr_setup.RouterOpts,
+                                                   Arch.Chans);
 
-        alloc_routing_structs(
-            chan_width,
-            vpr_setup.RouterOpts,
-            &vpr_setup.RoutingArch,
-            vpr_setup.Segments,
-            Arch.directs,
-            is_flat);
+        alloc_routing_structs(chan_width,
+                              vpr_setup.RouterOpts,
+                              &vpr_setup.RoutingArch,
+                              vpr_setup.Segments,
+                              Arch.directs,
+                              is_flat);
 
         if(route_options.profile_source) {
             profile_source(net_list,
diff --git a/vpr/src/base/place_and_route.cpp b/vpr/src/base/place_and_route.cpp
index ba7e20ccd80..2ffeb26c240 100644
--- a/vpr/src/base/place_and_route.cpp
+++ b/vpr/src/base/place_and_route.cpp
@@ -415,6 +415,36 @@ int binary_search_place_and_route(const Netlist<>& placement_net_list,
     return (final);
 }
 
+t_chan_width setup_chan_width(const t_router_opts& router_opts,
+                              t_chan_width_dist chan_width_dist) {
+    /*we give plenty of tracks, this increases routability for the */
+    /*lookup table generation */
+
+    t_graph_type graph_directionality;
+    int width_fac;
+
+    if (router_opts.fixed_channel_width == NO_FIXED_CHANNEL_WIDTH) {
+        auto& device_ctx = g_vpr_ctx.device();
+
+        auto type = find_most_common_tile_type(device_ctx.grid);
+
+        width_fac = 4 * type->num_pins;
+        /*this is 2x the value that binary search starts */
+        /*this should be enough to allow most pins to   */
+        /*connect to tracks in the architecture */
+    } else {
+        width_fac = router_opts.fixed_channel_width;
+    }
+
+    if (router_opts.route_type == GLOBAL) {
+        graph_directionality = GRAPH_BIDIR;
+    } else {
+        graph_directionality = GRAPH_UNIDIR;
+    }
+
+    return init_chan(width_fac, chan_width_dist, graph_directionality);
+}
+
 /**
  * @brief Assigns widths to channels (in tracks).
  *
diff --git a/vpr/src/base/place_and_route.h b/vpr/src/base/place_and_route.h
index 6f191c0ff9e..538996548f2 100644
--- a/vpr/src/base/place_and_route.h
+++ b/vpr/src/base/place_and_route.h
@@ -2,11 +2,9 @@
 #define VPR_PLACE_AND_ROUTE_H
 
 #define INFINITE -1
-#define NOT_FOUND 0
 
 #define WNEED 1
 #define WL 2
-#define PROC_TIME 3
 
 #include "vpr_types.h"
 #include "timing_info.h"
@@ -18,7 +16,6 @@ struct t_fmap_cell {
     int fc;         ///<at this fc
     int wneed;      ///<need wneed to route
     int wirelength; ///<corresponding wirelength of successful routing at wneed
-    int proc_time;
     t_fmap_cell* next;
 };
 
@@ -39,6 +36,9 @@ int binary_search_place_and_route(const Netlist<>& placement_net_list,
                                   const std::shared_ptr<RoutingDelayCalculator>& delay_calc,
                                   bool is_flat);
 
+t_chan_width setup_chan_width(const t_router_opts& router_opts,
+                              t_chan_width_dist chan_width_dist);
+
 t_chan_width init_chan(int cfactor,
                        const t_chan_width_dist& chan_width_dist,
                        t_graph_type graph_directionality);
diff --git a/vpr/src/noc/noc_routing_algorithm_creator.h b/vpr/src/noc/noc_routing_algorithm_creator.h
index 8cb9b777949..4c33d13f590 100644
--- a/vpr/src/noc/noc_routing_algorithm_creator.h
+++ b/vpr/src/noc/noc_routing_algorithm_creator.h
@@ -8,9 +8,10 @@
  * 
  * Overview
  * ========
- * There are a number of different available NoC routing algorithms. This class is a factory object for the NocRouting abstract class. This class constructs 
- * the appropriate routing algorithm based on the user specification in the
- * command line. The user identifies a 
+ * There are a number of different available NoC routing algorithms.
+ * This class is a factory object for the NocRouting abstract class.
+ * This class constructs the appropriate routing algorithm based on
+ * the user specification in the command line. The user identifies a
  * specific routing algorithm in the command line by providing a string
  * (which is the name of routing algorithm).
  * Then the corresponding routing algorithm is created here based on the 
diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp
index 3506d00b801..69e4e1895a0 100644
--- a/vpr/src/place/place.cpp
+++ b/vpr/src/place/place.cpp
@@ -13,7 +13,7 @@
 #include "read_xml_arch_file.h"
 #include "echo_files.h"
 #include "histogram.h"
-#include "place_delay_model.h"
+#include "PlacementDelayModelCreator.h"
 #include "move_utils.h"
 #include "buttons.h"
 
@@ -65,14 +65,14 @@ void try_place(const Netlist<>& net_list,
 
     if (placer_opts.place_algorithm.is_timing_driven()) {
         /*do this before the initial placement to avoid messing up the initial placement */
-        place_delay_model = alloc_lookups_and_delay_model(net_list,
-                                                          chan_width_dist,
-                                                          placer_opts,
-                                                          router_opts,
-                                                          det_routing_arch,
-                                                          segment_inf,
-                                                          directs,
-                                                          is_flat);
+        place_delay_model = PlacementDelayModelCreator::create_delay_model(placer_opts,
+                                                                           router_opts,
+                                                                           net_list,
+                                                                           det_routing_arch,
+                                                                           segment_inf,
+                                                                           chan_width_dist,
+                                                                           directs,
+                                                                           is_flat);
 
         if (isEchoFileEnabled(E_ECHO_PLACEMENT_DELTA_DELAY_MODEL)) {
             place_delay_model->dump_echo(getEchoFileName(E_ECHO_PLACEMENT_DELTA_DELAY_MODEL));
diff --git a/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.cpp b/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.cpp
new file mode 100644
index 00000000000..3482cd091e0
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.cpp
@@ -0,0 +1,80 @@
+
+
+#include "PlacementDelayModelCreator.h"
+
+#include "place_delay_model.h"
+#include "simple_delay_model.h"
+#include "delta_delay_model.h"
+#include "override_delay_model.h"
+
+#include "vtr_time.h"
+#include "physical_types.h"
+#include "place_and_route.h"
+
+static int get_longest_segment_length(std::vector<t_segment_inf>& segment_inf) {
+    int length = 0;
+
+    for (const t_segment_inf& seg_info : segment_inf) {
+        if (seg_info.length > length) {
+            length = seg_info.length;
+        }
+    }
+
+    return length;
+}
+
+std::unique_ptr<PlaceDelayModel>
+PlacementDelayModelCreator::create_delay_model(const t_placer_opts& placer_opts,
+                                               const t_router_opts& router_opts,
+                                               const Netlist<>& net_list,
+                                               t_det_routing_arch* det_routing_arch,
+                                               std::vector<t_segment_inf>& segment_inf,
+                                               t_chan_width_dist chan_width_dist,
+                                               const std::vector<t_direct_inf>& directs,
+                                               bool is_flat) {
+    vtr::ScopedStartFinishTimer timer("Computing placement delta delay look-up");
+
+    t_chan_width chan_width = setup_chan_width(router_opts, chan_width_dist);
+
+    alloc_routing_structs(chan_width, router_opts, det_routing_arch, segment_inf, directs, is_flat);
+
+    const RouterLookahead* router_lookahead = get_cached_router_lookahead(*det_routing_arch,
+                                                                          router_opts.lookahead_type,
+                                                                          router_opts.write_router_lookahead,
+                                                                          router_opts.read_router_lookahead,
+                                                                          segment_inf,
+                                                                          is_flat);
+
+    RouterDelayProfiler route_profiler(net_list, router_lookahead, is_flat);
+
+    int longest_length = get_longest_segment_length(segment_inf);
+
+    // now set up and compute the actual arrays
+    std::unique_ptr<PlaceDelayModel> place_delay_model;
+    float min_cross_layer_delay = get_min_cross_layer_delay();
+
+    if (placer_opts.delay_model_type == PlaceDelayModelType::SIMPLE) {
+        place_delay_model = std::make_unique<SimpleDelayModel>();
+    } else if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA) {
+        place_delay_model = std::make_unique<DeltaDelayModel>(min_cross_layer_delay, is_flat);
+    } else if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA_OVERRIDE) {
+        place_delay_model = std::make_unique<OverrideDelayModel>(min_cross_layer_delay, is_flat);
+    } else {
+        VTR_ASSERT_MSG(false, "Invalid placer delay model");
+    }
+
+    if (placer_opts.read_placement_delay_lookup.empty()) {
+        place_delay_model->compute(route_profiler, placer_opts, router_opts, longest_length);
+    } else {
+        place_delay_model->read(placer_opts.read_placement_delay_lookup);
+    }
+
+    if (!placer_opts.write_placement_delay_lookup.empty()) {
+        place_delay_model->write(placer_opts.write_placement_delay_lookup);
+    }
+
+    // free all data structures that are no longer needed
+    free_routing_structs();
+
+    return place_delay_model;
+}
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h b/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h
new file mode 100644
index 00000000000..37a8e0d51c8
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h
@@ -0,0 +1,31 @@
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "netlist.h"
+
+class PlaceDelayModel;
+struct t_placer_opts;
+struct t_router_opts;
+struct t_det_routing_arch;
+struct t_segment_inf;
+struct t_chan_width_dist;
+struct t_direct_inf;
+
+class PlacementDelayModelCreator {
+  public:
+    // nothing to do in the constructor and destructor
+    PlacementDelayModelCreator() = default;
+    ~PlacementDelayModelCreator() = default;
+
+    static std::unique_ptr<PlaceDelayModel> create_delay_model(const t_placer_opts& placer_opts,
+                                                               const t_router_opts& router_opts,
+                                                               const Netlist<>& net_list,
+                                                               t_det_routing_arch* det_routing_arch,
+                                                               std::vector<t_segment_inf>& segment_inf,
+                                                               t_chan_width_dist chan_width_dist,
+                                                               const std::vector<t_direct_inf>& directs,
+                                                               bool is_flat);
+};
diff --git a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
index 4630ddfcfb4..eb59195f055 100644
--- a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
+++ b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
@@ -5,7 +5,7 @@
 #include "vtr_math.h"
 #include "physical_types.h"
 #include "globals.h"
-#include "timing_place_lookup.h"
+#include "router_delay_profiling.h"
 
 /// Indicates the delta delay value has not been calculated
 static constexpr float UNINITIALIZED_DELTA = -1;
@@ -904,10 +904,7 @@ bool find_direct_connect_sample_locations(const t_direct_inf* direct,
     VTR_ASSERT(from_y + direct->y_offset == to_y);
     VTR_ASSERT(from_sub_tile + direct->sub_tile_offset == to_sub_tile);
 
-    //
-    //Find a source/sink RR node associated with the pins of the direct
-    //
-
+    // Find a source/sink RR node associated with the pins of the direct
     {
         RRNodeId src_rr_candidate = node_lookup.find_node(found_layer_num, from_x, from_y, SOURCE, from_pin_class);
         VTR_ASSERT(src_rr_candidate);
@@ -921,4 +918,52 @@ bool find_direct_connect_sample_locations(const t_direct_inf* direct,
     }
 
     return true;
+}
+
+std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type) {
+    std::vector<int> best_classes;
+
+    //Record any non-zero Fc pins
+    //
+    //Note that we track non-zero Fc pins, since certain Fc overrides
+    //may apply to only a subset of wire types. This ensures we record
+    //which pins can potentially connect to global routing.
+    std::unordered_set<int> non_zero_fc_pins;
+    for (const t_fc_specification& fc_spec : type->fc_specs) {
+        if (fc_spec.fc_value == 0) continue;
+
+        non_zero_fc_pins.insert(fc_spec.pins.begin(), fc_spec.pins.end());
+    }
+
+    // Collect all classes of matching type which connect to general routing
+    for (int i = 0; i < (int)type->class_inf.size(); i++) {
+        if (type->class_inf[i].type == pintype) {
+            //Check whether all pins in this class are ignored or have zero fc
+            bool any_pins_connect_to_general_routing = false;
+            for (int ipin = 0; ipin < type->class_inf[i].num_pins; ++ipin) {
+                int pin = type->class_inf[i].pinlist[ipin];
+                //If the pin isn't ignored, and has a non-zero Fc to some general
+                //routing the class is suitable for delay profiling
+                if (!type->is_ignored_pin[pin] && non_zero_fc_pins.count(pin)) {
+                    any_pins_connect_to_general_routing = true;
+                    break;
+                }
+            }
+
+            // Skip if the pin class doesn't connect to general routing
+            if (!any_pins_connect_to_general_routing) continue;
+
+            // Record candidate class
+            best_classes.push_back(i);
+        }
+    }
+
+    // Sort classes so the largest pin class is first
+    auto cmp_class = [&](int lhs, int rhs) {
+        return type->class_inf[lhs].num_pins > type->class_inf[rhs].num_pins;
+    };
+
+    std::stable_sort(best_classes.begin(), best_classes.end(), cmp_class);
+
+    return best_classes;
 }
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.h b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.h
index bacff650334..71ac632b149 100644
--- a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.h
+++ b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.h
@@ -24,4 +24,33 @@ bool find_direct_connect_sample_locations(const t_direct_inf* direct,
                                           int to_pin,
                                           int to_pin_class,
                                           RRNodeId& out_src_node,
-                                          RRNodeId& out_sink_node);
\ No newline at end of file
+                                          RRNodeId& out_sink_node);
+
+/**
+ * @brief Identifies the best pin classes for delay calculation based on pin count and connectivity.
+ *
+ * This function selects pin classes of a specified type (`pintype`) from a physical tile type (`type`)
+ * that are suitable for delay calculations. It prioritizes pin classes with the largest number of pins
+ * that connect to general routing, ensuring commonly used pins are chosen for delay profiling.
+ *
+ * @param pintype The type of pins to filter.
+ * @param type Pointer to the physical tile type containing pin and class information.
+ *
+ * @return A vector of indices representing the selected pin classes. The classes are sorted
+ *         in descending order based on the number of pins they contain.
+ *
+ * @details
+ * - A pin class is eligible if its type matches `pintype` and it contains at least one pin
+ *   that connects to general routing (non-zero Fc).
+ * - Non-zero Fc pins are determined by inspecting the tile's `fc_specs`.
+ * - Classes are sorted so that the class with the largest number of pins appears first.
+ *   If multiple classes have the same pin count, they keep their relative order by increasing
+ *   class index in `type->class_inf` (the sort is stable).
+ *
+ * @note
+ * - Pins explicitly marked as ignored in `type->is_ignored_pin` are excluded.
+ * - The function ensures stability in sorting, preserving the input order for classes
+ *   with the same number of pins.
+ */
+
+std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type);
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/place_delay_model.cpp b/vpr/src/place/timing/delay_model/place_delay_model.cpp
index a91547a7e5e..04267e0e5f1 100644
--- a/vpr/src/place/timing/delay_model/place_delay_model.cpp
+++ b/vpr/src/place/timing/delay_model/place_delay_model.cpp
@@ -6,33 +6,11 @@
 
 #include "place_delay_model.h"
 
-#include <queue>
-
 #include "globals.h"
 #include "router_lookahead_map.h"
-#include "timing_place_lookup.h"
 #include "placer_state.h"
 #include "vpr_error.h"
 
-///@brief Initialize the placer delay model.
-std::unique_ptr<PlaceDelayModel> alloc_lookups_and_delay_model(const Netlist<>& net_list,
-                                                               t_chan_width_dist chan_width_dist,
-                                                               const t_placer_opts& placer_opts,
-                                                               const t_router_opts& router_opts,
-                                                               t_det_routing_arch* det_routing_arch,
-                                                               std::vector<t_segment_inf>& segment_inf,
-                                                               const std::vector<t_direct_inf>& directs,
-                                                               bool is_flat) {
-    return compute_place_delay_model(placer_opts,
-                                     router_opts,
-                                     net_list,
-                                     det_routing_arch,
-                                     segment_inf,
-                                     chan_width_dist,
-                                     directs,
-                                     is_flat);
-}
-
 /**
  * @brief Returns the delay of one point to point connection.
  *
@@ -43,7 +21,7 @@ float comp_td_single_connection_delay(const PlaceDelayModel* delay_model,
                                       const vtr::vector_map<ClusterBlockId, t_block_loc>& block_locs,
                                       ClusterNetId net_id,
                                       int ipin) {
-    auto& cluster_ctx = g_vpr_ctx.clustering();
+    const auto& cluster_ctx = g_vpr_ctx.clustering();
 
     float delay_source_to_sink = 0.;
 
diff --git a/vpr/src/place/timing/delay_model/place_delay_model.h b/vpr/src/place/timing/delay_model/place_delay_model.h
index e361f8cc197..27c89591071 100644
--- a/vpr/src/place/timing/delay_model/place_delay_model.h
+++ b/vpr/src/place/timing/delay_model/place_delay_model.h
@@ -29,16 +29,6 @@
 class PlaceDelayModel;
 class PlacerState;
 
-///@brief Initialize the placer delay model.
-std::unique_ptr<PlaceDelayModel> alloc_lookups_and_delay_model(const Netlist<>& net_list,
-                                                               t_chan_width_dist chan_width_dist,
-                                                               const t_placer_opts& place_opts,
-                                                               const t_router_opts& router_opts,
-                                                               t_det_routing_arch* det_routing_arch,
-                                                               std::vector<t_segment_inf>& segment_inf,
-                                                               const std::vector<t_direct_inf>& directs,
-                                                               bool is_flat);
-
 ///@brief Returns the delay of one point to point connection.
 float comp_td_single_connection_delay(const PlaceDelayModel* delay_model,
                                       const vtr::vector_map<ClusterBlockId, t_block_loc>& block_locs,
diff --git a/vpr/src/place/timing_place_lookup.cpp b/vpr/src/place/timing_place_lookup.cpp
deleted file mode 100644
index f086283a3e7..00000000000
--- a/vpr/src/place/timing_place_lookup.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-
-#include "timing_place_lookup.h"
-
-#include "rr_graph_fwd.h"
-#include "vtr_assert.h"
-#include "vtr_ndmatrix.h"
-#include "vtr_log.h"
-#include "vtr_util.h"
-
-#include "vtr_time.h"
-
-#include "vpr_types.h"
-#include "globals.h"
-#include "place_and_route.h"
-#include "route_net.h"
-#include "read_xml_arch_file.h"
-#include "atom_netlist.h"
-
-#include "router_delay_profiling.h"
-#include "place_delay_model.h"
-#include "simple_delay_model.h"
-#include "delta_delay_model.h"
-#include "override_delay_model.h"
-
-
-/*** Function Prototypes *****/
-static t_chan_width setup_chan_width(const t_router_opts& router_opts,
-                                     t_chan_width_dist chan_width_dist);
-
-static int get_longest_segment_length(std::vector<t_segment_inf>& segment_inf);
-
-/******* Globally Accessible Functions **********/
-
-std::unique_ptr<PlaceDelayModel> compute_place_delay_model(const t_placer_opts& placer_opts,
-                                                           const t_router_opts& router_opts,
-                                                           const Netlist<>& net_list,
-                                                           t_det_routing_arch* det_routing_arch,
-                                                           std::vector<t_segment_inf>& segment_inf,
-                                                           t_chan_width_dist chan_width_dist,
-                                                           const std::vector<t_direct_inf>& directs,
-                                                           bool is_flat) {
-    vtr::ScopedStartFinishTimer timer("Computing placement delta delay look-up");
-
-    t_chan_width chan_width = setup_chan_width(router_opts, chan_width_dist);
-
-    alloc_routing_structs(chan_width, router_opts, det_routing_arch, segment_inf, directs, is_flat);
-
-    const RouterLookahead* router_lookahead = get_cached_router_lookahead(*det_routing_arch,
-                                                                          router_opts.lookahead_type,
-                                                                          router_opts.write_router_lookahead,
-                                                                          router_opts.read_router_lookahead,
-                                                                          segment_inf,
-                                                                          is_flat);
-
-    RouterDelayProfiler route_profiler(net_list, router_lookahead, is_flat);
-
-    int longest_length = get_longest_segment_length(segment_inf);
-
-    /*now setup and compute the actual arrays */
-    std::unique_ptr<PlaceDelayModel> place_delay_model;
-    float min_cross_layer_delay = get_min_cross_layer_delay();
-
-    if (placer_opts.delay_model_type == PlaceDelayModelType::SIMPLE) {
-        place_delay_model = std::make_unique<SimpleDelayModel>();
-    } else if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA) {
-        place_delay_model = std::make_unique<DeltaDelayModel>(min_cross_layer_delay, is_flat);
-    } else if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA_OVERRIDE) {
-        place_delay_model = std::make_unique<OverrideDelayModel>(min_cross_layer_delay, is_flat);
-    } else {
-        VTR_ASSERT_MSG(false, "Invalid placer delay model");
-    }
-
-    if (placer_opts.read_placement_delay_lookup.empty()) {
-        place_delay_model->compute(route_profiler, placer_opts, router_opts, longest_length);
-    } else {
-        place_delay_model->read(placer_opts.read_placement_delay_lookup);
-    }
-
-    if (!placer_opts.write_placement_delay_lookup.empty()) {
-        place_delay_model->write(placer_opts.write_placement_delay_lookup);
-    }
-
-    /*free all data structures that are no longer needed */
-    free_routing_structs();
-
-    return place_delay_model;
-}
-
-/******* File Accessible Functions **********/
-
-std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type) {
-    std::vector<int> best_classes;
-
-    //Record any non-zero Fc pins
-    //
-    //Note that we track non-zero Fc pins, since certain Fc overrides
-    //may apply to only a subset of wire types. This ensures we record
-    //which pins can potentially connect to global routing.
-    std::unordered_set<int> non_zero_fc_pins;
-    for (const t_fc_specification& fc_spec : type->fc_specs) {
-        if (fc_spec.fc_value == 0) continue;
-
-        non_zero_fc_pins.insert(fc_spec.pins.begin(), fc_spec.pins.end());
-    }
-
-    //Collect all classes of matching type which connect to general routing
-    for (int i = 0; i < (int)type->class_inf.size(); i++) {
-        if (type->class_inf[i].type == pintype) {
-            //Check whether all pins in this class are ignored or have zero fc
-            bool any_pins_connect_to_general_routing = false;
-            for (int ipin = 0; ipin < type->class_inf[i].num_pins; ++ipin) {
-                int pin = type->class_inf[i].pinlist[ipin];
-                //If the pin isn't ignored, and has a non-zero Fc to some general
-                //routing the class is suitable for delay profiling
-                if (!type->is_ignored_pin[pin] && non_zero_fc_pins.count(pin)) {
-                    any_pins_connect_to_general_routing = true;
-                    break;
-                }
-            }
-
-            //Skip if the pin class doesn't connect to general routing
-            if (!any_pins_connect_to_general_routing) continue;
-
-            //Record candidate class
-            best_classes.push_back(i);
-        }
-    }
-
-    //Sort classes so the largest pin class is first
-    auto cmp_class = [&](int lhs, int rhs) {
-        return type->class_inf[lhs].num_pins > type->class_inf[rhs].num_pins;
-    };
-
-    std::stable_sort(best_classes.begin(), best_classes.end(), cmp_class);
-
-    return best_classes;
-}
-
-static int get_longest_segment_length(std::vector<t_segment_inf>& segment_inf) {
-    int length = 0;
-
-    for (const t_segment_inf &seg_info : segment_inf) {
-        if (seg_info.length > length) {
-            length = seg_info.length;
-        }
-    }
-
-    return length;
-}
-
-static t_chan_width setup_chan_width(const t_router_opts& router_opts,
-                                     t_chan_width_dist chan_width_dist) {
-    /*we give plenty of tracks, this increases routability for the */
-    /*lookup table generation */
-
-    t_graph_type graph_directionality;
-    int width_fac;
-
-    if (router_opts.fixed_channel_width == NO_FIXED_CHANNEL_WIDTH) {
-        auto& device_ctx = g_vpr_ctx.device();
-
-        auto type = find_most_common_tile_type(device_ctx.grid);
-
-        width_fac = 4 * type->num_pins;
-        /*this is 2x the value that binary search starts */
-        /*this should be enough to allow most pins to   */
-        /*connect to tracks in the architecture */
-    } else {
-        width_fac = router_opts.fixed_channel_width;
-    }
-
-    if (router_opts.route_type == GLOBAL) {
-        graph_directionality = GRAPH_BIDIR;
-    } else {
-        graph_directionality = GRAPH_UNIDIR;
-    }
-
-    return init_chan(width_fac, chan_width_dist, graph_directionality);
-}
-
-bool directconnect_exists(RRNodeId src_rr_node, RRNodeId sink_rr_node) {
-    //Returns true if there is a directconnect between the two RR nodes
-    //
-    //This is checked by looking for a SOURCE -> OPIN -> IPIN -> SINK path
-    //which starts at src_rr_node and ends at sink_rr_node
-    auto& device_ctx = g_vpr_ctx.device();
-    const auto& rr_graph = device_ctx.rr_graph;
-
-    VTR_ASSERT(rr_graph.node_type(src_rr_node) == SOURCE && rr_graph.node_type(sink_rr_node) == SINK);
-
-    //TODO: This is a constant depth search, but still may be too slow
-    for (t_edge_size i_src_edge = 0; i_src_edge < rr_graph.num_edges(src_rr_node); ++i_src_edge) {
-        RRNodeId opin_rr_node = rr_graph.edge_sink_node(src_rr_node, i_src_edge);
-
-        if (rr_graph.node_type(opin_rr_node) != OPIN) continue;
-
-        for (t_edge_size i_opin_edge = 0; i_opin_edge < rr_graph.num_edges(opin_rr_node); ++i_opin_edge) {
-            RRNodeId ipin_rr_node = rr_graph.edge_sink_node(opin_rr_node, i_opin_edge);
-            if (rr_graph.node_type(ipin_rr_node) != IPIN) continue;
-
-            for (t_edge_size i_ipin_edge = 0; i_ipin_edge < rr_graph.num_edges(ipin_rr_node); ++i_ipin_edge) {
-                if (sink_rr_node == rr_graph.edge_sink_node(ipin_rr_node, i_ipin_edge)) {
-                    return true;
-                }
-            }
-        }
-    }
-    return false;
-}
\ No newline at end of file
diff --git a/vpr/src/place/timing_place_lookup.h b/vpr/src/place/timing_place_lookup.h
deleted file mode 100644
index 24cfc301ce6..00000000000
--- a/vpr/src/place/timing_place_lookup.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef TIMING_PLACE_LOOKUP_H
-#define TIMING_PLACE_LOOKUP_H
-#include "place_delay_model.h"
-
-std::unique_ptr<PlaceDelayModel> compute_place_delay_model(const t_placer_opts& placer_opts,
-                                                           const t_router_opts& router_opts,
-                                                           const Netlist<>& net_list,
-                                                           t_det_routing_arch* det_routing_arch,
-                                                           std::vector<t_segment_inf>& segment_inf,
-                                                           t_chan_width_dist chan_width_dist,
-                                                           const std::vector<t_direct_inf>& directs,
-                                                           bool is_flat);
-
-/**
- * @brief Identifies the best pin classes for delay calculation based on pin count and connectivity.
- *
- * This function selects pin classes of a specified type (`pintype`) from a physical tile type (`type`)
- * that are suitable for delay calculations. It prioritizes pin classes with the largest number of pins
- * that connect to general routing, ensuring commonly used pins are chosen for delay profiling.
- *
- * @param pintype The type of pins to filter.
- * @param type Pointer to the physical tile type containing pin and class information.
- *
- * @return A vector of indices representing the selected pin classes. The classes are sorted
- *         in descending order based on the number of pins they contain.
- *
- * @details
- * - A pin class is eligible if its type matches `pintype` and it contains at least one pin
- *   that connects to general routing (non-zero Fc).
- * - Non-zero Fc pins are determined by inspecting the tile's `fc_specs`.
- * - Classes are sorted so that the class with the largest number of pins appears first.
- *   If multiple classes have the same pin count, their order depends on their initial appearance
- *   in the architecture file.
- *
- * @note
- * - Pins explicitly marked as ignored in `type->is_ignored_pin` are excluded.
- * - The function ensures stability in sorting, preserving the input order for classes
- *   with the same number of pins.
- */
-
-std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type);
-
-bool directconnect_exists(RRNodeId src_rr_node, RRNodeId sink_rr_node);
-
-#endif
diff --git a/vpr/src/route/router_delay_profiling.cpp b/vpr/src/route/router_delay_profiling.cpp
index 5feb0e9b2f6..f9c4c1d74a8 100644
--- a/vpr/src/route/router_delay_profiling.cpp
+++ b/vpr/src/route/router_delay_profiling.cpp
@@ -6,7 +6,6 @@
 #include "route_tree.h"
 #include "rr_graph.h"
 #include "vtr_time.h"
-#include "draw.h"
 
 RouterDelayProfiler::RouterDelayProfiler(const Netlist<>& net_list,
                                          const RouterLookahead* lookahead,
diff --git a/vpr/src/util/vpr_utils.cpp b/vpr/src/util/vpr_utils.cpp
index c2aa98286c0..02446c67c05 100644
--- a/vpr/src/util/vpr_utils.cpp
+++ b/vpr/src/util/vpr_utils.cpp
@@ -1857,6 +1857,33 @@ bool node_in_same_physical_tile(RRNodeId node_first, RRNodeId node_second) {
     }
 }
 
+bool directconnect_exists(RRNodeId src_rr_node, RRNodeId sink_rr_node) {
+    const auto& device_ctx = g_vpr_ctx.device();
+    const auto& rr_graph = device_ctx.rr_graph;
+    // Precondition: src_rr_node must be a SOURCE node and sink_rr_node must be a SINK node.
+    VTR_ASSERT(rr_graph.node_type(src_rr_node) == SOURCE && rr_graph.node_type(sink_rr_node) == SINK);
+
+    // A direct connection is defined as a specific path: `SOURCE -> OPIN -> IPIN -> SINK`.
+    // TODO: This is a constant depth search, but still may be too slow
+    for (t_edge_size i_src_edge = 0; i_src_edge < rr_graph.num_edges(src_rr_node); ++i_src_edge) {
+        RRNodeId opin_rr_node = rr_graph.edge_sink_node(src_rr_node, i_src_edge);
+        // First hop: only edges that lead from the SOURCE to an OPIN are followed.
+        if (rr_graph.node_type(opin_rr_node) != OPIN) continue;
+
+        for (t_edge_size i_opin_edge = 0; i_opin_edge < rr_graph.num_edges(opin_rr_node); ++i_opin_edge) {
+            RRNodeId ipin_rr_node = rr_graph.edge_sink_node(opin_rr_node, i_opin_edge);
+            if (rr_graph.node_type(ipin_rr_node) != IPIN) continue;
+            // Last hop: the path exists if any edge leaving this IPIN ends at the requested SINK.
+            for (t_edge_size i_ipin_edge = 0; i_ipin_edge < rr_graph.num_edges(ipin_rr_node); ++i_ipin_edge) {
+                if (sink_rr_node == rr_graph.edge_sink_node(ipin_rr_node, i_ipin_edge)) {
+                    return true;
+                }
+            }
+        }
+    }
+    return false;
+}
+
 std::vector<int> get_cluster_netlist_intra_tile_classes_at_loc(int layer,
                                                                int i,
                                                                int j,
diff --git a/vpr/src/util/vpr_utils.h b/vpr/src/util/vpr_utils.h
index 8869cc55ddd..abaafadbfe7 100644
--- a/vpr/src/util/vpr_utils.h
+++ b/vpr/src/util/vpr_utils.h
@@ -264,9 +264,28 @@ RRNodeId get_class_rr_node_id(const RRSpatialLookup& rr_spatial_lookup,
                               const int j,
                               int class_physical_num);
 
-// Check whether the given nodes are in the same cluster
+/// @brief Check whether the given nodes are in the same cluster
 bool node_in_same_physical_tile(RRNodeId node_first, RRNodeId node_second);
 
+/**
+ * @brief Checks if a direct connection exists between two RR nodes.
+ *
+ * A direct connection is defined as a specific path: `SOURCE -> OPIN -> IPIN -> SINK`.
+ *
+ * @param src_rr_node The source RR node (must be of type `SOURCE`).
+ * @param sink_rr_node The sink RR node (must be of type `SINK`).
+ *
+ * @return `true` if a direct connection exists between the source and sink nodes;
+ *         otherwise, `false`.
+ *
+ * @details
+ * - The function performs a depth-limited search starting from the source node,
+ *   traversing through OPIN, IPIN, and finally checking if the path reaches the sink node.
+ * - Asserts (via VTR_ASSERT) that src_rr_node is a `SOURCE` node and sink_rr_node is a `SINK` node.
+ */
+
+bool directconnect_exists(RRNodeId src_rr_node, RRNodeId sink_rr_node);
+
 std::vector<int> get_cluster_netlist_intra_tile_classes_at_loc(int layer,
                                                                int i,
                                                                int j,

From c08e8cfd92fac61feaeaff8e0c3ab0da7234318e Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Sat, 30 Nov 2024 19:02:14 -0500
Subject: [PATCH 15/39] fix compilation errors

---
 utils/route_diag/src/main.cpp       | 8 +-------
 vpr/test/test_connection_router.cpp | 4 +---
 vpr/test/test_post_verilog.cpp      | 2 +-
 3 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/utils/route_diag/src/main.cpp b/utils/route_diag/src/main.cpp
index 626b845d13a..61b4bb644a3 100644
--- a/utils/route_diag/src/main.cpp
+++ b/utils/route_diag/src/main.cpp
@@ -9,13 +9,10 @@
 // Tool can either perform one route between a source (--source_rr_node) and
 // a sink (--sink_rr_node), or profile a source to all tiles (set
 // --source_rr_node and "--profile_source true").
-#include <cstdio>
-#include <cstring>
-#include <ctime>
+
 #include <fstream>
 
 #include "vtr_error.h"
-#include "vtr_memory.h"
 #include "vtr_log.h"
 #include "vtr_time.h"
 
@@ -28,15 +25,12 @@
 #include "globals.h"
 
 #include "net_delay.h"
-#include "RoutingDelayCalculator.h"
 #include "place_and_route.h"
 #include "router_delay_profiling.h"
 #include "route_tree.h"
 #include "route_common.h"
 #include "route_net.h"
-#include "route_export.h"
 #include "rr_graph.h"
-#include "rr_graph2.h"
 #include "compute_delta_delays_utils.h"
 
 struct t_route_util_options {
diff --git a/vpr/test/test_connection_router.cpp b/vpr/test/test_connection_router.cpp
index a106ad80a80..2b584daedc3 100644
--- a/vpr/test/test_connection_router.cpp
+++ b/vpr/test/test_connection_router.cpp
@@ -8,7 +8,6 @@
 #include "globals.h"
 #include "net_delay.h"
 #include "place_and_route.h"
-#include "timing_place_lookup.h"
 
 static constexpr const char kArchFile[] = "../../vtr_flow/arch/timing/k6_frac_N10_mem32K_40nm.xml";
 static constexpr int kMaxHops = 10;
@@ -188,8 +187,7 @@ TEST_CASE("connection_router", "[vpr]") {
 
     // Clean up
     free_routing_structs();
-    vpr_free_all(arch,
-                 vpr_setup);
+    vpr_free_all(arch, vpr_setup);
 }
 
 } // namespace
diff --git a/vpr/test/test_post_verilog.cpp b/vpr/test/test_post_verilog.cpp
index a8344fa79d4..ca1a250b7d2 100644
--- a/vpr/test/test_post_verilog.cpp
+++ b/vpr/test/test_post_verilog.cpp
@@ -1,7 +1,7 @@
 #include "catch2/catch_test_macros.hpp"
 
 #include "vpr_api.h"
-#include "timing_place_lookup.h"
+#include "router_delay_profiling.h"
 
 #include <fstream>
 #include <memory>

From 75a765810d290808434b762cc54a8a4d1c94a7b7 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Sun, 1 Dec 2024 15:42:19 -0500
Subject: [PATCH 16/39] add find_pin() and find_pin_class() to
 t_physical_tile_type

---
 libs/libarchfpga/src/physical_types.cpp       | 50 +++++++++++++++++
 libs/libarchfpga/src/physical_types.h         | 19 +++++--
 libs/libarchfpga/src/physical_types_util.cpp  | 53 +------------------
 libs/libarchfpga/src/physical_types_util.h    | 29 ++--------
 vpr/src/base/read_options.cpp                 |  2 +-
 .../compute_delta_delays_utils.cpp            | 19 ++++---
 .../delay_model/override_delay_model.cpp      | 18 +++----
 .../timing/delay_model/override_delay_model.h |  4 +-
 vpr/src/util/vpr_utils.cpp                    |  2 +-
 9 files changed, 90 insertions(+), 106 deletions(-)

diff --git a/libs/libarchfpga/src/physical_types.cpp b/libs/libarchfpga/src/physical_types.cpp
index 79619d11df4..bdacf50931d 100644
--- a/libs/libarchfpga/src/physical_types.cpp
+++ b/libs/libarchfpga/src/physical_types.cpp
@@ -136,6 +136,56 @@ bool t_physical_tile_type::is_empty() const {
     return name == std::string(EMPTY_BLOCK_NAME);
 }
 
+int t_physical_tile_type::find_pin(std::string_view port_name, int pin_index_in_port) const {
+    int ipin = OPEN;
+    int port_base_ipin = 0;
+    int num_port_pins = OPEN;
+    int pin_offset = 0;
+    // Scan the sub tiles in order for the first port named port_name, tracking where that port's pins start.
+    bool port_found = false;
+    for (const t_sub_tile& sub_tile : sub_tiles) {
+        for (const t_physical_tile_port& port : sub_tile.ports) {
+            if (port_name == port.name) {
+                port_found = true;
+                num_port_pins = port.num_pins;
+                break;
+            }
+            // Not the requested port: advance the base index past this port's pins.
+            port_base_ipin += port.num_pins;
+        }
+
+        if (port_found) {
+            break;
+        }
+        // No match in this sub tile: reset the per-sub-tile base and skip over all of its pins (num_phy_pins).
+        port_base_ipin = 0;
+        pin_offset += sub_tile.num_phy_pins;
+    }
+    // If no port matched, num_port_pins is still OPEN and the function returns OPEN.
+    if (num_port_pins != OPEN) {
+        VTR_ASSERT(pin_index_in_port < num_port_pins);
+        // NOTE(review): only the upper bound is asserted; a negative pin_index_in_port is not rejected here.
+        ipin = port_base_ipin + pin_index_in_port + pin_offset;
+    }
+
+    return ipin;
+}
+
+int t_physical_tile_type::find_pin_class(std::string_view port_name, int pin_index_in_port, e_pin_type pin_type) const {
+    int iclass = OPEN;
+    // Resolve the (port name, index in port) pair to a pin index; find_pin() returns OPEN when no port matches.
+    int ipin = find_pin(port_name, pin_index_in_port);
+
+    if (ipin != OPEN) {
+        iclass = pin_class[ipin];
+        // pin_type does not influence the lookup; it is only checked against the type of the class found.
+        if (iclass != OPEN) {
+            VTR_ASSERT(class_inf[iclass].type == pin_type);
+        }
+    }
+    return iclass;
+}
+
 /*
  * t_logical_block_type
  */
diff --git a/libs/libarchfpga/src/physical_types.h b/libs/libarchfpga/src/physical_types.h
index a2fc676e305..922b1f153f8 100644
--- a/libs/libarchfpga/src/physical_types.h
+++ b/libs/libarchfpga/src/physical_types.h
@@ -704,11 +704,7 @@ struct t_physical_tile_type {
      * tile_block_pin_directs_map[logical block index][logical block pin] -> physical tile pin */
     std::unordered_map<int, std::unordered_map<int, vtr::bimap<t_logical_pin, t_physical_pin>>> tile_block_pin_directs_map;
 
-    /* Returns the indices of pins that contain a clock for this physical logic block */
-    std::vector<int> get_clock_pins_indices() const;
 
-    // Returns the sub tile location of the physical tile given an input pin
-    int get_sub_tile_loc_from_pin(int pin_num) const;
 
     // TODO: Remove is_input_type / is_output_type as part of
     // https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1193
@@ -719,8 +715,21 @@ struct t_physical_tile_type {
     // Does this t_physical_tile_type contain an outpad?
     bool is_output_type = false;
 
-    // Is this t_physical_tile_type an empty type?
+  public:   // Function members
+    ///@brief Returns the indices of pins that contain a clock for this physical logic block
+    std::vector<int> get_clock_pins_indices() const;
+
+    ///@brief Returns the sub tile location of the physical tile given an input pin
+    int get_sub_tile_loc_from_pin(int pin_num) const;
+
+    ///@brief Is this t_physical_tile_type an empty type?
     bool is_empty() const;
+
+    ///@brief Returns the tile pin index of pin pin_index_in_port in the first port named port_name (offset past the pins of preceding sub tiles and ports), or OPEN if no such port exists
+    int find_pin(std::string_view port_name, int pin_index_in_port) const;
+
+    ///@brief Returns the pin class of pin pin_index_in_port within the port port_name on this tile type, or OPEN if none is found; asserts that the class has type pin_type
+    int find_pin_class(std::string_view port_name, int pin_index_in_port, e_pin_type pin_type) const;
 };
 
 /* Holds the capacity range of a certain sub_tile block within the parent physical tile type.
diff --git a/libs/libarchfpga/src/physical_types_util.cpp b/libs/libarchfpga/src/physical_types_util.cpp
index 1374a7f7055..2ecc7fbd41c 100644
--- a/libs/libarchfpga/src/physical_types_util.cpp
+++ b/libs/libarchfpga/src/physical_types_util.cpp
@@ -563,57 +563,6 @@ int get_max_num_pins(t_logical_block_type_ptr logical_block) {
     return max_num_pins;
 }
 
-//Returns the pin class associated with the specified pin_index_in_port within the port port_name on type
-int find_pin_class(t_physical_tile_type_ptr type, const std::string& port_name, int pin_index_in_port, e_pin_type pin_type) {
-    int iclass = OPEN;
-
-    int ipin = find_pin(type, port_name, pin_index_in_port);
-
-    if (ipin != OPEN) {
-        iclass = type->pin_class[ipin];
-
-        if (iclass != OPEN) {
-            VTR_ASSERT(type->class_inf[iclass].type == pin_type);
-        }
-    }
-    return iclass;
-}
-
-int find_pin(t_physical_tile_type_ptr type, const std::string& port_name, int pin_index_in_port) {
-    int ipin = OPEN;
-    int port_base_ipin = 0;
-    int num_pins = OPEN;
-    int pin_offset = 0;
-
-    bool port_found = false;
-    for (const auto& sub_tile : type->sub_tiles) {
-        for (const auto& port : sub_tile.ports) {
-            if (0 == strcmp(port.name, port_name.c_str())) {
-                port_found = true;
-                num_pins = port.num_pins;
-                break;
-            }
-
-            port_base_ipin += port.num_pins;
-        }
-
-        if (port_found) {
-            break;
-        }
-
-        port_base_ipin = 0;
-        pin_offset += sub_tile.num_phy_pins;
-    }
-
-    if (num_pins != OPEN) {
-        VTR_ASSERT(pin_index_in_port < num_pins);
-
-        ipin = port_base_ipin + pin_index_in_port + pin_offset;
-    }
-
-    return ipin;
-}
-
 std::pair<int, int> get_capacity_location_from_physical_pin(t_physical_tile_type_ptr physical_tile, int pin) {
     int pins_to_remove = 0;
     for (const auto& sub_tile : physical_tile->sub_tiles) {
@@ -638,7 +587,7 @@ std::pair<int, int> get_capacity_location_from_physical_pin(t_physical_tile_type
 
 int get_physical_pin_from_capacity_location(t_physical_tile_type_ptr physical_tile, int relative_pin, int capacity_location) {
     int pins_to_add = 0;
-    for (auto sub_tile : physical_tile->sub_tiles) {
+    for (const t_sub_tile& sub_tile : physical_tile->sub_tiles) {
         auto capacity = sub_tile.capacity;
         int rel_capacity = capacity_location - capacity.low;
         int num_inst_pins = sub_tile.num_phy_pins / capacity.total();
diff --git a/libs/libarchfpga/src/physical_types_util.h b/libs/libarchfpga/src/physical_types_util.h
index 8d2637ef048..94bc15e8082 100644
--- a/libs/libarchfpga/src/physical_types_util.h
+++ b/libs/libarchfpga/src/physical_types_util.h
@@ -13,11 +13,11 @@
  *  functions in this file are the following:                       *
  *    - physical_tile_type: identifies a placeable tile within      *
  *                          the device grid.                        *
- *    - logical_block_tpye: identifies a clustered block type       *
+ *    - logical_block_type: identifies a clustered block type       *
  *                          within the clb_netlist                  *
  *                                                                  *
  *  All the following utilities are intended to ease the            *
- *  developement to access the above mentioned classes and perform  *
+ *  development to access the above mentioned classes and perform   *
  *  some required operations with their data.                       *
  *                                                                  *
  *  Please classify such functions in this file                     *
@@ -107,7 +107,7 @@
  *
  * For instance, the following information are required:
  *   - mapping between logical and sub tile pins.
- *   - mapping between sub tile pins and absoulte physical pin
+ *   - mapping between sub tile pins and absolute physical pin
  *   - capacity instance of the sub tile
  *
  * With all the above information we can calculate correctly the connection between the CLK (logical pin)
@@ -173,11 +173,6 @@ std::vector<std::string> block_type_class_index_to_pin_names(t_physical_tile_typ
 ///@brief Returns the physical tile type matching a given physical tile type name, or nullptr (if not found)
 t_physical_tile_type_ptr find_tile_type_by_name(const std::string& name, const std::vector<t_physical_tile_type>& types);
 
-int find_pin_class(t_physical_tile_type_ptr type, const std::string& port_name, int pin_index_in_port, e_pin_type pin_type);
-
-///@brief Returns the relative pin index within a sub tile that corresponds to the pin within the given port and its index in the port
-int find_pin(t_physical_tile_type_ptr type, const std::string& port_name, int pin_index_in_port);
-
 ///@brief Returns the maximum number of pins within a logical block
 int get_max_num_pins(t_logical_block_type_ptr logical_block);
 
@@ -316,12 +311,6 @@ inline bool is_class_on_tile(t_physical_tile_type_ptr physical_tile, int class_p
 
 /**
  * @brief Classes are indexed in a way that the number of classes on the same pb_graph_node is continuous
- * @param physical_tile
- * @param sub_tile
- * @param logical_block
- * @param sub_tile_relative_cap
- * @param pb_graph_node
- * @return
  */
 t_class_range get_pb_graph_node_class_physical_range(t_physical_tile_type_ptr physical_tile,
                                                      const t_sub_tile* sub_tile,
@@ -338,15 +327,11 @@ std::vector<int> get_tile_root_classes(t_physical_tile_type_ptr physical_type);
 
 /**
  * Get the number of all classes, on the tile and inside the cluster.
- * @param physical_type
- * @return
  */
 t_class_range get_flat_tile_primitive_classes(t_physical_tile_type_ptr physical_type);
 /** **/
 int get_tile_class_max_ptc(t_physical_tile_type_ptr tile, bool is_flat);
 
-/*  */
-
 /* Access information related to pins */
 
 /** get information given pin physical number **/
@@ -437,11 +422,6 @@ float get_pin_primitive_comb_delay(t_physical_tile_type_ptr physical_type,
 
 /**
  * @brief This function is used during reachability analysis to check whether two classes should be put in the same group
- * @param physical_tile
- * @param first_class_ptc_num
- * @param second_class_ptc_num
- * @param is_flat
- * @return
  */
 bool classes_in_same_block(t_physical_tile_type_ptr physical_tile,
                            int first_class_ptc_num,
@@ -451,9 +431,6 @@ bool classes_in_same_block(t_physical_tile_type_ptr physical_tile,
 /**
  * @brief Given the sink group, identify the pins which can reach both sink_ptc_num and at least one of the sinks,
  * in the grp.
- * @param physical_tile
- * @param sink_ptc_num
- * @param grp
  * @return Key is the pin number and value is the number of sinks, including sink_ptc_num, in the grp reachable by the pin
  */
 std::map<int, int> get_sink_choking_points(t_physical_tile_type_ptr physical_tile,
diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp
index 78124dd85c3..145601ac66f 100644
--- a/vpr/src/base/read_options.cpp
+++ b/vpr/src/base/read_options.cpp
@@ -2295,7 +2295,7 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
         .show_in(argparse::ShowIn::HELP_ONLY);
 
     place_timing_grp.add_argument(args.post_place_timing_report_file, "--post_place_timing_report")
-        .help("Name of the post-placement timing report file (not generated if unspecfied)")
+        .help("Name of the post-placement timing report file (not generated if unspecified)")
         .default_value("")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
diff --git a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
index eb59195f055..725159406c0 100644
--- a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
+++ b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
@@ -198,7 +198,7 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
 
     std::set<std::string> allowed_types;
     if (!placer_opts.allowed_tiles_for_delay_model.empty()) {
-        auto allowed_types_vector = vtr::split(placer_opts.allowed_tiles_for_delay_model, ",");
+        std::vector<std::string> allowed_types_vector = vtr::split(placer_opts.allowed_tiles_for_delay_model, ",");
         allowed_types = std::set(allowed_types_vector.begin(), allowed_types_vector.end());
     }
 
@@ -206,7 +206,7 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
         for (int to_layer_num = 0; to_layer_num < (int)num_layers; to_layer_num++) {
             vtr::NdMatrix<std::vector<float>, 2> sampled_delta_delays({device_width, device_height});
 
-            //Find the lowest y location on the left edge with a non-empty block
+            // Find the lowest y location on the left edge with a non-empty block
             int y = 0;
             int x = 0;
             t_physical_tile_type_ptr src_type = nullptr;
@@ -223,7 +223,7 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
                         break;
                     }
                 }
-                if (src_type) {
+                if (src_type != nullptr) {
                     break;
                 }
             }
@@ -243,10 +243,10 @@ static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_p
                                    measure_directconnect, allowed_types,
                                    is_flat);
 
-            //Find the lowest x location on the bottom edge with a non-empty block
+            // Find the lowest x location on the bottom edge with a non-empty block
             src_type = nullptr;
-            for (y = 0; y < (int)grid.height(); ++y) {
-                for (x = 0; x < (int)grid.width(); ++x) {
+            for (y = 0; y < (int)device_height; ++y) {
+                for (x = 0; x < (int)device_width; ++x) {
                     t_physical_tile_type_ptr type = grid.get_physical_type({x, y, from_layer_num});
 
                     if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
@@ -458,8 +458,7 @@ static void generic_compute_matrix_iterative_astar(RouterDelayProfiler& route_pr
 #endif
                 }
             } else {
-                //Valid start/end
-
+                // Valid start/end
                 float delay = route_connection_delay(route_profiler,
                                                      source_x,
                                                      source_y,
@@ -553,7 +552,7 @@ static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& /*rou
                 t_physical_tile_type_ptr sink_type = device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num});
                 if (sink_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
                     if (matrix[delta_x][delta_y].empty()) {
-                        //Only set empty target if we don't already have a valid delta delay
+                        // Only set empty target if we don't already have a valid delta delay
                         matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
 #ifdef VERBOSE
                         VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
@@ -575,7 +574,7 @@ static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& /*rou
                             continue;
 
                         if (!measure_directconnect && directconnect_exists(source_rr_node, sink_rr_node)) {
-                            //Skip if we shouldn't measure direct connects and a direct connect exists
+                            // Skip if we shouldn't measure direct connects and a direct connect exists
                             continue;
                         }
 
diff --git a/vpr/src/place/timing/delay_model/override_delay_model.cpp b/vpr/src/place/timing/delay_model/override_delay_model.cpp
index 33106acb208..d496a43b5e7 100644
--- a/vpr/src/place/timing/delay_model/override_delay_model.cpp
+++ b/vpr/src/place/timing/delay_model/override_delay_model.cpp
@@ -24,17 +24,17 @@ void OverrideDelayModel::compute(RouterDelayProfiler& route_profiler,
 
     base_delay_model_ = std::make_unique<DeltaDelayModel>(cross_layer_delay_, delays, false);
 
-    compute_override_delay_model(route_profiler, router_opts);
+    compute_override_delay_model_(route_profiler, router_opts);
 }
 
-void OverrideDelayModel::compute_override_delay_model(RouterDelayProfiler& route_profiler,
-                                                      const t_router_opts& router_opts) {
+void OverrideDelayModel::compute_override_delay_model_(RouterDelayProfiler& route_profiler,
+                                                       const t_router_opts& router_opts) {
+    const auto& device_ctx = g_vpr_ctx.device();
     t_router_opts router_opts2 = router_opts;
     router_opts2.astar_fac = 0.f;
     router_opts2.astar_offset = 0.f;
 
-    //Look at all the direct connections that exist, and add overrides to delay model
-    auto& device_ctx = g_vpr_ctx.device();
+    // Look at all the direct connections that exist, and add overrides to delay model
     for (int idirect = 0; idirect < (int)device_ctx.arch->directs.size(); ++idirect) {
         const t_direct_inf* direct = &device_ctx.arch->directs[idirect];
 
@@ -61,16 +61,16 @@ void OverrideDelayModel::compute_override_delay_model(RouterDelayProfiler& route
         std::set<std::pair<RRNodeId, RRNodeId>> sampled_rr_pairs;
         for (int iconn = 0; iconn < num_conns; ++iconn) {
             //Find the associated pins
-            int from_pin = find_pin(from_type, from_port.port_name(), from_port.port_low_index() + iconn);
-            int to_pin = find_pin(to_type, to_port.port_name(), to_port.port_low_index() + iconn);
+            int from_pin = from_type->find_pin(from_port.port_name(), from_port.port_low_index() + iconn);
+            int to_pin = to_type->find_pin(to_port.port_name(), to_port.port_low_index() + iconn);
 
             VTR_ASSERT(from_pin != OPEN);
             VTR_ASSERT(to_pin != OPEN);
 
-            int from_pin_class = find_pin_class(from_type, from_port.port_name(), from_port.port_low_index() + iconn, DRIVER);
+            int from_pin_class = from_type->find_pin_class(from_port.port_name(), from_port.port_low_index() + iconn, DRIVER);
             VTR_ASSERT(from_pin_class != OPEN);
 
-            int to_pin_class = find_pin_class(to_type, to_port.port_name(), to_port.port_low_index() + iconn, RECEIVER);
+            int to_pin_class = to_type->find_pin_class(to_port.port_name(), to_port.port_low_index() + iconn, RECEIVER);
             VTR_ASSERT(to_pin_class != OPEN);
 
             bool found_sample_points;
diff --git a/vpr/src/place/timing/delay_model/override_delay_model.h b/vpr/src/place/timing/delay_model/override_delay_model.h
index 23f6d01d709..5965261c272 100644
--- a/vpr/src/place/timing/delay_model/override_delay_model.h
+++ b/vpr/src/place/timing/delay_model/override_delay_model.h
@@ -41,8 +41,8 @@ class OverrideDelayModel : public PlaceDelayModel {
     /// Indicates whether the router is a two-stage or run-flat
     bool is_flat_;
 
-    void compute_override_delay_model(RouterDelayProfiler& router,
-                                      const t_router_opts& router_opts);
+    void compute_override_delay_model_(RouterDelayProfiler& router,
+                                       const t_router_opts& router_opts);
 
     /**
      * @brief Structure that allows delays to be queried from the delay model.
diff --git a/vpr/src/util/vpr_utils.cpp b/vpr/src/util/vpr_utils.cpp
index 02446c67c05..430b386562f 100644
--- a/vpr/src/util/vpr_utils.cpp
+++ b/vpr/src/util/vpr_utils.cpp
@@ -708,7 +708,7 @@ InstPort parse_inst_port(const std::string& str) {
         VPR_FATAL_ERROR(VPR_ERROR_ARCH, "Failed to find block type named %s", inst_port.instance_name().c_str());
     }
 
-    int num_pins = find_tile_port_by_name(blk_type, inst_port.port_name().c_str()).num_pins;
+    int num_pins = find_tile_port_by_name(blk_type, inst_port.port_name()).num_pins;
 
     if (num_pins == OPEN) {
         VPR_FATAL_ERROR(VPR_ERROR_ARCH, "Failed to find port %s on block type %s", inst_port.port_name().c_str(), inst_port.instance_name().c_str());

From eecfde287ac033495abecc3c0f219adbf0093d84 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Sun, 1 Dec 2024 15:58:15 -0500
Subject: [PATCH 17/39] move timing_place.cpp/.h and place_timing_update.cpp/.h
 to place/timing directory

---
 libs/libarchfpga/src/physical_types.h              |  5 +----
 libs/libarchfpga/src/physical_types_util.h         | 14 +++++---------
 vpr/src/place/{ => timing}/place_timing_update.cpp |  0
 vpr/src/place/{ => timing}/place_timing_update.h   |  0
 vpr/src/place/{ => timing}/timing_place.cpp        |  0
 vpr/src/place/{ => timing}/timing_place.h          |  0
 6 files changed, 6 insertions(+), 13 deletions(-)
 rename vpr/src/place/{ => timing}/place_timing_update.cpp (100%)
 rename vpr/src/place/{ => timing}/place_timing_update.h (100%)
 rename vpr/src/place/{ => timing}/timing_place.cpp (100%)
 rename vpr/src/place/{ => timing}/timing_place.h (100%)

diff --git a/libs/libarchfpga/src/physical_types.h b/libs/libarchfpga/src/physical_types.h
index 922b1f153f8..c11f1c451ee 100644
--- a/libs/libarchfpga/src/physical_types.h
+++ b/libs/libarchfpga/src/physical_types.h
@@ -24,8 +24,7 @@
  * Authors: Jason Luu and Kenneth Kent
  */
 
-#ifndef PHYSICAL_TYPES_H
-#define PHYSICAL_TYPES_H
+#pragma once
 
 #include <functional>
 #include <utility>
@@ -2157,5 +2156,3 @@ struct t_arch {
     /// Stores NoC-related architectural information when there is an embedded NoC
     t_noc_inf* noc = nullptr;
 };
-
-#endif
diff --git a/libs/libarchfpga/src/physical_types_util.h b/libs/libarchfpga/src/physical_types_util.h
index 94bc15e8082..d4d5dc55924 100644
--- a/libs/libarchfpga/src/physical_types_util.h
+++ b/libs/libarchfpga/src/physical_types_util.h
@@ -1,5 +1,5 @@
-#ifndef PHYSICAL_TYPES_UTIL_H
-#define PHYSICAL_TYPES_UTIL_H
+
+#pragma once
 
 #include "physical_types.h"
 
@@ -152,12 +152,12 @@ int get_physical_pin_from_capacity_location(t_physical_tile_type_ptr physical_ti
  *
  * Take the above CLOCK TILE example:
  *   - given the CLOCK TILE and the index corresponding to the CLK_1 pin, we want the relative pin
- *     of one of its sub tiles at a particualr capacity location (i.e. sub tile instance).
+ *     of one of its sub tiles at a particular capacity location (i.e. sub tile instance).
  *
  * std::tie(absolute_capacity, relative_pin) = get_capacity_location_from_physical_pin(clock_tile, 3)
  *
  * The value returned is (1, 0), where:
- *   - 1 corresponds to the capacity location (sub tile instance) where the absoulte physical pin index (CLK_1) is connected
+ *   - 1 corresponds to the capacity location (sub tile instance) where the absolute physical pin index (CLK_1) is connected
  *   - 0 corresponds to the relative pin index within the BUFGCTRL sub tile
  */
 std::pair<int, int> get_capacity_location_from_physical_pin(t_physical_tile_type_ptr physical_tile, int pin);
@@ -223,7 +223,7 @@ int get_physical_pin(t_physical_tile_type_ptr physical_tile,
                      int pin);
 /**
  * @brief Returns the physical pin index (within 'physical_tile') corresponding to the
- * logical index ('pin' of the first instance of 'logical_block' within the physcial tile.
+ * logical index ('pin') of the first instance of 'logical_block' within the physical tile.
  * This function considers if a given offset is in the range of sub tile capacity
  *
  *   (First pin index at current sub-tile)                                     (The wanted pin index)
@@ -436,7 +436,3 @@ bool classes_in_same_block(t_physical_tile_type_ptr physical_tile,
 std::map<int, int> get_sink_choking_points(t_physical_tile_type_ptr physical_tile,
                                            int sink_ptc_num,
                                            const std::vector<int>& grp);
-
-/* */
-
-#endif
diff --git a/vpr/src/place/place_timing_update.cpp b/vpr/src/place/timing/place_timing_update.cpp
similarity index 100%
rename from vpr/src/place/place_timing_update.cpp
rename to vpr/src/place/timing/place_timing_update.cpp
diff --git a/vpr/src/place/place_timing_update.h b/vpr/src/place/timing/place_timing_update.h
similarity index 100%
rename from vpr/src/place/place_timing_update.h
rename to vpr/src/place/timing/place_timing_update.h
diff --git a/vpr/src/place/timing_place.cpp b/vpr/src/place/timing/timing_place.cpp
similarity index 100%
rename from vpr/src/place/timing_place.cpp
rename to vpr/src/place/timing/timing_place.cpp
diff --git a/vpr/src/place/timing_place.h b/vpr/src/place/timing/timing_place.h
similarity index 100%
rename from vpr/src/place/timing_place.h
rename to vpr/src/place/timing/timing_place.h

From 4aa3d744ca7825541866d2eef207728f9e985fb7 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Sun, 1 Dec 2024 16:55:02 -0500
Subject: [PATCH 18/39] add files for PlacerSetupSlacks and PlacerCriticalities

---
 vpr/src/place/annealer.cpp                    |   2 +
 vpr/src/place/annealer.h                      |   1 +
 .../place/move_generators/move_generator.h    |   2 +-
 vpr/src/place/net_cost_handler.cpp            |   1 +
 vpr/src/place/net_cost_handler.h              |   1 +
 vpr/src/place/place_checkpoint.cpp            |   4 +
 vpr/src/place/placer.h                        |   3 +
 ...ming_place.cpp => PlacerCriticalities.cpp} | 124 +--------
 vpr/src/place/timing/PlacerCriticalities.h    | 160 +++++++++++
 vpr/src/place/timing/PlacerSetupSlacks.cpp    | 109 ++++++++
 vpr/src/place/timing/PlacerSetupSlacks.h      | 108 ++++++++
 vpr/src/place/timing/place_timing_update.cpp  |   9 +-
 vpr/src/place/timing/place_timing_update.h    |  11 +-
 vpr/src/place/timing/timing_place.h           | 261 +-----------------
 14 files changed, 414 insertions(+), 382 deletions(-)
 rename vpr/src/place/timing/{timing_place.cpp => PlacerCriticalities.cpp} (60%)
 create mode 100644 vpr/src/place/timing/PlacerCriticalities.h
 create mode 100644 vpr/src/place/timing/PlacerSetupSlacks.cpp
 create mode 100644 vpr/src/place/timing/PlacerSetupSlacks.h

diff --git a/vpr/src/place/annealer.cpp b/vpr/src/place/annealer.cpp
index b18f60b27bd..42fd3356709 100644
--- a/vpr/src/place/annealer.cpp
+++ b/vpr/src/place/annealer.cpp
@@ -16,6 +16,8 @@
 #include "read_place.h"
 #include "placer_breakpoint.h"
 #include "RL_agent_util.h"
+#include "PlacerSetupSlacks.h"
+#include "PlacerCriticalities.h"
 
 /**************************************************************************/
 /*************** Static Function Declarations *****************************/
diff --git a/vpr/src/place/annealer.h b/vpr/src/place/annealer.h
index fd9b0dbd928..f788aea666d 100644
--- a/vpr/src/place/annealer.h
+++ b/vpr/src/place/annealer.h
@@ -17,6 +17,7 @@ enum class e_agent_state;
 
 class NocCostHandler;
 class NetPinTimingInvalidator;
+class PlacerSetupSlacks;
 
 /**
  * These variables keep track of the number of swaps
diff --git a/vpr/src/place/move_generators/move_generator.h b/vpr/src/place/move_generators/move_generator.h
index e39493e16c6..5ca0b4ce1f5 100644
--- a/vpr/src/place/move_generators/move_generator.h
+++ b/vpr/src/place/move_generators/move_generator.h
@@ -3,7 +3,7 @@
 
 #include "vpr_types.h"
 #include "move_utils.h"
-#include "timing_place.h"
+#include "PlacerCriticalities.h"
 
 #include <limits>
 
diff --git a/vpr/src/place/net_cost_handler.cpp b/vpr/src/place/net_cost_handler.cpp
index ac049995347..e2a8e902e31 100644
--- a/vpr/src/place/net_cost_handler.cpp
+++ b/vpr/src/place/net_cost_handler.cpp
@@ -34,6 +34,7 @@
 #include "vtr_math.h"
 #include "vtr_ndmatrix.h"
 #include "vtr_ndoffsetmatrix.h"
+#include "PlacerCriticalities.h"
 
 #include <array>
 
diff --git a/vpr/src/place/net_cost_handler.h b/vpr/src/place/net_cost_handler.h
index 2b8e59af88f..6436265dbda 100644
--- a/vpr/src/place/net_cost_handler.h
+++ b/vpr/src/place/net_cost_handler.h
@@ -15,6 +15,7 @@
 #include <functional>
 
 class PlacerState;
+class PlacerCriticalities;
 
 /**
  * @brief The method used to calculate placement cost
diff --git a/vpr/src/place/place_checkpoint.cpp b/vpr/src/place/place_checkpoint.cpp
index 60b009d85ae..a6e2858e577 100644
--- a/vpr/src/place/place_checkpoint.cpp
+++ b/vpr/src/place/place_checkpoint.cpp
@@ -1,7 +1,11 @@
+
 #include "place_checkpoint.h"
+
 #include "noc_place_utils.h"
 #include "placer_state.h"
 #include "grid_block.h"
+#include "PlacerCriticalities.h"
+#include "PlacerSetupSlacks.h"
 
 float t_placement_checkpoint::get_cp_cpd() const { return cpd_; }
 
diff --git a/vpr/src/place/placer.h b/vpr/src/place/placer.h
index 99c00d7e8e5..11924314c8b 100644
--- a/vpr/src/place/placer.h
+++ b/vpr/src/place/placer.h
@@ -27,6 +27,9 @@
 #include "noc_place_utils.h"
 #include "net_cost_handler.h"
 #include "placement_log_printer.h"
+#include "PlacerSetupSlacks.h"
+#include "PlacerCriticalities.h"
+#include "NetPinTimingInvalidator.h"
 
 class PlacementAnnealer;
 namespace vtr{
diff --git a/vpr/src/place/timing/timing_place.cpp b/vpr/src/place/timing/PlacerCriticalities.cpp
similarity index 60%
rename from vpr/src/place/timing/timing_place.cpp
rename to vpr/src/place/timing/PlacerCriticalities.cpp
index badd9d1fb61..8aa248abab6 100644
--- a/vpr/src/place/timing/timing_place.cpp
+++ b/vpr/src/place/timing/PlacerCriticalities.cpp
@@ -1,19 +1,9 @@
-/**
- * @file timing_place.cpp
- * @brief Stores the method definitions of classes defined in timing_place.h.
- */
-
-#include <cmath>
 
-#include "vtr_util.h"
-
-#include "vpr_types.h"
-#include "vpr_utils.h"
-#include "net_delay.h"
-#include "timing_place.h"
-#include "placer_state.h"
+#include "PlacerCriticalities.h"
 
 #include "timing_info.h"
+#include "timing_util.h"
+#include "placer_state.h"
 
 ///@brief Allocates space for the timing_place_crit_ data structure.
 PlacerCriticalities::PlacerCriticalities(const ClusteredNetlist& clb_nlist,
@@ -161,110 +151,4 @@ void PlacerCriticalities::set_criticality(ClusterNetId net_id, int ipin, float c
  */
 PlacerCriticalities::pin_range PlacerCriticalities::pins_with_modified_criticality() const {
     return vtr::make_range(cluster_pins_with_modified_criticality_);
-}
-
-/**************************************/
-
-///@brief Allocates space for the timing_place_setup_slacks_ data structure.
-PlacerSetupSlacks::PlacerSetupSlacks(const ClusteredNetlist& clb_nlist,
-                                     const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
-                                     std::shared_ptr<const SetupTimingInfo> timing_info)
-    : clb_nlist_(clb_nlist)
-    , pin_lookup_(netlist_pin_lookup)
-    , timing_info_(std::move(timing_info))
-    , timing_place_setup_slacks_(make_net_pins_matrix(clb_nlist_, std::numeric_limits<float>::quiet_NaN())) {
-}
-
-/**
- * @brief Updated the setup slacks in the timing_place_setup_slacks_ data structure.
- *
- * If the setup slacks are not updated immediately after each time we call
- * timing_info->update(), then timing_info->pins_with_modified_setup_slack()
- * cannot accurately account for all the pins that need to be updated.
- *
- * In this case, `recompute_required` would be true, and we update all setup slacks
- * from scratch.
- */
-void PlacerSetupSlacks::update_setup_slacks() {
-    /* If update is not enabled, exit the routine. */
-    if (!update_enabled) {
-        /* re-computation is required on the next iteration */
-        recompute_required = true;
-        return;
-    }
-
-    /* Determine what pins need updating */
-    if (!recompute_required) {
-        incr_update_setup_slacks();
-    } else {
-        recompute_setup_slacks();
-    }
-
-    /* Update the affected pins */
-    for (ClusterPinId clb_pin : cluster_pins_with_modified_setup_slack_) {
-        ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin);
-        int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin);
-
-        float clb_pin_setup_slack = calculate_clb_net_pin_setup_slack(*timing_info_, pin_lookup_, clb_pin);
-
-        timing_place_setup_slacks_[clb_net][pin_index_in_net] = clb_pin_setup_slack;
-    }
-
-    /* Setup slacks updated. In sync with timing info.     */
-    /* Can be incrementally updated on the next iteration. */
-    recompute_required = false;
-}
-
-/**
- * @brief Collect the cluster pins which need to be updated based on the latest timing
- *        analysis so that incremental updates to setup slacks can be performed.
- *
- * Note we use the set of pins reported by the *timing_info* as having modified
- * setup slacks, rather than those marked as modified by the timing analyzer.
- */
-void PlacerSetupSlacks::incr_update_setup_slacks() {
-    cluster_pins_with_modified_setup_slack_.clear();
-
-    for (AtomPinId atom_pin : timing_info_->pins_with_modified_setup_slack()) {
-        ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin);
-
-        //Some atom pins correspond to connections which are completely
-        //contained within a cluster, and hence have no corresponding
-        //clustered pin.
-        if (!clb_pin) continue;
-
-        cluster_pins_with_modified_setup_slack_.insert(clb_pin);
-    }
-}
-
-/**
- * @brief Collect all the sink pins in the netlist and prepare them update.
- *
- * For the incremental version, see PlacerSetupSlacks::incr_update_setup_slacks().
- */
-void PlacerSetupSlacks::recompute_setup_slacks() {
-    cluster_pins_with_modified_setup_slack_.clear();
-
-    /* Non-incremental: all sink pins need updating */
-    for (ClusterNetId net_id : clb_nlist_.nets()) {
-        for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) {
-            cluster_pins_with_modified_setup_slack_.insert(pin_id);
-        }
-    }
-}
-
-///@brief Override the setup slack of a particular connection.
-void PlacerSetupSlacks::set_setup_slack(ClusterNetId net_id, int ipin, float slack_val) {
-    VTR_ASSERT_SAFE_MSG(ipin > 0, "The pin should not be a driver pin (ipin != 0)");
-    VTR_ASSERT_SAFE_MSG(ipin < int(clb_nlist_.net_pins(net_id).size()), "The pin index in net should be smaller than fanout");
-
-    timing_place_setup_slacks_[net_id][ipin] = slack_val;
-}
-
-/**
- * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds)
- *        which were modified by the last call to PlacerSetupSlacks::update_setup_slacks().
- */
-PlacerSetupSlacks::pin_range PlacerSetupSlacks::pins_with_modified_setup_slack() const {
-    return vtr::make_range(cluster_pins_with_modified_setup_slack_);
-}
+}
\ No newline at end of file
diff --git a/vpr/src/place/timing/PlacerCriticalities.h b/vpr/src/place/timing/PlacerCriticalities.h
new file mode 100644
index 00000000000..7f7a1975ff2
--- /dev/null
+++ b/vpr/src/place/timing/PlacerCriticalities.h
@@ -0,0 +1,160 @@
+
+#pragma once
+
+#include "vtr_vec_id_set.h"
+#include "timing_info_fwd.h"
+#include "clustered_netlist_utils.h"
+#include "place_delay_model.h"
+#include "vpr_net_pins_matrix.h"
+
+/**
+ * @brief Saves the placement criticality parameters
+ *
+ * crit_exponent: The criticality exponent used to sharpen the criticalities
+ * crit_limit:    The limit to consider a pin as timing critical
+ */
+struct PlaceCritParams {
+    float crit_exponent;
+    float crit_limit;
+};
+
+/**
+ * @brief PlacerCriticalities returns the clustered netlist connection criticalities
+ *        used by the placer ('sharpened' by a criticality exponent).
+ *
+ * Usage
+ * =====
+ * This class also serves to map atom netlist level criticalities (i.e. on AtomPinIds)
+ * to the clustered netlist (i.e. ClusterPinIds) used during placement.
+ *
+ * Criticalities are updated by update_criticalities(), given that `update_enabled` is
+ * set to true. It will update criticalities based on the atom netlist connection
+ * criticalities provided by the passed in SetupTimingInfo.
+ *
+ * This process can be done incrementally, based on the modified connections/AtomPinIds
+ * returned by SetupTimingInfo. However, the set returned only reflects the connections
+ * changed by the last call to the timing info update.
+ *
+ * Therefore, if SetupTimingInfo is updated twice in succession without criticalities
+ * getting updated (update_enabled = false), the returned set cannot account for all
+ * the connections that have been modified. In this case, we flag `recompute_required`
+ * as true, and we recompute the criticalities for every connection to ensure that
+ * they are all up to date. Hence, each time update_setup_slacks_and_criticalities()
+ * is called, we assign `recompute_required` the opposite value of `update_enabled`.
+ *
+ * This class also maps/transforms the modified atom connections/pins returned by the
+ * timing info into modified clustered netlist connections/pins after calling
+ * update_criticalities(). The interface then enables users to iterate over this range
+ * via pins_with_modified_criticality(). This is useful for incrementally re-calculating
+ * the timing costs.
+ *
+ * The criticalities of individual connections can then be queried by calling the
+ * criticality() member function.
+ *
+ * Implementation
+ * ==============
+ * To support incremental re-calculation, the class saves the last criticality exponent
+ * passed to PlacerCriticalities::update_criticalities(). If the next update uses the same
+ * exponent, criticalities can be incrementally updated. Otherwise, they must be re-calculated
+ * from scratch, since a change in exponent changes *all* criticalities.
+ */
+class PlacerCriticalities {
+  public: //Types
+    typedef vtr::vec_id_set<ClusterPinId>::iterator pin_iterator;
+    typedef vtr::vec_id_set<ClusterNetId>::iterator net_iterator;
+
+    typedef vtr::Range<pin_iterator> pin_range;
+    typedef vtr::Range<net_iterator> net_range;
+
+  public: //Lifetime
+    PlacerCriticalities(const ClusteredNetlist& clb_nlist,
+                        const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
+                        std::shared_ptr<const SetupTimingInfo> timing_info);
+    PlacerCriticalities(const PlacerCriticalities&) = delete;
+    PlacerCriticalities& operator=(const PlacerCriticalities&) = delete;
+
+  public: //Accessors
+    ///@brief Returns the criticality of the specified connection.
+    float criticality(ClusterNetId net, int ipin) const { return timing_place_crit_[net][ipin]; }
+
+    /**
+     * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) which
+     *        were modified by the last call to PlacerCriticalities::update_criticalities().
+     */
+    pin_range pins_with_modified_criticality() const;
+
+  public: //Modifiers
+    /**
+     * @brief Updates criticalities based on the atom netlist criticalities
+     *        provided by timing_info and the provided criticality_exponent.
+     *
+     * Should consistently call this method after the most recent timing analysis to
+     * keep the criticalities stored in this class in sync with the timing analyzer.
+     * If out of sync, then the criticalities cannot be incrementally updated
+     * during the next timing analysis iteration.
+     */
+    void update_criticalities(const PlaceCritParams& crit_params,
+                              PlacerState& placer_state);
+
+    ///@brief Enable the recompute_required flag to enforce a from-scratch update.
+    void set_recompute_required();
+
+    ///@brief From scratch update. See PlacerCriticalities.cpp for more.
+    void recompute_criticalities();
+
+    ///@brief Override the criticality of a particular connection.
+    void set_criticality(ClusterNetId net, int ipin, float crit_val);
+
+    ///@brief Set `update_enabled` to true.
+    void enable_update() { update_enabled = true; }
+
+    ///@brief Set `update_enabled` to false.
+    void disable_update() { update_enabled = false; }
+
+  private: //Data
+    ///@brief The clb netlist in the placement context.
+    const ClusteredNetlist& clb_nlist_;
+
+    ///@brief The lookup table that maps atom pins to clb pins.
+    const ClusteredPinAtomPinsLookup& pin_lookup_;
+
+    ///@brief A pointer to the setup timing analyzer
+    std::shared_ptr<const SetupTimingInfo> timing_info_;
+
+    /**
+     * @brief The matrix that stores criticality value for each connection.
+     *
+     * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1]
+     */
+    ClbNetPinsMatrix<float> timing_place_crit_;
+
+    /**
+     * The criticality exponent when update_criticalities() was last called
+     * (used to detect if incremental update can be used).
+     */
+    float last_crit_exponent_ = std::numeric_limits<float>::quiet_NaN();
+
+    ///@brief Set of pins with criticalities modified by last call to update_criticalities().
+    vtr::vec_id_set<ClusterPinId> cluster_pins_with_modified_criticality_;
+
+    ///@brief Incremental update. See PlacerCriticalities.cpp for more.
+    void incr_update_criticalities();
+
+    ///@brief Flag that turns on/off the update_criticalities() routine.
+    bool update_enabled = true;
+
+    /**
+     * @brief Flag that checks if criticalities need to be recomputed for all connections.
+     *
+     * Used by the method update_criticalities(). The incremental update is not possible
+     * if this method wasn't called after the previous timing info update.
+     */
+    bool recompute_required = true;
+
+    /**
+     * @brief Indicates whether this is the first call to update_criticalities().
+     *
+     * This can be used for incremental criticality update and also incrementally update the highly critical pins
+     */
+    bool first_time_update_criticality = true;
+};
diff --git a/vpr/src/place/timing/PlacerSetupSlacks.cpp b/vpr/src/place/timing/PlacerSetupSlacks.cpp
new file mode 100644
index 00000000000..ffc637f423b
--- /dev/null
+++ b/vpr/src/place/timing/PlacerSetupSlacks.cpp
@@ -0,0 +1,109 @@
+
+#include "PlacerSetupSlacks.h"
+
+#include "timing_util.h"
+#include "timing_info.h"
+
+///@brief Allocates space for the timing_place_setup_slacks_ data structure.
+PlacerSetupSlacks::PlacerSetupSlacks(const ClusteredNetlist& clb_nlist,
+                                     const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
+                                     std::shared_ptr<const SetupTimingInfo> timing_info)
+    : clb_nlist_(clb_nlist)
+    , pin_lookup_(netlist_pin_lookup)
+    , timing_info_(std::move(timing_info))
+    , timing_place_setup_slacks_(make_net_pins_matrix(clb_nlist_, std::numeric_limits<float>::quiet_NaN())) {
+}
+
+/**
+ * @brief Updates the setup slacks in the timing_place_setup_slacks_ data structure.
+ *
+ * If the setup slacks are not updated immediately after each time we call
+ * timing_info->update(), then timing_info->pins_with_modified_setup_slack()
+ * cannot accurately account for all the pins that need to be updated.
+ *
+ * In this case, `recompute_required` would be true, and we update all setup slacks
+ * from scratch.
+ */
+void PlacerSetupSlacks::update_setup_slacks() {
+    /* If update is not enabled, exit the routine. */
+    if (!update_enabled) {
+        /* re-computation is required on the next iteration */
+        recompute_required = true;
+        return;
+    }
+
+    /* Determine what pins need updating */
+    if (!recompute_required) {
+        incr_update_setup_slacks();
+    } else {
+        recompute_setup_slacks();
+    }
+
+    /* Update the affected pins */
+    for (ClusterPinId clb_pin : cluster_pins_with_modified_setup_slack_) {
+        ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin);
+        int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin);
+
+        float clb_pin_setup_slack = calculate_clb_net_pin_setup_slack(*timing_info_, pin_lookup_, clb_pin);
+
+        timing_place_setup_slacks_[clb_net][pin_index_in_net] = clb_pin_setup_slack;
+    }
+
+    /* Setup slacks updated. In sync with timing info.     */
+    /* Can be incrementally updated on the next iteration. */
+    recompute_required = false;
+}
+
+/**
+ * @brief Collect the cluster pins which need to be updated based on the latest timing
+ *        analysis so that incremental updates to setup slacks can be performed.
+ *
+ * Note we use the set of pins reported by the *timing_info* as having modified
+ * setup slacks, rather than those marked as modified by the timing analyzer.
+ */
+void PlacerSetupSlacks::incr_update_setup_slacks() {
+    cluster_pins_with_modified_setup_slack_.clear();
+
+    for (AtomPinId atom_pin : timing_info_->pins_with_modified_setup_slack()) {
+        ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin);
+
+        //Some atom pins correspond to connections which are completely
+        //contained within a cluster, and hence have no corresponding
+        //clustered pin.
+        if (!clb_pin) continue;
+
+        cluster_pins_with_modified_setup_slack_.insert(clb_pin);
+    }
+}
+
+/**
+ * @brief Collect all the sink pins in the netlist and prepare them for update.
+ *
+ * For the incremental version, see PlacerSetupSlacks::incr_update_setup_slacks().
+ */
+void PlacerSetupSlacks::recompute_setup_slacks() {
+    cluster_pins_with_modified_setup_slack_.clear();
+
+    /* Non-incremental: all sink pins need updating */
+    for (ClusterNetId net_id : clb_nlist_.nets()) {
+        for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) {
+            cluster_pins_with_modified_setup_slack_.insert(pin_id);
+        }
+    }
+}
+
+///@brief Override the setup slack of a particular connection.
+void PlacerSetupSlacks::set_setup_slack(ClusterNetId net_id, int ipin, float slack_val) {
+    VTR_ASSERT_SAFE_MSG(ipin > 0, "The pin should not be a driver pin (ipin != 0)");
+    VTR_ASSERT_SAFE_MSG(ipin < int(clb_nlist_.net_pins(net_id).size()), "The pin index in net should be smaller than fanout");
+
+    timing_place_setup_slacks_[net_id][ipin] = slack_val;
+}
+
+/**
+ * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds)
+ *        which were modified by the last call to PlacerSetupSlacks::update_setup_slacks().
+ */
+PlacerSetupSlacks::pin_range PlacerSetupSlacks::pins_with_modified_setup_slack() const {
+    return vtr::make_range(cluster_pins_with_modified_setup_slack_);
+}
diff --git a/vpr/src/place/timing/PlacerSetupSlacks.h b/vpr/src/place/timing/PlacerSetupSlacks.h
new file mode 100644
index 00000000000..580a26db2c2
--- /dev/null
+++ b/vpr/src/place/timing/PlacerSetupSlacks.h
@@ -0,0 +1,108 @@
+
+#pragma once
+
+#include "vtr_vec_id_set.h"
+#include "timing_info_fwd.h"
+#include "clustered_netlist_utils.h"
+#include "place_delay_model.h"
+#include "vpr_net_pins_matrix.h"
+
+/**
+ * @brief PlacerSetupSlacks returns the RAW setup slacks of clustered netlist connections.
+ *
+ * Usage
+ * =====
+ * This class mirrors PlacerCriticalities by both its methods and its members. The only
+ * difference is that this class deals with RAW setup slacks returned by SetupTimingInfo
+ * rather than criticalities. See the documentation on PlacerCriticalities for more.
+ *
+ * RAW setup slacks are unlike criticalities. Their values are not confined between
+ * 0 and 1. Their values can be either positive or negative.
+ *
+ * This class also provides iterating over the clustered netlist connections/pins that
+ * have modified setup slacks by the last call to update_setup_slacks(). However, this
+ * utility is mainly used for incrementally committing the setup slack values into the
+ * structure `connection_setup_slack` used by many placer routines.
+ */
+class PlacerSetupSlacks {
+  public: //Types
+    typedef vtr::vec_id_set<ClusterPinId>::iterator pin_iterator;
+    typedef vtr::vec_id_set<ClusterNetId>::iterator net_iterator;
+
+    typedef vtr::Range<pin_iterator> pin_range;
+    typedef vtr::Range<net_iterator> net_range;
+
+  public: //Lifetime
+    PlacerSetupSlacks(const ClusteredNetlist& clb_nlist,
+                      const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
+                      std::shared_ptr<const SetupTimingInfo> timing_info);
+    PlacerSetupSlacks(const PlacerSetupSlacks& clb_nlist) = delete;
+    PlacerSetupSlacks& operator=(const PlacerSetupSlacks& clb_nlist) = delete;
+
+  public: //Accessors
+    ///@brief Returns the setup slack of the specified connection.
+    float setup_slack(ClusterNetId net, int ipin) const { return timing_place_setup_slacks_[net][ipin]; }
+
+    /**
+     * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds)
+     *        which were modified by the last call to PlacerSetupSlacks::update_setup_slacks().
+     */
+    pin_range pins_with_modified_setup_slack() const;
+
+  public: //Modifiers
+    /**
+     * @brief Updates setup slacks based on the atom netlist setup slacks provided
+     *        by timing_info_.
+     *
+     * Should consistently call this method after the most recent timing analysis to
+     * keep the setup slacks stored in this class in sync with the timing analyzer.
+     * If out of sync, then the setup slacks cannot be incrementally updated
+     * during the next timing analysis iteration.
+     */
+    void update_setup_slacks();
+
+    ///@brief Enable the recompute_required flag to enforce from scratch update.
+    void set_recompute_required() { recompute_required = true; }
+
+    ///@brief Override the setup slack of a particular connection.
+    void set_setup_slack(ClusterNetId net, int ipin, float slack_val);
+
+    ///@brief Set `update_enabled` to true.
+    void enable_update() { update_enabled = true; }
+
+    ///@brief Set `update_enabled` to false.
+    void disable_update() { update_enabled = false; }
+
+  private: //Data
+    const ClusteredNetlist& clb_nlist_;
+    const ClusteredPinAtomPinsLookup& pin_lookup_;
+    std::shared_ptr<const SetupTimingInfo> timing_info_;
+
+    /**
+     * @brief The matrix that stores raw setup slack values for each connection.
+     *
+     * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1]
+     */
+    ClbNetPinsMatrix<float> timing_place_setup_slacks_;
+
+    ///@brief Set of pins with raw setup slacks modified by last call to update_setup_slacks()
+    vtr::vec_id_set<ClusterPinId> cluster_pins_with_modified_setup_slack_;
+
+    ///@brief Incremental update. See timing_place.cpp for more.
+    void incr_update_setup_slacks();
+
+    ///@brief From scratch (non-incremental) update of all sink pins.
+    void recompute_setup_slacks();
+
+    ///@brief Flag that turns on/off the update_setup_slacks() routine.
+    bool update_enabled = true;
+
+    /**
+     * @brief Flag that checks if setup slacks need to be recomputed for all connections.
+     *
+     * Used by the method update_setup_slacks(). The incremental update is not possible
+     * if this method wasn't called after the previous timing info update.
+     */
+    bool recompute_required = true;
+};
+
diff --git a/vpr/src/place/timing/place_timing_update.cpp b/vpr/src/place/timing/place_timing_update.cpp
index c9c53b88f90..00cad07da7b 100644
--- a/vpr/src/place/timing/place_timing_update.cpp
+++ b/vpr/src/place/timing/place_timing_update.cpp
@@ -3,10 +3,15 @@
  * @brief Defines the routines declared in place_timing_update.h.
  */
 
-#include "vtr_time.h"
-
 #include "place_timing_update.h"
+
+#include "NetPinTimingInvalidator.h"
+#include "PlacerCriticalities.h"
+#include "PlacerSetupSlacks.h"
 #include "placer_state.h"
+#include "place_util.h"
+#include "vtr_time.h"
+
 
 /* Routines local to place_timing_update.cpp */
 static double comp_td_connection_cost(const PlaceDelayModel* delay_model,
diff --git a/vpr/src/place/timing/place_timing_update.h b/vpr/src/place/timing/place_timing_update.h
index 7944c4a7552..6ced93e4487 100644
--- a/vpr/src/place/timing/place_timing_update.h
+++ b/vpr/src/place/timing/place_timing_update.h
@@ -4,10 +4,15 @@
  */
 
 #pragma once
-#include "timing_place.h"
-#include "place_util.h"
 
-#include "NetPinTimingInvalidator.h"
+class PlacerState;
+class PlaceCritParams;
+class PlacerCriticalities;
+class PlacerSetupSlacks;
+class NetPinTimingInvalidator;
+class PlaceDelayModel;
+class SetupTimingInfo;
+struct t_placer_costs;
 
 ///@brief Initialize the timing information and structures in the placer.
 void initialize_timing_info(const PlaceCritParams& crit_params,
diff --git a/vpr/src/place/timing/timing_place.h b/vpr/src/place/timing/timing_place.h
index 71e144334ad..bd85061065f 100644
--- a/vpr/src/place/timing/timing_place.h
+++ b/vpr/src/place/timing/timing_place.h
@@ -8,7 +8,7 @@
  *              range from negative to positive values. Also maps
  *              atom pin setup slacks to clb pin setup slacks.
  *   @class PlacerCriticalities
- *              Query connection criticalities, which are calculuated
+ *              Query connection criticalities, which are calculated
  *              based on the raw setup slacks and ranges from 0 to 1.
  *              Also maps atom pin crit. to clb pin crit.
  *   @class PlacerTimingCosts
@@ -41,257 +41,6 @@
 #include "place_delay_model.h"
 #include "vpr_net_pins_matrix.h"
 
-/**
- * @brief Saves the placement criticality parameters
- *
- * crit_exponent: The criticality exponent used to sharpen the criticalities
- * crit_limit:    The limit to consider a pin as timing critical
- */
-struct PlaceCritParams {
-    float crit_exponent;
-    float crit_limit;
-};
-
-/**
- * @brief PlacerCriticalities returns the clustered netlist connection criticalities
- *        used by the placer ('sharpened' by a criticality exponent).
- *
- * Usage
- * =====
- * This class also serves to map atom netlist level criticalites (i.e. on AtomPinIds)
- * to the clustered netlist (i.e. ClusterPinIds) used during placement.
- *
- * Criticalities are updated by update_criticalities(), given that `update_enabled` is
- * set to true. It will update criticalities based on the atom netlist connection
- * criticalities provided by the passed in SetupTimingInfo.
- *
- * This process can be done incrementally, based on the modified connections/AtomPinIds
- * returned by SetupTimingInfo. However, the set returned only reflects the connections
- * changed by the last call to the timing info update.
- *
- * Therefore, if SetupTimingInfo is updated twice in succession without criticalities
- * getting updated (update_enabled = false), the returned set cannot account for all
- * the connections that have been modified. In this case, we flag `recompute_required`
- * as false, and we recompute the criticalities for every connection to ensure that
- * they are all up to date. Hence, each time update_setup_slacks_and_criticalities()
- * is called, we assign `recompute_required` the opposite value of `update_enabled`.
- *
- * This class also maps/transforms the modified atom connections/pins returned by the
- * timing info into modified clustered netlist connections/pins after calling
- * update_criticalities(). The interface then enables users to iterate over this range
- * via pins_with_modified_criticalities(). This is useful for incrementally re-calculating
- * the timing costs.
- *
- * The criticalities of individual connections can then be queried by calling the
- * criticality() member function.
- *
- * Implementation
- * ==============
- * To support incremental re-calculation, the class saves the last criticality exponent
- * passed to PlacerCriticalities::update_criticalites(). If the next update uses the same
- * exponent, criticalities can be incrementally updated. Otherwise, they must be re-calculated
- * from scratch, since a change in exponent changes *all* criticalities.
- */
-class PlacerCriticalities {
-  public: //Types
-    typedef vtr::vec_id_set<ClusterPinId>::iterator pin_iterator;
-    typedef vtr::vec_id_set<ClusterNetId>::iterator net_iterator;
-
-    typedef vtr::Range<pin_iterator> pin_range;
-    typedef vtr::Range<net_iterator> net_range;
-
-  public: //Lifetime
-    PlacerCriticalities(const ClusteredNetlist& clb_nlist,
-                        const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
-                        std::shared_ptr<const SetupTimingInfo> timing_info);
-    PlacerCriticalities(const PlacerCriticalities&) = delete;
-    PlacerCriticalities& operator=(const PlacerCriticalities&) = delete;
-
-  public: //Accessors
-    ///@brief Returns the criticality of the specified connection.
-    float criticality(ClusterNetId net, int ipin) const { return timing_place_crit_[net][ipin]; }
-
-    /**
-     * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) which
-     *        were modified by the last call to PlacerCriticalities::update_criticalities().
-     */
-    pin_range pins_with_modified_criticality() const;
-
-  public: //Modifiers
-    /**
-     * @brief Updates criticalities based on the atom netlist criticalitites
-     *        provided by timing_info and the provided criticality_exponent.
-     *
-     * Should consistently call this method after the most recent timing analysis to
-     * keep the criticalities stored in this class in sync with the timing analyzer.
-     * If out of sync, then the criticalities cannot be incrementally updated on
-     * during the next timing analysis iteration.
-     */
-    void update_criticalities(const PlaceCritParams& crit_params,
-                              PlacerState& placer_state);
-
-    ///@bried Enable the recompute_required flag to enforce from scratch update.
-    void set_recompute_required();
-
-    ///@brief From scratch update. See timing_place.cpp for more.
-    void recompute_criticalities();
-
-    ///@brief Override the criticality of a particular connection.
-    void set_criticality(ClusterNetId net, int ipin, float crit_val);
-
-    ///@brief Set `update_enabled` to true.
-    void enable_update() { update_enabled = true; }
-
-    ///@brief Set `update_enabled` to true.
-    void disable_update() { update_enabled = false; }
-
-  private: //Data
-    ///@brief The clb netlist in the placement context.
-    const ClusteredNetlist& clb_nlist_;
-
-    ///@brief The lookup table that maps atom pins to clb pins.
-    const ClusteredPinAtomPinsLookup& pin_lookup_;
-
-    ///@brief A pointer to the setup timing analyzer
-    std::shared_ptr<const SetupTimingInfo> timing_info_;
-
-    /**
-     * @brief The matrix that stores criticality value for each connection.
-     *
-     * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1]
-     */
-    ClbNetPinsMatrix<float> timing_place_crit_;
-
-    /**
-     * The criticality exponent when update_criticalites() was last called
-     * (used to detect if incremental update can be used).
-     */
-    float last_crit_exponent_ = std::numeric_limits<float>::quiet_NaN();
-
-    ///@brief Set of pins with criticaltites modified by last call to update_criticalities().
-    vtr::vec_id_set<ClusterPinId> cluster_pins_with_modified_criticality_;
-
-    ///@brief Incremental update. See timing_place.cpp for more.
-    void incr_update_criticalities();
-
-    ///@brief Flag that turns on/off the update_criticalities() routine.
-    bool update_enabled = true;
-
-    /**
-     * @brief Flag that checks if criticalities need to be recomputed for all connections.
-     *
-     * Used by the method update_criticalities(). They incremental update is not possible
-     * if this method wasn't called updated after the previous timing info update.
-     */
-    bool recompute_required = true;
-
-    /**
-     * @brief if this is first time to call update_criticality
-     * 
-     * This can be used for incremental criticality update and also incrementally update the highly critical pins
-     */
-    bool first_time_update_criticality = true;
-};
-
-/**
- * @brief PlacerSetupSlacks returns the RAW setup slacks of clustered netlist connection.
- *
- * Usage
- * =====
- * This class mirrors PlacerCriticalities by both its methods and its members. The only
- * difference is that this class deals with RAW setup slacks returned by SetupTimingInfo
- * rather than criticalities. See the documentation on PlacerCriticalities for more.
- *
- * RAW setup slacks are unlike criticalities. Their values are not confined between
- * 0 and 1. Their values can be either positive or negative.
- *
- * This class also provides iterating over the clustered netlist connections/pins that
- * have modified setup slacks by the last call to update_setup_slacks(). However, this
- * utility is mainly used for incrementally committing the setup slack values into the
- * structure `connection_setup_slack` used by many placer routines.
- */
-class PlacerSetupSlacks {
-  public: //Types
-    typedef vtr::vec_id_set<ClusterPinId>::iterator pin_iterator;
-    typedef vtr::vec_id_set<ClusterNetId>::iterator net_iterator;
-
-    typedef vtr::Range<pin_iterator> pin_range;
-    typedef vtr::Range<net_iterator> net_range;
-
-  public: //Lifetime
-    PlacerSetupSlacks(const ClusteredNetlist& clb_nlist,
-                      const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
-                      std::shared_ptr<const SetupTimingInfo> timing_info);
-    PlacerSetupSlacks(const PlacerSetupSlacks& clb_nlist) = delete;
-    PlacerSetupSlacks& operator=(const PlacerSetupSlacks& clb_nlist) = delete;
-
-  public: //Accessors
-    ///@brief Returns the setup slack of the specified connection.
-    float setup_slack(ClusterNetId net, int ipin) const { return timing_place_setup_slacks_[net][ipin]; }
-
-    /**
-     * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds)
-     *        which were modified by the last call to PlacerSetupSlacks::update_setup_slacks().
-     */
-    pin_range pins_with_modified_setup_slack() const;
-
-  public: //Modifiers
-    /**
-     * @brief Updates setup slacks based on the atom netlist setup slacks provided
-     *        by timing_info_.
-     *
-     * Should consistently call this method after the most recent timing analysis to
-     * keep the setup slacks stored in this class in sync with the timing analyzer.
-     * If out of sync, then the setup slacks cannot be incrementally updated on
-     * during the next timing analysis iteration.
-     */
-    void update_setup_slacks();
-
-    ///@bried Enable the recompute_required flag to enforce from scratch update.
-    void set_recompute_required() { recompute_required = true; }
-
-    ///@brief Override the setup slack of a particular connection.
-    void set_setup_slack(ClusterNetId net, int ipin, float slack_val);
-
-    ///@brief Set `update_enabled` to true.
-    void enable_update() { update_enabled = true; }
-
-    ///@brief Set `update_enabled` to true.
-    void disable_update() { update_enabled = false; }
-
-  private: //Data
-    const ClusteredNetlist& clb_nlist_;
-    const ClusteredPinAtomPinsLookup& pin_lookup_;
-    std::shared_ptr<const SetupTimingInfo> timing_info_;
-
-    /**
-     * @brief The matrix that stores raw setup slack values for each connection.
-     *
-     * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1]
-     */
-    ClbNetPinsMatrix<float> timing_place_setup_slacks_;
-
-    ///@brief Set of pins with raw setup slacks modified by last call to update_setup_slacks()
-    vtr::vec_id_set<ClusterPinId> cluster_pins_with_modified_setup_slack_;
-
-    ///@brief Incremental update. See timing_place.cpp for more.
-    void incr_update_setup_slacks();
-
-    ///@brief Incremental update. See timing_place.cpp for more.
-    void recompute_setup_slacks();
-
-    ///@brief Flag that turns on/off the update_setup_slacks() routine.
-    bool update_enabled = true;
-
-    /**
-     * @brief Flag that checks if setup slacks need to be recomputed for all connections.
-     *
-     * Used by the method update_setup_slacks(). They incremental update is not possible
-     * if this method wasn't called updated after the previous timing info update.
-     */
-    bool recompute_required = true;
-};
-
 /**
  * @brief PlacerTimingCosts mimics a 2D array of connection timing costs running from:
  *        [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1].
@@ -541,7 +290,7 @@ class PlacerTimingCosts {
         double node_cost = total_cost_recurr(left_child(inode))
                            + total_cost_recurr(right_child(inode));
 
-        //Save intermedate cost at this node
+        //Save intermediate cost at this node
         connection_costs_[inode] = node_cost;
 
         return node_cost;
@@ -563,7 +312,7 @@ class PlacerTimingCosts {
     ///@brief Friend-ed so it can call invalidate().
     friend ConnectionProxy;
 
-    void invalidate(double* invalidated_cost) {
+    void invalidate(const double* invalidated_cost) {
         //Check pointer within range of internal storage
         VTR_ASSERT_SAFE_MSG(
             invalidated_cost >= &connection_costs_[0],
@@ -632,12 +381,12 @@ class PlacerTimingCosts {
      * the tree are the intermediate nodes.
      *
      * The methods left_child()/right_child()/parent() can be used
-     * to traverse the tree by indicies into this vector.
+     * to traverse the tree by indices into this vector.
      */
     std::vector<double> connection_costs_;
 
     /**
-     * @brief Vector storing the indicies of the first connection
+     * @brief Vector storing the indices of the first connection
      *        for each net in the netlist, used for indexing by net.
      */
     vtr::vector<ClusterNetId, int> net_start_indicies_;

From a5a036f7d0fecc52913e39a29a33775903ffa430 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Sun, 1 Dec 2024 17:57:29 -0500
Subject: [PATCH 19/39] move highly_crit_pins from PlacerMoveContext to
 PlacerCriticalities

---
 vpr/src/place/annealer.cpp                    |  4 +-
 .../centroid_move_generator.cpp               |  1 +
 .../critical_uniform_move_generator.cpp       |  5 +-
 .../feasible_region_move_generator.cpp        |  1 +
 .../move_generators/median_move_generator.cpp |  1 +
 .../uniform_move_generator.cpp                |  1 +
 .../weighted_median_move_generator.cpp        |  1 +
 vpr/src/place/move_utils.cpp                  | 31 +++++----
 vpr/src/place/move_utils.h                    |  4 ++
 vpr/src/place/placer_state.h                  |  3 -
 vpr/src/place/timing/PlacerCriticalities.cpp  | 63 ++++++-------------
 vpr/src/place/timing/PlacerCriticalities.h    | 34 ++++++++--
 vpr/src/place/timing/place_timing_update.cpp  |  8 +--
 vpr/src/place/timing/place_timing_update.h    |  3 +-
 14 files changed, 82 insertions(+), 78 deletions(-)

diff --git a/vpr/src/place/annealer.cpp b/vpr/src/place/annealer.cpp
index 42fd3356709..e6e0ffc85dd 100644
--- a/vpr/src/place/annealer.cpp
+++ b/vpr/src/place/annealer.cpp
@@ -490,7 +490,7 @@ e_move_result PlacementAnnealer::try_swap_(MoveGenerator& move_generator,
             criticalities_->disable_update();
             setup_slacks_->enable_update();
             update_timing_classes(crit_params, timing_info_, criticalities_,
-                                  setup_slacks_, pin_timing_invalidator_, placer_state_);
+                                  setup_slacks_, pin_timing_invalidator_);
 
             /* Get the setup slack analysis cost */
             //TODO: calculate a weighted average of the slack cost and wiring cost
@@ -594,7 +594,7 @@ e_move_result PlacementAnnealer::try_swap_(MoveGenerator& move_generator,
 
                 // Revert the timing update
                 update_timing_classes(crit_params, timing_info_, criticalities_,
-                                      setup_slacks_, pin_timing_invalidator_, placer_state_);
+                                      setup_slacks_, pin_timing_invalidator_);
 
                 VTR_ASSERT_SAFE_MSG(
                     verify_connection_setup_slacks(setup_slacks_, placer_state_),
diff --git a/vpr/src/place/move_generators/centroid_move_generator.cpp b/vpr/src/place/move_generators/centroid_move_generator.cpp
index 45ba9121719..767fbf2ce7e 100644
--- a/vpr/src/place/move_generators/centroid_move_generator.cpp
+++ b/vpr/src/place/move_generators/centroid_move_generator.cpp
@@ -44,6 +44,7 @@ e_create_move CentroidMoveGenerator::propose_move(t_pl_blocks_to_be_moved& block
     ClusterBlockId b_from = propose_block_to_move(placer_opts,
                                                   proposed_action.logical_blk_type_index,
                                                   /*highly_crit_block=*/false,
+                                                  /*placer_criticalities=*/nullptr,
                                                   /*net_from=*/nullptr,
                                                   /*pin_from=*/nullptr,
                                                   placer_state,
diff --git a/vpr/src/place/move_generators/critical_uniform_move_generator.cpp b/vpr/src/place/move_generators/critical_uniform_move_generator.cpp
index 7a1d39ed308..ab1039ae3d0 100644
--- a/vpr/src/place/move_generators/critical_uniform_move_generator.cpp
+++ b/vpr/src/place/move_generators/critical_uniform_move_generator.cpp
@@ -13,8 +13,8 @@ e_create_move CriticalUniformMoveGenerator::propose_move(t_pl_blocks_to_be_moved
                                                          t_propose_action& proposed_action,
                                                          float rlim,
                                                          const t_placer_opts& placer_opts,
-                                                         const PlacerCriticalities* /*criticalities*/) {
-    auto& cluster_ctx = g_vpr_ctx.clustering();
+                                                         const PlacerCriticalities* criticalities) {
+    const auto& cluster_ctx = g_vpr_ctx.clustering();
     const auto& placer_state = placer_state_.get();
     const auto& block_locs = placer_state.block_locs();
     const auto& blk_loc_registry = placer_state.blk_loc_registry();
@@ -25,6 +25,7 @@ e_create_move CriticalUniformMoveGenerator::propose_move(t_pl_blocks_to_be_moved
     ClusterBlockId b_from = propose_block_to_move(placer_opts,
                                                   proposed_action.logical_blk_type_index,
                                                   /*highly_crit_block=*/true,
+                                                  criticalities,
                                                   &net_from,
                                                   &pin_from,
                                                   placer_state,
diff --git a/vpr/src/place/move_generators/feasible_region_move_generator.cpp b/vpr/src/place/move_generators/feasible_region_move_generator.cpp
index 75210dafd43..1c719a7b0ff 100644
--- a/vpr/src/place/move_generators/feasible_region_move_generator.cpp
+++ b/vpr/src/place/move_generators/feasible_region_move_generator.cpp
@@ -30,6 +30,7 @@ e_create_move FeasibleRegionMoveGenerator::propose_move(t_pl_blocks_to_be_moved&
     ClusterBlockId b_from = propose_block_to_move(placer_opts,
                                                   proposed_action.logical_blk_type_index,
                                                   /*highly_crit_block=*/true,
+                                                  criticalities,
                                                   &net_from,
                                                   &pin_from,
                                                   placer_state,
diff --git a/vpr/src/place/move_generators/median_move_generator.cpp b/vpr/src/place/move_generators/median_move_generator.cpp
index 2e982ac6425..99c1b892e17 100644
--- a/vpr/src/place/move_generators/median_move_generator.cpp
+++ b/vpr/src/place/move_generators/median_move_generator.cpp
@@ -28,6 +28,7 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_
     ClusterBlockId b_from = propose_block_to_move(placer_opts,
                                                   proposed_action.logical_blk_type_index,
                                                   /*highly_crit_block=*/false,
+                                                  /*placer_criticalities=*/nullptr,
                                                   /*net_from=*/nullptr,
                                                   /*pin_from=*/nullptr,
                                                   placer_state,
diff --git a/vpr/src/place/move_generators/uniform_move_generator.cpp b/vpr/src/place/move_generators/uniform_move_generator.cpp
index 6c6e283ba94..7190918aba3 100644
--- a/vpr/src/place/move_generators/uniform_move_generator.cpp
+++ b/vpr/src/place/move_generators/uniform_move_generator.cpp
@@ -24,6 +24,7 @@ e_create_move UniformMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks
     ClusterBlockId b_from = propose_block_to_move(placer_opts,
                                                   proposed_action.logical_blk_type_index,
                                                   /*highly_crit_block=*/false,
+                                                  /*placer_criticalities=*/nullptr,
                                                   /*net_from=*/nullptr,
                                                   /*pin_from=*/nullptr,
                                                   placer_state,
diff --git a/vpr/src/place/move_generators/weighted_median_move_generator.cpp b/vpr/src/place/move_generators/weighted_median_move_generator.cpp
index b391509f5c3..de949d37a75 100644
--- a/vpr/src/place/move_generators/weighted_median_move_generator.cpp
+++ b/vpr/src/place/move_generators/weighted_median_move_generator.cpp
@@ -30,6 +30,7 @@ e_create_move WeightedMedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved&
     ClusterBlockId b_from = propose_block_to_move(placer_opts,
                                                   proposed_action.logical_blk_type_index,
                                                   /*highly_crit_block=*/false,
+                                                  /*placer_criticalities=*/nullptr,
                                                   /*net_from=*/nullptr,
                                                   /*pin_from=*/nullptr,
                                                   placer_state,
diff --git a/vpr/src/place/move_utils.cpp b/vpr/src/place/move_utils.cpp
index b5efb699fc7..78623200f42 100644
--- a/vpr/src/place/move_utils.cpp
+++ b/vpr/src/place/move_utils.cpp
@@ -547,16 +547,17 @@ void enable_placer_debug(const t_placer_opts& placer_opts,
 ClusterBlockId propose_block_to_move(const t_placer_opts& placer_opts,
                                      int& logical_blk_type_index,
                                      bool highly_crit_block,
+                                     const PlacerCriticalities* placer_criticalities,
                                      ClusterNetId* net_from,
                                      int* pin_from,
                                      const PlacerState& placer_state,
                                      vtr::RngContainer& rng) {
     ClusterBlockId b_from = ClusterBlockId::INVALID();
-    auto& cluster_ctx = g_vpr_ctx.clustering();
+    const auto& cluster_ctx = g_vpr_ctx.clustering();
 
     if (logical_blk_type_index == -1) { //If the block type is unspecified, choose any random block to be swapped with another random block
         if (highly_crit_block) {
-            b_from = pick_from_highly_critical_block(*net_from, *pin_from, placer_state, rng);
+            b_from = pick_from_highly_critical_block(*net_from, *pin_from, placer_state, *placer_criticalities, rng);
         } else {
             b_from = pick_from_block(rng);
         }
@@ -567,7 +568,7 @@ ClusterBlockId propose_block_to_move(const t_placer_opts& placer_opts,
         }
     } else { //If the block type is specified, choose a random block with blk_type to be swapped with another random block
         if (highly_crit_block) {
-            b_from = pick_from_highly_critical_block(*net_from, *pin_from, logical_blk_type_index, placer_state, rng);
+            b_from = pick_from_highly_critical_block(*net_from, *pin_from, logical_blk_type_index, placer_state, *placer_criticalities, rng);
         } else {
             b_from = pick_from_block(logical_blk_type_index, rng);
         }
@@ -624,22 +625,24 @@ ClusterBlockId pick_from_block(const int logical_blk_type_index, vtr::RngContain
 ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from,
                                                int& pin_from,
                                                const PlacerState& placer_state,
+                                               const PlacerCriticalities& placer_criticalities,
                                                vtr::RngContainer& rng) {
-    auto& cluster_ctx = g_vpr_ctx.clustering();
-    auto& place_move_ctx = placer_state.move();
-    auto& block_locs = placer_state.block_locs();
+    const auto& cluster_ctx = g_vpr_ctx.clustering();
+    const auto& block_locs = placer_state.block_locs();
 
     //Initialize critical net and pin to be invalid
     net_from = ClusterNetId::INVALID();
     pin_from = -1;
 
+    const auto& highly_crit_pins = placer_criticalities.get_highly_critical_pins();
+
     //check if any critical block is available
-    if (place_move_ctx.highly_crit_pins.empty()) {
+    if (highly_crit_pins.empty()) {
         return ClusterBlockId::INVALID();
     }
 
     //pick a random highly critical pin and find the nets driver block
-    std::pair<ClusterNetId, int> crit_pin = place_move_ctx.highly_crit_pins[rng.irand(place_move_ctx.highly_crit_pins.size() - 1)];
+    std::pair<ClusterNetId, int> crit_pin = highly_crit_pins[rng.irand(highly_crit_pins.size() - 1)];
     ClusterBlockId b_from = cluster_ctx.clb_nlist.net_driver_block(crit_pin.first);
 
     if (block_locs[b_from].is_fixed) {
@@ -660,22 +663,24 @@ ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from,
                                                int& pin_from,
                                                const int logical_blk_type_index,
                                                const PlacerState& placer_state,
+                                               const PlacerCriticalities& placer_criticalities,
                                                vtr::RngContainer& rng) {
-    auto& cluster_ctx = g_vpr_ctx.clustering();
-    auto& place_move_ctx = placer_state.move();
-    auto& block_locs = placer_state.block_locs();
+    const auto& cluster_ctx = g_vpr_ctx.clustering();
+    const auto& block_locs = placer_state.block_locs();
 
     //Initialize critical net and pin to be invalid
     net_from = ClusterNetId::INVALID();
     pin_from = -1;
 
+    const auto& highly_crit_pins = placer_criticalities.get_highly_critical_pins();
+
     //check if any critical block is available
-    if (place_move_ctx.highly_crit_pins.empty()) {
+    if (highly_crit_pins.empty()) {
         return ClusterBlockId::INVALID();
     }
 
     //pick a random highly critical pin and find the nets driver block
-    std::pair<ClusterNetId, int> crit_pin = place_move_ctx.highly_crit_pins[rng.irand(place_move_ctx.highly_crit_pins.size() - 1)];
+    std::pair<ClusterNetId, int> crit_pin = highly_crit_pins[rng.irand(highly_crit_pins.size() - 1)];
     ClusterBlockId b_from = cluster_ctx.clb_nlist.net_driver_block(crit_pin.first);
 
     //Check if picked block type matches with the blk_type specified, and it is not fixed
diff --git a/vpr/src/place/move_utils.h b/vpr/src/place/move_utils.h
index de3d771e7ae..1aa5591f5c8 100644
--- a/vpr/src/place/move_utils.h
+++ b/vpr/src/place/move_utils.h
@@ -7,6 +7,7 @@
 
 class PlacerState;
 class BlkLocRegistry;
+class PlacerCriticalities;
 namespace vtr {
 class RngContainer;
 }
@@ -171,6 +172,7 @@ bool is_legal_swap_to_location(ClusterBlockId blk,
 ClusterBlockId propose_block_to_move(const t_placer_opts& placer_opts,
                                      int& logical_blk_type_index,
                                      bool highly_crit_block,
+                                     const PlacerCriticalities* placer_criticalities,
                                      ClusterNetId* net_from,
                                      int* pin_from,
                                      const PlacerState& placer_state,
@@ -207,6 +209,7 @@ ClusterBlockId pick_from_block(int logical_blk_type_index, vtr::RngContainer& rn
 ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from,
                                                int& pin_from,
                                                const PlacerState& placer_state,
+                                               const PlacerCriticalities& placer_criticalities,
                                                vtr::RngContainer& rng);
 
 /**
@@ -220,6 +223,7 @@ ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from,
                                                int& pin_from,
                                                int logical_blk_type_index,
                                                const PlacerState& placer_state,
+                                               const PlacerCriticalities& placer_criticalities,
                                                vtr::RngContainer& rng);
 
 bool find_to_loc_uniform(t_logical_block_type_ptr type,
diff --git a/vpr/src/place/placer_state.h b/vpr/src/place/placer_state.h
index 8f3b966a56d..35f1ec73766 100644
--- a/vpr/src/place/placer_state.h
+++ b/vpr/src/place/placer_state.h
@@ -145,9 +145,6 @@ struct PlacerMoveContext : public Context {
     std::vector<int> X_coord;
     std::vector<int> Y_coord;
     std::vector<int> layer_coord;
-
-    // Container to save the highly critical pins (higher than a timing criticality limit set by commandline option)
-    std::vector<std::pair<ClusterNetId, int>> highly_crit_pins;
 };
 
 
diff --git a/vpr/src/place/timing/PlacerCriticalities.cpp b/vpr/src/place/timing/PlacerCriticalities.cpp
index 8aa248abab6..ccf1028283c 100644
--- a/vpr/src/place/timing/PlacerCriticalities.cpp
+++ b/vpr/src/place/timing/PlacerCriticalities.cpp
@@ -3,9 +3,7 @@
 
 #include "timing_info.h"
 #include "timing_util.h"
-#include "placer_state.h"
 
-///@brief Allocates space for the timing_place_crit_ data structure.
 PlacerCriticalities::PlacerCriticalities(const ClusteredNetlist& clb_nlist,
                                          const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
                                          std::shared_ptr<const SetupTimingInfo> timing_info)
@@ -25,55 +23,51 @@ PlacerCriticalities::PlacerCriticalities(const ClusteredNetlist& clb_nlist,
  *
  * If the criticality exponent has changed, we also need to update from scratch.
  */
-void PlacerCriticalities::update_criticalities(const PlaceCritParams& crit_params,
-                                               PlacerState& placer_state) {
-    /* If update is not enabled, exit the routine. */
+void PlacerCriticalities::update_criticalities(const PlaceCritParams& crit_params) {
+    // If update is not enabled, exit the routine.
     if (!update_enabled) {
-        /* re-computation is required on the next iteration */
+        // re-computation is required on the next iteration
         recompute_required = true;
         return;
     }
 
-    /* Determine what pins need updating */
+    // Determine what pins need updating
     if (!recompute_required && crit_params.crit_exponent == last_crit_exponent_) {
         incr_update_criticalities();
     } else {
         recompute_criticalities();
 
-        /* Record new criticality exponent */
+        // Record new criticality exponent
         last_crit_exponent_ = crit_params.crit_exponent;
     }
 
-    auto& place_move_ctx = placer_state.mutable_move();
-
     /* Performs a 1-to-1 mapping from criticality to timing_place_crit_.
      * For every pin on every net (or, equivalently, for every tedge ending
      * in that pin), timing_place_crit_ = criticality^(criticality exponent) */
 
-    /* Update the affected pins */
+    // Update the affected pins
     for (ClusterPinId clb_pin : cluster_pins_with_modified_criticality_) {
         ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin);
         int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin);
-        // Routing for placement is not flat (at least for the time being)
-        float clb_pin_crit = calculate_clb_net_pin_criticality(*timing_info_, pin_lookup_, ParentPinId(size_t(clb_pin)), /*is_flat=*/false);
 
+        float clb_pin_crit = calculate_clb_net_pin_criticality(*timing_info_, pin_lookup_, ParentPinId(size_t(clb_pin)), /*is_flat=*/false);
         float new_crit = pow(clb_pin_crit, crit_params.crit_exponent);
-        /*
-         * Update the highly critical pins container
+
+        /* Update the highly critical pins container
          *
          * If the old criticality < limit and the new criticality > limit --> add this pin to the highly critical pins
          * If the old criticality > limit and the new criticality < limit --> remove this pin from the highly critical pins
          */
         if (!first_time_update_criticality) {
             if (new_crit > crit_params.crit_limit && timing_place_crit_[clb_net][pin_index_in_net] < crit_params.crit_limit) {
-                place_move_ctx.highly_crit_pins.emplace_back(clb_net, pin_index_in_net);
+                highly_crit_pins.emplace_back(clb_net, pin_index_in_net);
             } else if (new_crit < crit_params.crit_limit && timing_place_crit_[clb_net][pin_index_in_net] > crit_params.crit_limit) {
-                place_move_ctx.highly_crit_pins.erase(std::remove(place_move_ctx.highly_crit_pins.begin(), place_move_ctx.highly_crit_pins.end(), std::make_pair(clb_net, pin_index_in_net)),
-                                                      place_move_ctx.highly_crit_pins.end());
+                highly_crit_pins.erase(std::remove(highly_crit_pins.begin(), highly_crit_pins.end(), std::make_pair(clb_net, pin_index_in_net)),
+                                       highly_crit_pins.end());
             }
         } else {
             if (new_crit > crit_params.crit_limit) {
-                place_move_ctx.highly_crit_pins.emplace_back(clb_net, pin_index_in_net);
+                highly_crit_pins.emplace_back(clb_net, pin_index_in_net);
             }
         }
 
@@ -94,42 +88,25 @@ void PlacerCriticalities::set_recompute_required() {
     recompute_required = true;
 }
 
-/**
- * @brief Collect the cluster pins which need to be updated based on the latest timing
- *        analysis so that incremental updates to criticalities can be performed.
- *
- * Note we use the set of pins reported by the *timing_info* as having modified
- * criticality, rather than those marked as modified by the timing analyzer.
- *
- * Since timing_info uses shifted/relaxed criticality (which depends on max required
- * time and worst case slacks), additional nodes may be modified when updating the
- * atom pin criticalities.
- */
-
 void PlacerCriticalities::incr_update_criticalities() {
     cluster_pins_with_modified_criticality_.clear();
 
     for (AtomPinId atom_pin : timing_info_->pins_with_modified_setup_criticality()) {
         ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin);
 
-        //Some atom pins correspond to connections which are completely
-        //contained within a cluster, and hence have no corresponding
-        //clustered pin.
+        /* Some atom pins correspond to connections which are completely
+         * contained within a cluster, and hence have no corresponding
+         * clustered pin. */
         if (!clb_pin) continue;
 
         cluster_pins_with_modified_criticality_.insert(clb_pin);
     }
 }
 
-/**
- * @brief Collect all the sink pins in the netlist and prepare them update.
- *
- * For the incremental version, see PlacerCriticalities::incr_update_criticalities().
- */
 void PlacerCriticalities::recompute_criticalities() {
     cluster_pins_with_modified_criticality_.clear();
 
-    /* Non-incremental: all sink pins need updating */
+    // Non-incremental: all sink pins need updating
     for (ClusterNetId net_id : clb_nlist_.nets()) {
         for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) {
             cluster_pins_with_modified_criticality_.insert(pin_id);
@@ -145,10 +122,6 @@ void PlacerCriticalities::set_criticality(ClusterNetId net_id, int ipin, float c
     timing_place_crit_[net_id][ipin] = crit_val;
 }
 
-/**
- * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) which
- *        were modified by the last call to PlacerCriticalities::update_criticalities().
- */
 PlacerCriticalities::pin_range PlacerCriticalities::pins_with_modified_criticality() const {
     return vtr::make_range(cluster_pins_with_modified_criticality_);
-}
\ No newline at end of file
+}
diff --git a/vpr/src/place/timing/PlacerCriticalities.h b/vpr/src/place/timing/PlacerCriticalities.h
index 7f7a1975ff2..4a6c5518eb2 100644
--- a/vpr/src/place/timing/PlacerCriticalities.h
+++ b/vpr/src/place/timing/PlacerCriticalities.h
@@ -67,9 +67,12 @@ class PlacerCriticalities {
     typedef vtr::Range<net_iterator> net_range;
 
   public: //Lifetime
+
+    ///@brief Allocates space for the timing_place_crit_ data structure.
     PlacerCriticalities(const ClusteredNetlist& clb_nlist,
                         const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
                         std::shared_ptr<const SetupTimingInfo> timing_info);
+
     PlacerCriticalities(const PlacerCriticalities&) = delete;
     PlacerCriticalities& operator=(const PlacerCriticalities&) = delete;
 
@@ -83,9 +86,12 @@ class PlacerCriticalities {
      */
     pin_range pins_with_modified_criticality() const;
 
+    /// @brief Returns a constant reference to highly critical pins
+    const std::vector<std::pair<ClusterNetId, int>>& get_highly_critical_pins() const { return highly_crit_pins; }
+
   public: //Modifiers
     /**
-     * @brief Updates criticalities based on the atom netlist criticalitites
+     * @brief Updates criticalities based on the atom netlist criticalities
      *        provided by timing_info and the provided criticality_exponent.
      *
      * Should consistently call this method after the most recent timing analysis to
@@ -93,13 +99,16 @@ class PlacerCriticalities {
      * If out of sync, then the criticalities cannot be incrementally updated on
      * during the next timing analysis iteration.
      */
-    void update_criticalities(const PlaceCritParams& crit_params,
-                              PlacerState& placer_state);
+    void update_criticalities(const PlaceCritParams& crit_params);
 
     ///@bried Enable the recompute_required flag to enforce from scratch update.
     void set_recompute_required();
 
-    ///@brief From scratch update. See timing_place.cpp for more.
+    /**
+     * @brief Collect all the sink pins in the netlist and prepare them for update.
+     *
+     * For the incremental version, see PlacerCriticalities::incr_update_criticalities().
+     */
     void recompute_criticalities();
 
     ///@brief Override the criticality of a particular connection.
@@ -134,10 +143,20 @@ class PlacerCriticalities {
      */
     float last_crit_exponent_ = std::numeric_limits<float>::quiet_NaN();
 
-    ///@brief Set of pins with criticaltites modified by last call to update_criticalities().
+    ///@brief Set of pins with criticalities modified by last call to update_criticalities().
     vtr::vec_id_set<ClusterPinId> cluster_pins_with_modified_criticality_;
 
-    ///@brief Incremental update. See timing_place.cpp for more.
+    /**
+     * @brief Collect the cluster pins which need to be updated based on the latest timing
+     *        analysis so that incremental updates to criticalities can be performed.
+     *
+     * Note we use the set of pins reported by the *timing_info* as having modified
+     * criticality, rather than those marked as modified by the timing analyzer.
+     *
+     * Since timing_info uses shifted/relaxed criticality (which depends on max required
+     * time and worst case slacks), additional nodes may be modified when updating the
+     * atom pin criticalities.
+     */
     void incr_update_criticalities();
 
     ///@brief Flag that turns on/off the update_criticalities() routine.
@@ -157,4 +176,7 @@ class PlacerCriticalities {
      * This can be used for incremental criticality update and also incrementally update the highly critical pins
      */
     bool first_time_update_criticality = true;
+
+    /// @brief Saves the highly critical pins (higher than a timing criticality limit set by commandline option)
+    std::vector<std::pair<ClusterNetId, int>> highly_crit_pins;
 };
diff --git a/vpr/src/place/timing/place_timing_update.cpp b/vpr/src/place/timing/place_timing_update.cpp
index 00cad07da7b..246db01f97d 100644
--- a/vpr/src/place/timing/place_timing_update.cpp
+++ b/vpr/src/place/timing/place_timing_update.cpp
@@ -99,8 +99,7 @@ void perform_full_timing_update(const PlaceCritParams& crit_params,
                           timing_info,
                           criticalities,
                           setup_slacks,
-                          pin_timing_invalidator,
-                          placer_state);
+                          pin_timing_invalidator);
 
     /* Update the timing cost with new connection criticalities. */
     update_timing_cost(delay_model,
@@ -141,13 +140,12 @@ void update_timing_classes(const PlaceCritParams& crit_params,
                            SetupTimingInfo* timing_info,
                            PlacerCriticalities* criticalities,
                            PlacerSetupSlacks* setup_slacks,
-                           NetPinTimingInvalidator* pin_timing_invalidator,
-                           PlacerState& placer_state) {
+                           NetPinTimingInvalidator* pin_timing_invalidator) {
     /* Run STA to update slacks and adjusted/relaxed criticalities. */
     timing_info->update();
 
     /* Update the placer's criticalities (e.g. sharpen with crit_exponent). */
-    criticalities->update_criticalities(crit_params, placer_state);
+    criticalities->update_criticalities(crit_params);
 
     /* Update the placer's raw setup slacks. */
     setup_slacks->update_setup_slacks();
diff --git a/vpr/src/place/timing/place_timing_update.h b/vpr/src/place/timing/place_timing_update.h
index 6ced93e4487..8e7a0dc1f46 100644
--- a/vpr/src/place/timing/place_timing_update.h
+++ b/vpr/src/place/timing/place_timing_update.h
@@ -39,8 +39,7 @@ void update_timing_classes(const PlaceCritParams& crit_params,
                            SetupTimingInfo* timing_info,
                            PlacerCriticalities* criticalities,
                            PlacerSetupSlacks* setup_slacks,
-                           NetPinTimingInvalidator* pin_timing_invalidator,
-                           PlacerState& placer_state);
+                           NetPinTimingInvalidator* pin_timing_invalidator);
 
 ///@brief Updates the timing driven (td) costs.
 void update_timing_cost(const PlaceDelayModel* delay_model,

From 871b2891e3fd9eb35c597aa16014410de2a20b2a Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Sun, 1 Dec 2024 19:02:53 -0500
Subject: [PATCH 20/39] last commit before I go home it doesn't compile

---
 vpr/src/place/timing/PlacerCriticalities.cpp |   4 +-
 vpr/src/place/timing/PlacerSetupSlacks.cpp   |  33 +-
 vpr/src/place/timing/PlacerSetupSlacks.h     |  16 +-
 vpr/src/place/timing/PlacerTimingCosts.cpp   |  60 ++++
 vpr/src/place/timing/PlacerTimingCosts.h     | 303 ++++++++++++++++
 vpr/src/place/timing/timing_place.h          | 358 -------------------
 6 files changed, 387 insertions(+), 387 deletions(-)
 create mode 100644 vpr/src/place/timing/PlacerTimingCosts.cpp
 create mode 100644 vpr/src/place/timing/PlacerTimingCosts.h

diff --git a/vpr/src/place/timing/PlacerCriticalities.cpp b/vpr/src/place/timing/PlacerCriticalities.cpp
index ccf1028283c..1f2e4f518e9 100644
--- a/vpr/src/place/timing/PlacerCriticalities.cpp
+++ b/vpr/src/place/timing/PlacerCriticalities.cpp
@@ -77,8 +77,8 @@ void PlacerCriticalities::update_criticalities(const PlaceCritParams& crit_param
         timing_place_crit_[clb_net][pin_index_in_net] = new_crit;
     }
 
-    /* Criticalities updated. In sync with timing info.   */
-    /* Can be incrementally updated on the next iteration */
+    /* Criticalities updated. In sync with timing info.
+     * Can be incrementally updated on the next iteration */
     recompute_required = false;
 
     first_time_update_criticality = false;
diff --git a/vpr/src/place/timing/PlacerSetupSlacks.cpp b/vpr/src/place/timing/PlacerSetupSlacks.cpp
index ffc637f423b..3a097a582ff 100644
--- a/vpr/src/place/timing/PlacerSetupSlacks.cpp
+++ b/vpr/src/place/timing/PlacerSetupSlacks.cpp
@@ -4,7 +4,7 @@
 #include "timing_util.h"
 #include "timing_info.h"
 
-///@brief Allocates space for the timing_place_setup_slacks_ data structure.
+
 PlacerSetupSlacks::PlacerSetupSlacks(const ClusteredNetlist& clb_nlist,
                                      const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
                                      std::shared_ptr<const SetupTimingInfo> timing_info)
@@ -25,21 +25,21 @@ PlacerSetupSlacks::PlacerSetupSlacks(const ClusteredNetlist& clb_nlist,
  * from scratch.
  */
 void PlacerSetupSlacks::update_setup_slacks() {
-    /* If update is not enabled, exit the routine. */
+    // If update is not enabled, exit the routine.
     if (!update_enabled) {
-        /* re-computation is required on the next iteration */
+        // re-computation is required on the next iteration
         recompute_required = true;
         return;
     }
 
-    /* Determine what pins need updating */
+    // Determine what pins need updating
     if (!recompute_required) {
         incr_update_setup_slacks();
     } else {
         recompute_setup_slacks();
     }
 
-    /* Update the affected pins */
+    // Update the affected pins
     for (ClusterPinId clb_pin : cluster_pins_with_modified_setup_slack_) {
         ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin);
         int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin);
@@ -49,18 +49,11 @@ void PlacerSetupSlacks::update_setup_slacks() {
         timing_place_setup_slacks_[clb_net][pin_index_in_net] = clb_pin_setup_slack;
     }
 
-    /* Setup slacks updated. In sync with timing info.     */
-    /* Can be incrementally updated on the next iteration. */
+    /* Setup slacks updated. In sync with timing info.
+     * Can be incrementally updated on the next iteration. */
     recompute_required = false;
 }
 
-/**
- * @brief Collect the cluster pins which need to be updated based on the latest timing
- *        analysis so that incremental updates to setup slacks can be performed.
- *
- * Note we use the set of pins reported by the *timing_info* as having modified
- * setup slacks, rather than those marked as modified by the timing analyzer.
- */
 void PlacerSetupSlacks::incr_update_setup_slacks() {
     cluster_pins_with_modified_setup_slack_.clear();
 
@@ -76,15 +69,10 @@ void PlacerSetupSlacks::incr_update_setup_slacks() {
     }
 }
 
-/**
- * @brief Collect all the sink pins in the netlist and prepare them update.
- *
- * For the incremental version, see PlacerSetupSlacks::incr_update_setup_slacks().
- */
 void PlacerSetupSlacks::recompute_setup_slacks() {
     cluster_pins_with_modified_setup_slack_.clear();
 
-    /* Non-incremental: all sink pins need updating */
+    // Non-incremental: all sink pins need updating
     for (ClusterNetId net_id : clb_nlist_.nets()) {
         for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) {
             cluster_pins_with_modified_setup_slack_.insert(pin_id);
@@ -92,7 +80,6 @@ void PlacerSetupSlacks::recompute_setup_slacks() {
     }
 }
 
-///@brief Override the setup slack of a particular connection.
 void PlacerSetupSlacks::set_setup_slack(ClusterNetId net_id, int ipin, float slack_val) {
     VTR_ASSERT_SAFE_MSG(ipin > 0, "The pin should not be a driver pin (ipin != 0)");
     VTR_ASSERT_SAFE_MSG(ipin < int(clb_nlist_.net_pins(net_id).size()), "The pin index in net should be smaller than fanout");
@@ -100,10 +87,6 @@ void PlacerSetupSlacks::set_setup_slack(ClusterNetId net_id, int ipin, float sla
     timing_place_setup_slacks_[net_id][ipin] = slack_val;
 }
 
-/**
- * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds)
- *        which were modified by the last call to PlacerSetupSlacks::update_setup_slacks().
- */
 PlacerSetupSlacks::pin_range PlacerSetupSlacks::pins_with_modified_setup_slack() const {
     return vtr::make_range(cluster_pins_with_modified_setup_slack_);
 }
diff --git a/vpr/src/place/timing/PlacerSetupSlacks.h b/vpr/src/place/timing/PlacerSetupSlacks.h
index 580a26db2c2..7ffc450e94b 100644
--- a/vpr/src/place/timing/PlacerSetupSlacks.h
+++ b/vpr/src/place/timing/PlacerSetupSlacks.h
@@ -33,9 +33,11 @@ class PlacerSetupSlacks {
     typedef vtr::Range<net_iterator> net_range;
 
   public: //Lifetime
+    ///@brief Allocates space for the timing_place_setup_slacks_ data structure.
     PlacerSetupSlacks(const ClusteredNetlist& clb_nlist,
                       const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
                       std::shared_ptr<const SetupTimingInfo> timing_info);
+
     PlacerSetupSlacks(const PlacerSetupSlacks& clb_nlist) = delete;
     PlacerSetupSlacks& operator=(const PlacerSetupSlacks& clb_nlist) = delete;
 
@@ -88,10 +90,20 @@ class PlacerSetupSlacks {
     ///@brief Set of pins with raw setup slacks modified by last call to update_setup_slacks()
     vtr::vec_id_set<ClusterPinId> cluster_pins_with_modified_setup_slack_;
 
-    ///@brief Incremental update. See timing_place.cpp for more.
+    /**
+     * @brief Collect the cluster pins which need to be updated based on the latest timing
+     *        analysis so that incremental updates to setup slacks can be performed.
+     *
+     * Note we use the set of pins reported by the *timing_info* as having modified
+     * setup slacks, rather than those marked as modified by the timing analyzer.
+     */
     void incr_update_setup_slacks();
 
-    ///@brief Incremental update. See timing_place.cpp for more.
+    /**
+     * @brief Collect all the sink pins in the netlist and prepare them for update.
+     *
+     * For the incremental version, see PlacerSetupSlacks::incr_update_setup_slacks().
+     */
     void recompute_setup_slacks();
 
     ///@brief Flag that turns on/off the update_setup_slacks() routine.
diff --git a/vpr/src/place/timing/PlacerTimingCosts.cpp b/vpr/src/place/timing/PlacerTimingCosts.cpp
new file mode 100644
index 00000000000..c7fe35fc020
--- /dev/null
+++ b/vpr/src/place/timing/PlacerTimingCosts.cpp
@@ -0,0 +1,60 @@
+
+#include "PlacerTimingCosts.h"
+
+
+PlacerTimingCosts::PlacerTimingCosts(const ClusteredNetlist& nlist) {
+    auto nets = nlist.nets();
+
+    net_start_indicies_.resize(nets.size());
+
+    // Walk through the netlist to determine how many connections there are.
+    size_t iconn = 0;
+    for (ClusterNetId net : nets) {
+        // The placer always skips 'ignored' nets, so they don't affect timing
+        // costs, so we also skip them here
+        if (nlist.net_is_ignored(net)) {
+            net_start_indicies_[net] = OPEN;
+            continue;
+        }
+
+        // Save the starting index of the current net's connections.
+        // We use a -1 offset, since sinks are indexed from [1..num_net_pins-1]
+        // (there is no timing cost associated with net drivers)
+        net_start_indicies_[net] = iconn - 1;
+
+        // Reserve space for all this net's connections
+        iconn += nlist.net_sinks(net).size();
+    }
+
+    const size_t num_connections = iconn;
+
+    // Determine how many binary tree levels we need to have a leaf for each connection cost
+    size_t ilevel = 0;
+    while (num_nodes_in_level(ilevel) < num_connections) {
+        ++ilevel;
+    }
+    num_levels_ = ilevel + 1;
+
+    size_t num_leaves = num_nodes_in_level(ilevel);
+    size_t num_nodes_in_previous_level = num_nodes_in_level(ilevel - 1);
+
+    VTR_ASSERT_MSG(num_leaves >= num_connections, "Need at least as many leaves as connections");
+    VTR_ASSERT_MSG(num_connections == 0 || num_nodes_in_previous_level < num_connections,
+                   "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)");
+
+    // We don't need to store all possible leaves if we have fewer connections (i.e. bottom-right of tree is empty)
+    size_t last_level_unused_nodes = num_nodes_in_level(ilevel) - num_connections;
+    size_t num_nodes = num_nodes_up_to_level(ilevel) - last_level_unused_nodes;
+
+    // Reserve space for connection costs and intermediate node values
+    connection_costs_ = std::vector<double>(num_nodes, std::numeric_limits<double>::quiet_NaN());
+
+    // The net start indices we calculated earlier didn't account for intermediate binary tree nodes
+    // Shift the start indices after the intermediate nodes
+    size_t num_intermediate_nodes = num_nodes_up_to_level(ilevel - 1);
+    for (ClusterNetId net : nets) {
+        if (nlist.net_is_ignored(net)) continue;
+
+        net_start_indicies_[net] = net_start_indicies_[net] + num_intermediate_nodes;
+    }
+}
diff --git a/vpr/src/place/timing/PlacerTimingCosts.h b/vpr/src/place/timing/PlacerTimingCosts.h
new file mode 100644
index 00000000000..f84f4446466
--- /dev/null
+++ b/vpr/src/place/timing/PlacerTimingCosts.h
@@ -0,0 +1,303 @@
+
+#pragma once
+#include "vtr_vec_id_set.h"
+#include "timing_info_fwd.h"
+#include "clustered_netlist_utils.h"
+#include "place_delay_model.h"
+#include "vpr_net_pins_matrix.h"
+
+/**
+ * @brief PlacerTimingCosts mimics a 2D array of connection timing costs running from:
+ *        [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1].
+ *
+ * It can be used similar to:
+ *
+ *      PlacerTimingCosts connection_timing_costs(cluster_ctx.clb_nlist); //Construct
+ *
+ *      //...
+ *
+ *      //Modify a connection cost
+ *      connection_timing_costs[net_id][ipin] = new_cost;
+ *
+ *      //Potentially other modifications...
+ *
+ *      //Calculate the updated timing cost, of all connections,
+ *      //incrementally based on modifications
+ *      double total_timing_cost = connection_timing_costs.total_cost();
+ *
+ * However behind the scenes PlacerTimingCosts tracks when connection costs are modified,
+ * and efficiently re-calculates the total timing cost incrementally based on the connections
+ * which have had their cost modified.
+ *
+ * Implementation
+ * ==============
+ * Internally, PlacerTimingCosts stores all connection costs in a flat array in the last part
+ * of connection_costs_.  To mimic 2d-array like access PlacerTimingCosts also uses two proxy
+ * classes which allow indexing in the net and pin dimensions (NetProxy and ConnectionProxy
+ * respectively).
+ *
+ * The first part of connection_costs_ stores intermediate sums of the connection costs for
+ * efficient incremental re-calculation. More concretely, connection_costs_ stores a binary
+ * tree, where leaves correspond to individual connection costs and intermediate nodes the
+ * partial sums of the connection costs. (The binary tree is stored implicitly in the
+ * connection_costs_ vector, using Eytzinger's/BFS layout.) By summing the entire binary
+ * tree we calculate the total timing cost over all connections.
+ *
+ * Using a binary tree allows us to efficiently re-calculate the timing costs when only a subset
+ * of connections are changed. This is done by 'invalidating' intermediate nodes (from leaves up
+ * to the root) which have descendants (leaves) with modified connection costs. When the
+ * total_cost() method is called, it recursively walks the binary tree to re-calculate the cost.
+ * Only invalidated nodes are traversed, with valid nodes just returning their previously
+ * calculated (and unchanged) value.
+ *
+ * For a circuit with 'K' connections, of which 'k' have changed (typically k << K), this can
+ * be done in O(k log K) time.
+ *
+ * It is important to note that due to limited floating point precision, floating point
+ * arithmetic has an order dependence (due to round-off). Using a binary tree to total
+ * the timing connection costs allows us to incrementally update the total timing cost while
+ * maintaining the *same order of operations* as if it was re-computed from scratch. This
+ * ensures we *always* get consistent results regardless of what/when connections are changed.
+ *
+ * Proxy Classes
+ * =============
+ * NetProxy is returned by PlacerTimingCosts's operator[], and stores a pointer to the start of
+ * internal storage of that net's connection costs.
+ *
+ * ConnectionProxy is returned by NetProxy's operator[], and holds a reference to a particular
+ * element of the internal storage pertaining to a specific connection's cost. ConnectionProxy
+ * supports assignment, allowing clients to modify the connection cost. It also detects if the
+ * assigned value differs from the previous value and if so, calls PlacerTimingCosts's
+ * invalidate() method on that connection cost.
+ *
+ * PlacerTimingCosts's invalidate() method marks the cost element's ancestors as invalid (NaN)
+ * so they will be re-calculated by PlacerTimingCosts' total_cost() method.
+ */
+class PlacerTimingCosts {
+  public:
+    PlacerTimingCosts() = default;
+
+    PlacerTimingCosts(const ClusteredNetlist& nlist);
+
+    /**
+     * @brief Proxy class representing a connection cost.
+     *
+     * Supports modification of connection cost while detecting
+     * changes and reporting them up to PlacerTimingCosts.
+     */
+    class ConnectionProxy {
+      public:
+        ConnectionProxy(PlacerTimingCosts* timing_costs, double& connection_cost)
+            : timing_costs_(timing_costs)
+            , connection_cost_(connection_cost) {}
+
+        ///@brief Allow clients to modify the connection cost via assignment.
+        ConnectionProxy& operator=(double new_cost) {
+            if (new_cost != connection_cost_) {
+                //If connection cost changed, update it, and mark it
+                //as invalidated
+                connection_cost_ = new_cost;
+                timing_costs_->invalidate(&connection_cost_);
+            }
+            return *this;
+        }
+
+        /**
+         * @brief Support getting the current connection cost as a double.
+         *
+         * Useful for client code operating on the cost values (e.g. difference between costs).
+         */
+        operator double() const {
+            return connection_cost_;
+        }
+
+      private:
+        PlacerTimingCosts* timing_costs_;
+        double& connection_cost_;
+    };
+
+    /**
+     * @brief Proxy class representing the connection costs of a net.
+     *
+     * Supports indexing by pin index to retrieve the ConnectionProxy for that pin/connection.
+     */
+    class NetProxy {
+      public:
+        NetProxy(PlacerTimingCosts* timing_costs, double* net_sink_costs)
+            : timing_costs_(timing_costs)
+            , net_sink_costs_(net_sink_costs) {}
+
+        ///@brief Indexes into the specific net pin/connection.
+        ConnectionProxy operator[](size_t ipin) {
+            return ConnectionProxy(timing_costs_, net_sink_costs_[ipin]);
+        }
+
+        const ConnectionProxy operator[](size_t ipin) const {
+            return ConnectionProxy(timing_costs_, net_sink_costs_[ipin]);
+        }
+
+      private:
+        PlacerTimingCosts* timing_costs_;
+        double* net_sink_costs_;
+    };
+
+    ///@brief Indexes into the specific net.
+    NetProxy operator[](ClusterNetId net_id) {
+        VTR_ASSERT_SAFE(net_start_indicies_[net_id] >= 0);
+
+        double* net_connection_costs = &connection_costs_[net_start_indicies_[net_id]];
+        return NetProxy(this, net_connection_costs);
+    }
+
+    NetProxy operator[](ClusterNetId net_id) const {
+        VTR_ASSERT_SAFE(net_start_indicies_[net_id] >= 0);
+
+        const double* net_connection_costs = &connection_costs_[net_start_indicies_[net_id]];
+        return NetProxy(const_cast<PlacerTimingCosts*>(this), const_cast<double*>(net_connection_costs));
+    }
+
+    void clear() {
+        connection_costs_.clear();
+        net_start_indicies_.clear();
+    }
+
+    void swap(PlacerTimingCosts& other) {
+        std::swap(connection_costs_, other.connection_costs_);
+        std::swap(net_start_indicies_, other.net_start_indicies_);
+        std::swap(num_levels_, other.num_levels_);
+    }
+
+    /**
+     * @brief Calculates the total cost of all connections efficiently
+     *        in the face of modified connection costs.
+     */
+    double total_cost() {
+        float cost = total_cost_recurr(0); //Root
+
+        VTR_ASSERT_DEBUG_MSG(cost == total_cost_from_scratch(0),
+                             "Expected incremental and from-scratch costs to be consistent");
+
+        return cost;
+    }
+
+  private:
+    ///@brief Recursively calculate and update the timing cost rooted at inode.
+    double total_cost_recurr(size_t inode) {
+        //Prune out-of-tree
+        if (inode > connection_costs_.size() - 1) {
+            return 0.;
+        }
+
+        //Valid pre-calculated intermediate result or valid leaf
+        if (!std::isnan(connection_costs_[inode])) {
+            return connection_costs_[inode];
+        }
+
+        //Recompute recursively
+        double node_cost = total_cost_recurr(left_child(inode))
+                           + total_cost_recurr(right_child(inode));
+
+        //Save intermediate cost at this node
+        connection_costs_[inode] = node_cost;
+
+        return node_cost;
+    }
+
+    double total_cost_from_scratch(size_t inode) const {
+        //Prune out-of-tree
+        if (inode > connection_costs_.size() - 1) {
+            return 0.;
+        }
+
+        //Recompute recursively
+        double node_cost = total_cost_from_scratch(left_child(inode))
+                           + total_cost_from_scratch(right_child(inode));
+
+        return node_cost;
+    }
+
+    ///@brief Friend-ed so it can call invalidate().
+    friend ConnectionProxy;
+
+    void invalidate(const double* invalidated_cost) {
+        //Check pointer within range of internal storage
+        VTR_ASSERT_SAFE_MSG(
+            invalidated_cost >= &connection_costs_[0],
+            "Connection cost pointer should be after start of internal storage");
+
+        VTR_ASSERT_SAFE_MSG(
+            invalidated_cost <= &connection_costs_[connection_costs_.size() - 1],
+            "Connection cost pointer should be before end of internal storage");
+
+        size_t icost = invalidated_cost - &connection_costs_[0];
+
+        VTR_ASSERT_SAFE(icost >= num_nodes_up_to_level(num_levels_ - 2));
+
+        //Invalidate parent intermediate costs up to root or first
+        //already-invalidated parent
+        size_t iparent = parent(icost);
+
+        while (!std::isnan(connection_costs_[iparent])) {
+            //Invalidate
+            connection_costs_[iparent] = std::numeric_limits<double>::quiet_NaN();
+
+            if (iparent == 0) {
+                break; //At root
+            } else {
+                //Next parent
+                iparent = parent(iparent);
+            }
+        }
+
+        VTR_ASSERT_SAFE_MSG(std::isnan(connection_costs_[0]), "Invalidating any connection should have invalidated the root");
+    }
+
+    size_t left_child(size_t i) const {
+        return 2 * i + 1;
+    }
+
+    size_t right_child(size_t i) const {
+        return 2 * i + 2;
+    }
+
+    size_t parent(size_t i) const {
+        return (i - 1) / 2;
+    }
+
+    /**
+     * @brief Returns the number of nodes in ilevel'th level.
+     *
+     * If ilevel is negative, return 0, since the root shouldn't
+     * be counted as a leaf node candidate.
+     */
+    size_t num_nodes_in_level(int ilevel) const {
+        return ilevel < 0 ? 0 : (2 << (ilevel));
+    }
+
+    ///@brief Returns the total number of nodes in levels [0..ilevel] (inclusive).
+    size_t num_nodes_up_to_level(int ilevel) const {
+        return (2 << (ilevel + 1)) - 1;
+    }
+
+  private:
+    /**
+     * @brief Vector storing the implicit binary tree of connection costs.
+     *
+     * The actual connections are stored at the end of the vector
+     * (last level of the binary tree). The earlier portions of
+     * the tree are the intermediate nodes.
+     *
+     * The methods left_child()/right_child()/parent() can be used
+     * to traverse the tree by indices into this vector.
+     */
+    std::vector<double> connection_costs_;
+
+    /**
+     * @brief Vector storing the indices of the first connection
+     *        for each net in the netlist, used for indexing by net.
+     */
+    vtr::vector<ClusterNetId, int> net_start_indicies_;
+
+    ///@brief Number of levels in the binary tree.
+    size_t num_levels_ = 0;
+};
diff --git a/vpr/src/place/timing/timing_place.h b/vpr/src/place/timing/timing_place.h
index bd85061065f..54641947803 100644
--- a/vpr/src/place/timing/timing_place.h
+++ b/vpr/src/place/timing/timing_place.h
@@ -34,363 +34,5 @@
  *      calc_relaxed_criticality() in `timing_util.cpp`.
  */
 
-#pragma once
-#include "vtr_vec_id_set.h"
-#include "timing_info_fwd.h"
-#include "clustered_netlist_utils.h"
-#include "place_delay_model.h"
-#include "vpr_net_pins_matrix.h"
 
-/**
- * @brief PlacerTimingCosts mimics a 2D array of connection timing costs running from:
- *        [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1].
- *
- * It can be used similar to:
- *
- *      PlacerTimingCosts connection_timing_costs(cluster_ctx.clb_nlist); //Construct
- *
- *      //...
- *
- *      //Modify a connection cost
- *      connection_timing_costs[net_id][ipin] = new_cost;
- *
- *      //Potentially other modifications...
- *
- *      //Calculate the updated timing cost, of all connections,
- *      //incrementally based on modifications
- *      float total_timing_cost = connection_timing_costs.total_cost();
- *
- * However behind the scenes PlacerTimingCosts tracks when connection costs are modified,
- * and efficiently re-calculates the total timing cost incrementally based on the connections
- * which have had their cost modified.
- *
- * Implementation
- * ==============
- * Internally, PlacerTimingCosts stores all connection costs in a flat array in the last part
- * of connection_costs_.  To mimic 2d-array like access PlacerTimingCosts also uses two proxy
- * classes which allow indexing in the net and pin dimensions (NetProxy and ConnectionProxy
- * respectively).
- *
- * The first part of connection_costs_ stores intermediate sums of the connection costs for
- * efficient incremental re-calculation. More concretely, connection_costs_ stores a binary
- * tree, where leaves correspond to individual connection costs and intermediate nodes the
- * partial sums of the connection costs. (The binary tree is stored implicitly in the
- * connection_costs_  vector, using Eytzinger's/BFS layout.) By summing the entire binary
- * tree we calculate the total timing cost over all connections.
- *
- * Using a binary tree allows us to efficiently re-calculate the timing costs when only a subset
- * of connections are changed. This is done by 'invalidating' intermediate nodes (from leaves up
- * to the root) which have ancestors (leaves) with modified connection costs. When the
- * total_cost() method is called, it recursively walks the binary tree to re-calculate the cost.
- * Only invalidated nodes are traversed, with valid nodes just returning their previously
- * calculated (and unchanged) value.
- *
- * For a circuit with 'K' connections, of which 'k' have changed (typically k << K), this can
- * be done in O(k log K) time.
- *
- * It is important to note that due to limited floating point precision, floating point
- * arithmetic has an order dependence (due to round-off). Using a binary tree to total
- * the timing connection costs allows us to incrementally update the total timing cost while
- * maintianing the *same order of operations* as if it was re-computed from scratch. This
- * ensures we *always* get consistent results regardless of what/when connections are changed.
- *
- * Proxy Classes
- * =============
- * NetProxy is returned by PlacerTimingCost's operator[], and stores a pointer to the start of
- * internal storage of that net's connection costs.
- *
- * ConnectionProxy is returned by NetProxy's operator[], and holds a reference to a particular
- * element of the internal storage pertaining to a specific connection's cost. ConnectionProxy
- * supports assignment, allowing clients to modify the connection cost. It also detects if the
- * assigned value differs from the previous value and if so, calls PlacerTimingCosts's
- * invalidate() method on that connection cost.
- *
- * PlacerTimingCosts's invalidate() method marks the cost element's ancestors as invalid (NaN)
- * so they will be re-calculated by PlacerTimingCosts' total_cost() method.
- */
-class PlacerTimingCosts {
-  public:
-    PlacerTimingCosts() = default;
-
-    PlacerTimingCosts(const ClusteredNetlist& nlist) {
-        auto nets = nlist.nets();
-
-        net_start_indicies_.resize(nets.size());
-
-        //Walk through the netlist to determine how many connections there are.
-        size_t iconn = 0;
-        for (ClusterNetId net : nets) {
-            //The placer always skips 'ignored' nets, so they don't affect timing
-            //costs, so we also skip them here
-            if (nlist.net_is_ignored(net)) {
-                net_start_indicies_[net] = OPEN;
-                continue;
-            }
-
-            //Save the startind index of the current net's connections.
-            // We use a -1 offset, since sinks indexed from [1..num_net_pins-1]
-            // (there is no timing cost associated with net drivers)
-            net_start_indicies_[net] = iconn - 1;
-
-            //Reserve space for all this net's connections
-            iconn += nlist.net_sinks(net).size();
-        }
-
-        size_t num_connections = iconn;
-
-        //Determine how many binary tree levels we need to have a leaf
-        //for each connection cost
-        size_t ilevel = 0;
-        while (num_nodes_in_level(ilevel) < num_connections) {
-            ++ilevel;
-        }
-        num_levels_ = ilevel + 1;
-
-        size_t num_leaves = num_nodes_in_level(ilevel);
-        size_t num_level_before_leaves = num_nodes_in_level(ilevel - 1);
-
-        VTR_ASSERT_MSG(num_leaves >= num_connections, "Need at least as many leaves as connections");
-        VTR_ASSERT_MSG(
-            num_connections == 0 || num_level_before_leaves < num_connections,
-            "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)");
-
-        //We don't need to store all possible leaves if we have fewer connections
-        //(i.e. bottom-right of tree is empty)
-        size_t last_level_unused_nodes = num_nodes_in_level(ilevel) - num_connections;
-        size_t num_nodes = num_nodes_up_to_level(ilevel) - last_level_unused_nodes;
-
-        //Reserve space for connection costs and intermediate node values
-        connection_costs_ = std::vector<double>(num_nodes, std::numeric_limits<double>::quiet_NaN());
-
-        //The net start indicies we calculated earlier didn't account for intermediate binary tree nodes
-        //Shift the start indicies after the intermediate nodes
-        size_t num_intermediate_nodes = num_nodes_up_to_level(ilevel - 1);
-        for (ClusterNetId net : nets) {
-            if (nlist.net_is_ignored(net)) continue;
-
-            net_start_indicies_[net] = net_start_indicies_[net] + num_intermediate_nodes;
-        }
-    }
-
-    /**
-     * @brief Proxy class representing a connection cost.
-     *
-     * Supports modification of connection cost while detecting
-     * changes and reporting them up to PlacerTimingCosts.
-     */
-    class ConnectionProxy {
-      public:
-        ConnectionProxy(PlacerTimingCosts* timing_costs, double& connection_cost)
-            : timing_costs_(timing_costs)
-            , connection_cost_(connection_cost) {}
-
-        ///@brief Allow clients to modify the connection cost via assignment.
-        ConnectionProxy& operator=(double new_cost) {
-            if (new_cost != connection_cost_) {
-                //If connection cost changed, update it, and mark it
-                //as invalidated
-                connection_cost_ = new_cost;
-                timing_costs_->invalidate(&connection_cost_);
-            }
-            return *this;
-        }
-
-        /**
-         * @brief Support getting the current connection cost as a double.
-         *
-         * Useful for client code operating on the cost values (e.g. difference between costs).
-         */
-        operator double() const {
-            return connection_cost_;
-        }
-
-      private:
-        PlacerTimingCosts* timing_costs_;
-        double& connection_cost_;
-    };
-
-    /**
-     * @brief Proxy class representing the connection costs of a net.
-     *
-     * Supports indexing by pin index to retrieve the ConnectionProxy for that pin/connection.
-     */
-    class NetProxy {
-      public:
-        NetProxy(PlacerTimingCosts* timing_costs, double* net_sink_costs)
-            : timing_costs_(timing_costs)
-            , net_sink_costs_(net_sink_costs) {}
-
-        ///@brief Indexes into the specific net pin/connection.
-        ConnectionProxy operator[](size_t ipin) {
-            return ConnectionProxy(timing_costs_, net_sink_costs_[ipin]);
-        }
-
-        const ConnectionProxy operator[](size_t ipin) const {
-            return ConnectionProxy(timing_costs_, net_sink_costs_[ipin]);
-        }
-
-      private:
-        PlacerTimingCosts* timing_costs_;
-        double* net_sink_costs_;
-    };
-
-    ///@brief Indexes into the specific net.
-    NetProxy operator[](ClusterNetId net_id) {
-        VTR_ASSERT_SAFE(net_start_indicies_[net_id] >= 0);
-
-        double* net_connection_costs = &connection_costs_[net_start_indicies_[net_id]];
-        return NetProxy(this, net_connection_costs);
-    }
-
-    NetProxy operator[](ClusterNetId net_id) const {
-        VTR_ASSERT_SAFE(net_start_indicies_[net_id] >= 0);
-
-        const double* net_connection_costs = &connection_costs_[net_start_indicies_[net_id]];
-        return NetProxy(const_cast<PlacerTimingCosts*>(this), const_cast<double*>(net_connection_costs));
-    }
-
-    void clear() {
-        connection_costs_.clear();
-        net_start_indicies_.clear();
-    }
-
-    void swap(PlacerTimingCosts& other) {
-        std::swap(connection_costs_, other.connection_costs_);
-        std::swap(net_start_indicies_, other.net_start_indicies_);
-        std::swap(num_levels_, other.num_levels_);
-    }
-
-    /**
-     * @brief Calculates the total cost of all connections efficiently
-     *        in the face of modified connection costs.
-     */
-    double total_cost() {
-        float cost = total_cost_recurr(0); //Root
-
-        VTR_ASSERT_DEBUG_MSG(cost == total_cost_from_scratch(0),
-                             "Expected incremental and from-scratch costs to be consistent");
-
-        return cost;
-    }
-
-  private:
-    ///@brief Recursively calculate and update the timing cost rooted at inode.
-    double total_cost_recurr(size_t inode) {
-        //Prune out-of-tree
-        if (inode > connection_costs_.size() - 1) {
-            return 0.;
-        }
-
-        //Valid pre-calculated intermediate result or valid leaf
-        if (!std::isnan(connection_costs_[inode])) {
-            return connection_costs_[inode];
-        }
-
-        //Recompute recursively
-        double node_cost = total_cost_recurr(left_child(inode))
-                           + total_cost_recurr(right_child(inode));
-
-        //Save intermediate cost at this node
-        connection_costs_[inode] = node_cost;
-
-        return node_cost;
-    }
-
-    double total_cost_from_scratch(size_t inode) const {
-        //Prune out-of-tree
-        if (inode > connection_costs_.size() - 1) {
-            return 0.;
-        }
-
-        //Recompute recursively
-        double node_cost = total_cost_from_scratch(left_child(inode))
-                           + total_cost_from_scratch(right_child(inode));
-
-        return node_cost;
-    }
-
-    ///@brief Friend-ed so it can call invalidate().
-    friend ConnectionProxy;
-
-    void invalidate(const double* invalidated_cost) {
-        //Check pointer within range of internal storage
-        VTR_ASSERT_SAFE_MSG(
-            invalidated_cost >= &connection_costs_[0],
-            "Connection cost pointer should be after start of internal storage");
-
-        VTR_ASSERT_SAFE_MSG(
-            invalidated_cost <= &connection_costs_[connection_costs_.size() - 1],
-            "Connection cost pointer should be before end of internal storage");
-
-        size_t icost = invalidated_cost - &connection_costs_[0];
-
-        VTR_ASSERT_SAFE(icost >= num_nodes_up_to_level(num_levels_ - 2));
-
-        //Invalidate parent intermediate costs up to root or first
-        //already-invalidated parent
-        size_t iparent = parent(icost);
-
-        while (!std::isnan(connection_costs_[iparent])) {
-            //Invalidate
-            connection_costs_[iparent] = std::numeric_limits<double>::quiet_NaN();
-
-            if (iparent == 0) {
-                break; //At root
-            } else {
-                //Next parent
-                iparent = parent(iparent);
-            }
-        }
-
-        VTR_ASSERT_SAFE_MSG(std::isnan(connection_costs_[0]), "Invalidating any connection should have invalidated the root");
-    }
-
-    size_t left_child(size_t i) const {
-        return 2 * i + 1;
-    }
-
-    size_t right_child(size_t i) const {
-        return 2 * i + 2;
-    }
-
-    size_t parent(size_t i) const {
-        return (i - 1) / 2;
-    }
-
-    /**
-     * @brief Returns the number of nodes in ilevel'th level.
-     *
-     * If ilevel is negative, return 0, since the root shouldn't
-     * be counted as a leaf node candidate.
-     */
-    size_t num_nodes_in_level(int ilevel) const {
-        return ilevel < 0 ? 0 : (2 << (ilevel));
-    }
-
-    ///@brief Returns the total number of nodes in levels [0..ilevel] (inclusive).
-    size_t num_nodes_up_to_level(int ilevel) const {
-        return (2 << (ilevel + 1)) - 1;
-    }
-
-  private:
-    /**
-     * @brief Vector storing the implicit binary tree of connection costs.
-     *
-     * The actual connections are stored at the end of the vector
-     * (last level of the binary tree). The earlier portions of
-     * the tree are the intermediate nodes.
-     *
-     * The methods left_child()/right_child()/parent() can be used
-     * to traverse the tree by indices into this vector.
-     */
-    std::vector<double> connection_costs_;
-
-    /**
-     * @brief Vector storing the indices of the first connection
-     *        for each net in the netlist, used for indexing by net.
-     */
-    vtr::vector<ClusterNetId, int> net_start_indicies_;
 
-    ///@brief Number of levels in the binary tree.
-    size_t num_levels_ = 0;
-};

From 7d7d488d24650447d921c12f9530ff32b354bf40 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Mon, 2 Dec 2024 12:07:18 -0500
Subject: [PATCH 21/39] remove timing_place.h

---
 vpr/src/base/place_and_route.cpp              | 16 ----
 vpr/src/base/read_route.cpp                   |  3 +-
 vpr/src/place/analytic_placer.h               |  1 -
 .../critical_uniform_move_generator.cpp       |  2 +
 .../critical_uniform_move_generator.h         |  1 -
 .../feasible_region_move_generator.h          |  3 +-
 .../weighted_median_move_generator.h          |  1 -
 vpr/src/place/move_utils.cpp                  |  2 +-
 vpr/src/place/net_cost_handler.h              |  1 -
 vpr/src/place/placer.h                        |  1 -
 vpr/src/place/placer_state.h                  |  2 +-
 vpr/src/place/timing/PlacerCriticalities.h    | 17 ++++
 vpr/src/place/timing/PlacerTimingCosts.cpp    | 76 +++++++++++++++--
 vpr/src/place/timing/PlacerTimingCosts.h      | 81 +++----------------
 vpr/src/place/timing/timing_place.h           | 38 ---------
 15 files changed, 104 insertions(+), 141 deletions(-)
 delete mode 100644 vpr/src/place/timing/timing_place.h

diff --git a/vpr/src/base/place_and_route.cpp b/vpr/src/base/place_and_route.cpp
index 2ffeb26c240..7074d34662a 100644
--- a/vpr/src/base/place_and_route.cpp
+++ b/vpr/src/base/place_and_route.cpp
@@ -1,14 +1,9 @@
-#include <sys/types.h>
 
 #include <cstdio>
-#include <ctime>
-#include <climits>
 #include <cstdlib>
 #include <cmath>
 #include <algorithm>
 
-#include "vtr_util.h"
-#include "vtr_memory.h"
 #include "vtr_assert.h"
 #include "vtr_log.h"
 
@@ -16,7 +11,6 @@
 #include "vpr_utils.h"
 #include "vpr_error.h"
 #include "globals.h"
-#include "atom_netlist.h"
 #include "place_and_route.h"
 #include "place.h"
 #include "read_place.h"
@@ -24,21 +18,11 @@
 #include "route.h"
 #include "route_export.h"
 #include "draw.h"
-#include "stats.h"
-#include "check_route.h"
 #include "rr_graph.h"
-#include "net_delay.h"
-#include "timing_place.h"
 #include "read_xml_arch_file.h"
-#include "echo_files.h"
 #include "route_common.h"
-#include "place_macro.h"
-#include "power.h"
-#include "place_util.h"
 
 #include "RoutingDelayCalculator.h"
-#include "timing_info.h"
-#include "tatum/echo_writer.hpp"
 
 /******************* Subroutines local to this module ************************/
 
diff --git a/vpr/src/base/read_route.cpp b/vpr/src/base/read_route.cpp
index d2d3bc14d54..6ac9d099c4b 100644
--- a/vpr/src/base/read_route.cpp
+++ b/vpr/src/base/read_route.cpp
@@ -39,12 +39,12 @@
 #include "vpr_utils.h"
 #include "vpr_error.h"
 #include "place_and_route.h"
-#include "timing_place.h"
 #include "route_export.h"
 #include "echo_files.h"
 #include "route_common.h"
 #include "route_tree.h"
 #include "read_route.h"
+#include "d_ary_heap.h"
 
 #include "old_traceback.h"
 
@@ -212,7 +212,6 @@ static void process_nets(const Netlist<>& net_list, std::ifstream& fp, ClusterNe
         process_nodes(net_list, fp, inet, filename, lineno);
     }
     input_tokens.clear();
-    return;
 }
 
 static void process_nodes(const Netlist<>& net_list, std::ifstream& fp, ClusterNetId inet, const char* filename, int& lineno) {
diff --git a/vpr/src/place/analytic_placer.h b/vpr/src/place/analytic_placer.h
index b73b3486f57..b279b82e058 100644
--- a/vpr/src/place/analytic_placer.h
+++ b/vpr/src/place/analytic_placer.h
@@ -83,7 +83,6 @@
  */
 
 #    include "vpr_context.h"
-#    include "timing_place.h"
 #    include "PlacementDelayCalculator.h"
 
 /*
diff --git a/vpr/src/place/move_generators/critical_uniform_move_generator.cpp b/vpr/src/place/move_generators/critical_uniform_move_generator.cpp
index ab1039ae3d0..7d36889c2f6 100644
--- a/vpr/src/place/move_generators/critical_uniform_move_generator.cpp
+++ b/vpr/src/place/move_generators/critical_uniform_move_generator.cpp
@@ -1,4 +1,6 @@
+
 #include "critical_uniform_move_generator.h"
+
 #include "globals.h"
 #include "place_constraints.h"
 #include "placer_state.h"
diff --git a/vpr/src/place/move_generators/critical_uniform_move_generator.h b/vpr/src/place/move_generators/critical_uniform_move_generator.h
index dd4e5391474..68358552668 100644
--- a/vpr/src/place/move_generators/critical_uniform_move_generator.h
+++ b/vpr/src/place/move_generators/critical_uniform_move_generator.h
@@ -1,7 +1,6 @@
 #ifndef VPR_CRITICAL_UNIFORM_MOVE_GEN_H
 #define VPR_CRITICAL_UNIFORM_MOVE_GEN_H
 #include "move_generator.h"
-#include "timing_place.h"
 
 /**
  * @file 
diff --git a/vpr/src/place/move_generators/feasible_region_move_generator.h b/vpr/src/place/move_generators/feasible_region_move_generator.h
index 702f8bdd26c..75304a60fd6 100644
--- a/vpr/src/place/move_generators/feasible_region_move_generator.h
+++ b/vpr/src/place/move_generators/feasible_region_move_generator.h
@@ -1,10 +1,9 @@
 #ifndef VPR_FEASIBLE_REGION_MOVE_GEN_H
 #define VPR_FEASIBLE_REGION_MOVE_GEN_H
 #include "move_generator.h"
-#include "timing_place.h"
 
 /**
- * @brief Feasible Reion (FR) move genrator
+ * @brief Feasible Region (FR) move generator
  *
  * This move was originally defined by Chen et al . in "Simultaneous timing-driven placement and duplication", FPGA 2005
  *
diff --git a/vpr/src/place/move_generators/weighted_median_move_generator.h b/vpr/src/place/move_generators/weighted_median_move_generator.h
index a6041f13e87..7da4be46bf6 100644
--- a/vpr/src/place/move_generators/weighted_median_move_generator.h
+++ b/vpr/src/place/move_generators/weighted_median_move_generator.h
@@ -2,7 +2,6 @@
 #define VPR_WEIGHTED_MEDIAN_MOVE_GEN_H
 
 #include "move_generator.h"
-#include "timing_place.h"
 
 /**
  * @brief The weighted median move generator
diff --git a/vpr/src/place/move_utils.cpp b/vpr/src/place/move_utils.cpp
index 78623200f42..bab61cd0f6d 100644
--- a/vpr/src/place/move_utils.cpp
+++ b/vpr/src/place/move_utils.cpp
@@ -712,7 +712,7 @@ bool find_to_loc_uniform(t_logical_block_type_ptr type,
     //
     //Note that the range limit (rlim) is applied in a logical sense (i.e. 'compressed' grid space consisting
     //of the same block types, and not the physical grid space). This means, for example, that columns of 'rare'
-    //blocks (e.g. DSPs/RAMs) which are physically far appart but logically adjacent will be swappable even
+    //blocks (e.g. DSPs/RAMs) which are physically far apart but logically adjacent will be swappable even
     //at an rlim fo 1.
     //
     //This ensures that such blocks don't get locked down too early during placement (as would be the
diff --git a/vpr/src/place/net_cost_handler.h b/vpr/src/place/net_cost_handler.h
index 6436265dbda..9fad2757681 100644
--- a/vpr/src/place/net_cost_handler.h
+++ b/vpr/src/place/net_cost_handler.h
@@ -7,7 +7,6 @@
 #pragma once
 
 #include "place_delay_model.h"
-#include "timing_place.h"
 #include "move_transactions.h"
 #include "place_util.h"
 #include "vtr_ndoffsetmatrix.h"
diff --git a/vpr/src/place/placer.h b/vpr/src/place/placer.h
index 11924314c8b..3fb89fb20f3 100644
--- a/vpr/src/place/placer.h
+++ b/vpr/src/place/placer.h
@@ -20,7 +20,6 @@
 #include <memory>
 #include <optional>
 
-#include "timing_place.h"
 #include "place_checkpoint.h"
 #include "PlacementDelayCalculator.h"
 #include "placer_state.h"
diff --git a/vpr/src/place/placer_state.h b/vpr/src/place/placer_state.h
index 35f1ec73766..a6896a359e8 100644
--- a/vpr/src/place/placer_state.h
+++ b/vpr/src/place/placer_state.h
@@ -12,7 +12,7 @@
 #include "vpr_context.h"
 #include "vpr_net_pins_matrix.h"
 #include "vpr_types.h"
-#include "timing_place.h"
+#include "PlacerTimingCosts.h"
 
 /**
  * @brief State relating to the timing driven data.
diff --git a/vpr/src/place/timing/PlacerCriticalities.h b/vpr/src/place/timing/PlacerCriticalities.h
index 4a6c5518eb2..161423dba6a 100644
--- a/vpr/src/place/timing/PlacerCriticalities.h
+++ b/vpr/src/place/timing/PlacerCriticalities.h
@@ -57,6 +57,23 @@ struct PlaceCritParams {
  * passed to PlacerCriticalities::update_criticalites(). If the next update uses the same
  * exponent, criticalities can be incrementally updated. Otherwise, they must be re-calculated
  * from scratch, since a change in exponent changes *all* criticalities.
+ *
+ * Calculating criticalities:
+ * All the raw setup slack values across a single clock domain are gathered
+ * and rated from the best to the worst in terms of criticalities. In order
+ * to calculate criticalities, all the slack values need to be non-negative.
+ * Hence, if the worst slack is negative, all the slack values are shifted
+ * by the value of the worst slack so that the value is at least 0. If the
+ * worst slack is positive, then no shift happens.
+ *
+ * The best (shifted) slack (the most positive one) will have a criticality of 0.
+ * The worst (shifted) slack value will have a criticality of 1.
+ *
+ * Criticalities are used to calculate timing costs for each connection.
+ * The formula is cost = delay * criticality.
+ *
+ * For a more detailed description on how criticalities are calculated, see
+ * calc_relaxed_criticality() in `timing_util.cpp`.
  */
 class PlacerCriticalities {
   public: //Types
diff --git a/vpr/src/place/timing/PlacerTimingCosts.cpp b/vpr/src/place/timing/PlacerTimingCosts.cpp
index c7fe35fc020..d8ad6afafab 100644
--- a/vpr/src/place/timing/PlacerTimingCosts.cpp
+++ b/vpr/src/place/timing/PlacerTimingCosts.cpp
@@ -1,11 +1,10 @@
 
 #include "PlacerTimingCosts.h"
 
-
 PlacerTimingCosts::PlacerTimingCosts(const ClusteredNetlist& nlist) {
     auto nets = nlist.nets();
 
-    net_start_indicies_.resize(nets.size());
+    net_start_indices_.resize(nets.size());
 
     // Walk through the netlist to determine how many connections there are.
     size_t iconn = 0;
@@ -13,14 +12,14 @@ PlacerTimingCosts::PlacerTimingCosts(const ClusteredNetlist& nlist) {
         // The placer always skips 'ignored' nets, so they don't affect timing
         // costs, so we also skip them here
         if (nlist.net_is_ignored(net)) {
-            net_start_indicies_[net] = OPEN;
+            net_start_indices_[net] = OPEN;
             continue;
         }
 
         // Save the starting index of the current net's connections.
         // We use a -1 offset, since sinks indexed from [1..num_net_pins-1]
         // (there is no timing cost associated with net drivers)
-        net_start_indicies_[net] = iconn - 1;
+        net_start_indices_[net] = iconn - 1;
 
         // Reserve space for all this net's connections
         iconn += nlist.net_sinks(net).size();
@@ -55,6 +54,73 @@ PlacerTimingCosts::PlacerTimingCosts(const ClusteredNetlist& nlist) {
     for (ClusterNetId net : nets) {
         if (nlist.net_is_ignored(net)) continue;
 
-        net_start_indicies_[net] = net_start_indicies_[net] + num_intermediate_nodes;
+        net_start_indices_[net] = net_start_indices_[net] + num_intermediate_nodes;
+    }
+}
+
+double PlacerTimingCosts::total_cost_recurr(size_t inode) {
+    // Prune out-of-tree
+    if (inode > connection_costs_.size() - 1) {
+        return 0.;
+    }
+
+    //Valid pre-calculated intermediate result or valid leaf
+    if (!std::isnan(connection_costs_[inode])) {
+        return connection_costs_[inode];
+    }
+
+    //Recompute recursively
+    double node_cost = total_cost_recurr(left_child(inode))
+                       + total_cost_recurr(right_child(inode));
+
+    //Save intermediate cost at this node
+    connection_costs_[inode] = node_cost;
+
+    return node_cost;
+}
+
+double PlacerTimingCosts::total_cost_from_scratch(size_t inode) const {
+    // Prune out-of-tree
+    if (inode > connection_costs_.size() - 1) {
+        return 0.;
     }
+
+    //Recompute recursively
+    double node_cost = total_cost_from_scratch(left_child(inode))
+                       + total_cost_from_scratch(right_child(inode));
+
+    return node_cost;
 }
+
+void PlacerTimingCosts::invalidate(const double* invalidated_cost) {
+    //Check pointer within range of internal storage
+    VTR_ASSERT_SAFE_MSG(
+        invalidated_cost >= &connection_costs_[0],
+        "Connection cost pointer should be after start of internal storage");
+
+    VTR_ASSERT_SAFE_MSG(
+        invalidated_cost <= &connection_costs_[connection_costs_.size() - 1],
+        "Connection cost pointer should be before end of internal storage");
+
+    size_t icost = invalidated_cost - &connection_costs_[0];
+
+    VTR_ASSERT_SAFE(icost >= num_nodes_up_to_level(num_levels_ - 2));
+
+    //Invalidate parent intermediate costs up to root or first
+    //already-invalidated parent
+    size_t iparent = parent(icost);
+
+    while (!std::isnan(connection_costs_[iparent])) {
+        //Invalidate
+        connection_costs_[iparent] = std::numeric_limits<double>::quiet_NaN();
+
+        if (iparent == 0) {
+            break; //At root
+        } else {
+            //Next parent
+            iparent = parent(iparent);
+        }
+    }
+
+    VTR_ASSERT_SAFE_MSG(std::isnan(connection_costs_[0]), "Invalidating any connection should have invalidated the root");
+}
\ No newline at end of file
diff --git a/vpr/src/place/timing/PlacerTimingCosts.h b/vpr/src/place/timing/PlacerTimingCosts.h
index f84f4446466..67523b7de66 100644
--- a/vpr/src/place/timing/PlacerTimingCosts.h
+++ b/vpr/src/place/timing/PlacerTimingCosts.h
@@ -143,27 +143,27 @@ class PlacerTimingCosts {
 
     ///@brief Indexes into the specific net.
     NetProxy operator[](ClusterNetId net_id) {
-        VTR_ASSERT_SAFE(net_start_indicies_[net_id] >= 0);
+        VTR_ASSERT_SAFE(net_start_indices_[net_id] >= 0);
 
-        double* net_connection_costs = &connection_costs_[net_start_indicies_[net_id]];
+        double* net_connection_costs = &connection_costs_[net_start_indices_[net_id]];
         return NetProxy(this, net_connection_costs);
     }
 
     NetProxy operator[](ClusterNetId net_id) const {
-        VTR_ASSERT_SAFE(net_start_indicies_[net_id] >= 0);
+        VTR_ASSERT_SAFE(net_start_indices_[net_id] >= 0);
 
-        const double* net_connection_costs = &connection_costs_[net_start_indicies_[net_id]];
+        const double* net_connection_costs = &connection_costs_[net_start_indices_[net_id]];
         return NetProxy(const_cast<PlacerTimingCosts*>(this), const_cast<double*>(net_connection_costs));
     }
 
     void clear() {
         connection_costs_.clear();
-        net_start_indicies_.clear();
+        net_start_indices_.clear();
     }
 
     void swap(PlacerTimingCosts& other) {
         std::swap(connection_costs_, other.connection_costs_);
-        std::swap(net_start_indicies_, other.net_start_indicies_);
+        std::swap(net_start_indices_, other.net_start_indices_);
         std::swap(num_levels_, other.num_levels_);
     }
 
@@ -182,75 +182,14 @@ class PlacerTimingCosts {
 
   private:
     ///@brief Recursively calculate and update the timing cost rooted at inode.
-    double total_cost_recurr(size_t inode) {
-        //Prune out-of-tree
-        if (inode > connection_costs_.size() - 1) {
-            return 0.;
-        }
-
-        //Valid pre-calculated intermediate result or valid leaf
-        if (!std::isnan(connection_costs_[inode])) {
-            return connection_costs_[inode];
-        }
+    double total_cost_recurr(size_t inode);
 
-        //Recompute recursively
-        double node_cost = total_cost_recurr(left_child(inode))
-                           + total_cost_recurr(right_child(inode));
-
-        //Save intermediate cost at this node
-        connection_costs_[inode] = node_cost;
-
-        return node_cost;
-    }
-
-    double total_cost_from_scratch(size_t inode) const {
-        //Prune out-of-tree
-        if (inode > connection_costs_.size() - 1) {
-            return 0.;
-        }
-
-        //Recompute recursively
-        double node_cost = total_cost_from_scratch(left_child(inode))
-                           + total_cost_from_scratch(right_child(inode));
-
-        return node_cost;
-    }
+    double total_cost_from_scratch(size_t inode) const;
 
     ///@brief Friend-ed so it can call invalidate().
     friend ConnectionProxy;
 
-    void invalidate(const double* invalidated_cost) {
-        //Check pointer within range of internal storage
-        VTR_ASSERT_SAFE_MSG(
-            invalidated_cost >= &connection_costs_[0],
-            "Connection cost pointer should be after start of internal storage");
-
-        VTR_ASSERT_SAFE_MSG(
-            invalidated_cost <= &connection_costs_[connection_costs_.size() - 1],
-            "Connection cost pointer should be before end of internal storage");
-
-        size_t icost = invalidated_cost - &connection_costs_[0];
-
-        VTR_ASSERT_SAFE(icost >= num_nodes_up_to_level(num_levels_ - 2));
-
-        //Invalidate parent intermediate costs up to root or first
-        //already-invalidated parent
-        size_t iparent = parent(icost);
-
-        while (!std::isnan(connection_costs_[iparent])) {
-            //Invalidate
-            connection_costs_[iparent] = std::numeric_limits<double>::quiet_NaN();
-
-            if (iparent == 0) {
-                break; //At root
-            } else {
-                //Next parent
-                iparent = parent(iparent);
-            }
-        }
-
-        VTR_ASSERT_SAFE_MSG(std::isnan(connection_costs_[0]), "Invalidating any connection should have invalidated the root");
-    }
+    void invalidate(const double* invalidated_cost);
 
     size_t left_child(size_t i) const {
         return 2 * i + 1;
@@ -296,7 +235,7 @@ class PlacerTimingCosts {
      * @brief Vector storing the indices of the first connection
      *        for each net in the netlist, used for indexing by net.
      */
-    vtr::vector<ClusterNetId, int> net_start_indicies_;
+    vtr::vector<ClusterNetId, int> net_start_indices_;
 
     ///@brief Number of levels in the binary tree.
     size_t num_levels_ = 0;
diff --git a/vpr/src/place/timing/timing_place.h b/vpr/src/place/timing/timing_place.h
deleted file mode 100644
index 54641947803..00000000000
--- a/vpr/src/place/timing/timing_place.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * @file timing_place.h
- * @brief Interface used by the VPR placer to query information
- *        from the Tatum timing analyzer.
- *
- *   @class PlacerSetupSlacks
- *              Queries connection **RAW** setup slacks, which can
- *              range from negative to positive values. Also maps
- *              atom pin setup slacks to clb pin setup slacks.
- *   @class PlacerCriticalities
- *              Query connection criticalities, which are calculated
- *              based on the raw setup slacks and ranges from 0 to 1.
- *              Also maps atom pin crit. to clb pin crit.
- *   @class PlacerTimingCosts
- *              Hierarchical structure used by update_td_costs() to
- *              maintain the order of addition operation of float values
- *              (to avoid round-offs) while doing incremental updates.
- *
- * Calculating criticalities:
- *      All the raw setup slack values across a single clock domain are gathered
- *      and rated from the best to the worst in terms of criticalities. In order
- *      to calculate criticalities, all the slack values need to be non-negative.
- *      Hence, if the worst slack is negative, all the slack values are shifted
- *      by the value of the worst slack so that the value is at least 0. If the
- *      worst slack is positive, then no shift happens.
- *
- *      The best (shifted) slack (the most positive one) will have a criticality of 0.
- *      The worst (shifted) slack value will have a criticality of 1.
- *
- *      Criticalities are used to calculated timing costs for each connection.
- *      The formula is cost = delay * criticality.
- *
- *      For a more detailed description on how criticalities are calculated, see
- *      calc_relaxed_criticality() in `timing_util.cpp`.
- */
-
-
-

From 246498d610d065a4d7ce8fcf991dd77a225dfefa Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Thu, 5 Dec 2024 16:06:40 -0500
Subject: [PATCH 22/39] make some methods static in PlacerTimingCosts

---
 vpr/src/place/timing/PlacerTimingCosts.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vpr/src/place/timing/PlacerTimingCosts.h b/vpr/src/place/timing/PlacerTimingCosts.h
index 67523b7de66..5e1415581c3 100644
--- a/vpr/src/place/timing/PlacerTimingCosts.h
+++ b/vpr/src/place/timing/PlacerTimingCosts.h
@@ -191,15 +191,15 @@ class PlacerTimingCosts {
 
     void invalidate(const double* invalidated_cost);
 
-    size_t left_child(size_t i) const {
+    static size_t left_child(size_t i) {
         return 2 * i + 1;
     }
 
-    size_t right_child(size_t i) const {
+    static size_t right_child(size_t i) {
         return 2 * i + 2;
     }
 
-    size_t parent(size_t i) const {
+    static size_t parent(size_t i) {
         return (i - 1) / 2;
     }
 
@@ -209,12 +209,12 @@ class PlacerTimingCosts {
      * If ilevel is negative, return 0, since the root shouldn't
      * be counted as a leaf node candidate.
      */
-    size_t num_nodes_in_level(int ilevel) const {
+    static size_t num_nodes_in_level(int ilevel) {
         return ilevel < 0 ? 0 : (2 << (ilevel));
     }
 
     ///@brief Returns the total number of nodes in levels [0..ilevel] (inclusive).
-    size_t num_nodes_up_to_level(int ilevel) const {
+    static size_t num_nodes_up_to_level(int ilevel) {
         return (2 << (ilevel + 1)) - 1;
     }
 

From d579250abf88c0c7f4180decc045b0298b4e685a Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Thu, 5 Dec 2024 16:11:48 -0500
Subject: [PATCH 23/39] delete PlacementDelayModelCreator's constructor

---
 .../place/timing/delay_model/PlacementDelayModelCreator.h    | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h b/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h
index 37a8e0d51c8..c92b67d4854 100644
--- a/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h
+++ b/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h
@@ -16,9 +16,8 @@ struct t_direct_inf;
 
 class PlacementDelayModelCreator {
   public:
-    // nothing to do in the constructor and destructor
-    PlacementDelayModelCreator() = default;
-    ~PlacementDelayModelCreator() = default;
+    // not meant to be instantiated; the default constructor is deleted
+    PlacementDelayModelCreator() = delete;
 
     static std::unique_ptr<PlaceDelayModel> create_delay_model(const t_placer_opts& placer_opts,
                                                                const t_router_opts& router_opts,

From 864bd282676157a692bbdc1cd86c6629d04264f8 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Thu, 5 Dec 2024 16:58:20 -0500
Subject: [PATCH 24/39] remove one of the signatures of pick_from_block and
 pick_from_highly_critical_block

---
 libs/libarchfpga/src/physical_types_util.h |  2 +-
 vpr/src/place/move_utils.cpp               | 93 ++++------------------
 vpr/src/place/move_utils.h                 | 17 ----
 3 files changed, 18 insertions(+), 94 deletions(-)

diff --git a/libs/libarchfpga/src/physical_types_util.h b/libs/libarchfpga/src/physical_types_util.h
index d4d5dc55924..a081683faeb 100644
--- a/libs/libarchfpga/src/physical_types_util.h
+++ b/libs/libarchfpga/src/physical_types_util.h
@@ -212,7 +212,7 @@ int get_logical_block_physical_sub_tile_index(t_physical_tile_type_ptr physical_
                                               t_logical_block_type_ptr logical_block);
 /**
  * @brief Returns the physical pin index (within 'physical_tile') corresponding to the
- * logical index ('pin' of the first instance of 'logical_block' within the physcial tile.
+ * logical index ('pin' of the first instance of 'logical_block' within the physical tile.
  *
  * This function is called before/during placement, when a sub tile index was not yet assigned.
  *
diff --git a/vpr/src/place/move_utils.cpp b/vpr/src/place/move_utils.cpp
index bab61cd0f6d..601d2dea852 100644
--- a/vpr/src/place/move_utils.cpp
+++ b/vpr/src/place/move_utils.cpp
@@ -552,26 +552,19 @@ ClusterBlockId propose_block_to_move(const t_placer_opts& placer_opts,
                                      int* pin_from,
                                      const PlacerState& placer_state,
                                      vtr::RngContainer& rng) {
-    ClusterBlockId b_from = ClusterBlockId::INVALID();
     const auto& cluster_ctx = g_vpr_ctx.clustering();
 
-    if (logical_blk_type_index == -1) { //If the block type is unspecified, choose any random block to be swapped with another random block
-        if (highly_crit_block) {
-            b_from = pick_from_highly_critical_block(*net_from, *pin_from, placer_state, *placer_criticalities, rng);
-        } else {
-            b_from = pick_from_block(rng);
-        }
+    ClusterBlockId b_from = ClusterBlockId::INVALID();
 
-        //if a movable block found, set the block type
-        if (b_from) {
-            logical_blk_type_index = cluster_ctx.clb_nlist.block_type(b_from)->index;
-        }
-    } else { //If the block type is specified, choose a random block with blk_type to be swapped with another random block
-        if (highly_crit_block) {
-            b_from = pick_from_highly_critical_block(*net_from, *pin_from, logical_blk_type_index, placer_state, *placer_criticalities, rng);
-        } else {
-            b_from = pick_from_block(logical_blk_type_index, rng);
-        }
+    if (highly_crit_block) {
+        b_from = pick_from_highly_critical_block(*net_from, *pin_from, logical_blk_type_index, placer_state, *placer_criticalities, rng);
+    } else {
+        b_from = pick_from_block(logical_blk_type_index, rng);
+    }
+
+    //if a movable block found, set the block type
+    if (b_from) {
+        logical_blk_type_index = cluster_ctx.clb_nlist.block_type(b_from)->index;
     }
 
     if constexpr (VTR_ENABLE_DEBUG_LOGGING_CONST_EXPR) {
@@ -590,71 +583,20 @@ const std::vector<ClusterBlockId>& movable_blocks_per_type(const t_logical_block
     return place_ctx.movable_blocks_per_type[blk_type.index];
 }
 
-ClusterBlockId pick_from_block(vtr::RngContainer& rng) {
-    auto& place_ctx = g_vpr_ctx.placement();
-
-    // get the number of movable clustered blocks
-    const size_t n_movable_blocks = place_ctx.movable_blocks.size();
-
-    if (n_movable_blocks > 0) {
-        //Pick a movable block at random and return it
-        auto b_from = ClusterBlockId(rng.irand((int)n_movable_blocks - 1));
-        return b_from;
-    } else {
-        //No movable blocks found
-        return ClusterBlockId::INVALID();
-    }
-}
-
 ClusterBlockId pick_from_block(const int logical_blk_type_index, vtr::RngContainer& rng) {
-    auto& place_ctx = g_vpr_ctx.placement();
-
-    const auto& movable_blocks_of_type = place_ctx.movable_blocks_per_type[logical_blk_type_index];
-
-    if (movable_blocks_of_type.empty()) {
-        return ClusterBlockId::INVALID();
-    }
-
-    auto b_from = ClusterBlockId(movable_blocks_of_type[rng.irand((int)movable_blocks_of_type.size() - 1)]);
-
-    return b_from;
-}
-
-//Pick a random highly critical block to be swapped with another random block.
-//If none is found return ClusterBlockId::INVALID()
-ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from,
-                                               int& pin_from,
-                                               const PlacerState& placer_state,
-                                               const PlacerCriticalities& placer_criticalities,
-                                               vtr::RngContainer& rng) {
-    const auto& cluster_ctx = g_vpr_ctx.clustering();
-    const auto& block_locs = placer_state.block_locs();
-
-    //Initialize critical net and pin to be invalid
-    net_from = ClusterNetId::INVALID();
-    pin_from = -1;
+    const auto& place_ctx = g_vpr_ctx.placement();
 
-    const auto& highly_crit_pins = placer_criticalities.get_highly_critical_pins();
+    // if logical block type is specified, pick the 'from' block from block of that type; otherwise,
+    // pick it from all blocks
+    const auto& movable_blocks = (logical_blk_type_index < 0 )? place_ctx.movable_blocks : place_ctx.movable_blocks_per_type[logical_blk_type_index];
 
-    //check if any critical block is available
-    if (highly_crit_pins.empty()) {
+    if (movable_blocks.empty()) {
         return ClusterBlockId::INVALID();
     }
 
-    //pick a random highly critical pin and find the nets driver block
-    std::pair<ClusterNetId, int> crit_pin = highly_crit_pins[rng.irand(highly_crit_pins.size() - 1)];
-    ClusterBlockId b_from = cluster_ctx.clb_nlist.net_driver_block(crit_pin.first);
-
-    if (block_locs[b_from].is_fixed) {
-        return ClusterBlockId::INVALID(); //Block is fixed, cannot move
-    }
+    ClusterBlockId b_from = movable_blocks[rng.irand((int)movable_blocks.size() - 1)];
 
-    net_from = crit_pin.first;
-    pin_from = crit_pin.second;
     return b_from;
-
-    //Unreachable statement
-    return ClusterBlockId::INVALID();
 }
 
 //Pick a random highly critical block with a specified block type to be swapped with another random block.
@@ -686,7 +628,7 @@ ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from,
     //Check if picked block type matches with the blk_type specified, and it is not fixed
     //blk_type from propose move doesn't account for the EMPTY type
     auto b_from_type = cluster_ctx.clb_nlist.block_type(b_from);
-    if (b_from_type->index == logical_blk_type_index) {
+    if (b_from_type->index == logical_blk_type_index || logical_blk_type_index < 0) {
         if (block_locs[b_from].is_fixed) {
             return ClusterBlockId::INVALID(); //Block is fixed, cannot move
         }
@@ -697,7 +639,6 @@ ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from,
     }
 
     //No critical block with 'blk_type' found
-    //Unreachable statement
     return ClusterBlockId::INVALID();
 }
 
diff --git a/vpr/src/place/move_utils.h b/vpr/src/place/move_utils.h
index 1aa5591f5c8..2b3f8de0ce1 100644
--- a/vpr/src/place/move_utils.h
+++ b/vpr/src/place/move_utils.h
@@ -185,12 +185,6 @@ ClusterBlockId propose_block_to_move(const t_placer_opts& placer_opts,
  */
 const std::vector<ClusterBlockId>& movable_blocks_per_type(const t_logical_block_type& blk_type);
 
-/**
- * @brief Select a random block to be swapped with another block
- * 
- * @return BlockId of the selected block, ClusterBlockId::INVALID() if no block with specified block type found
- */
-ClusterBlockId pick_from_block(vtr::RngContainer& rng);
 
 /**
  * @brief Find a block with a specific block type to be swapped with another block
@@ -201,17 +195,6 @@ ClusterBlockId pick_from_block(vtr::RngContainer& rng);
  */
 ClusterBlockId pick_from_block(int logical_blk_type_index, vtr::RngContainer& rng);
 
-/**
- * @brief Select a random highly critical block to be swapped with another block
- * 
- * @return BlockId of the selected block, ClusterBlockId::INVALID() if no block with specified block type found
- */
-ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from,
-                                               int& pin_from,
-                                               const PlacerState& placer_state,
-                                               const PlacerCriticalities& placer_criticalities,
-                                               vtr::RngContainer& rng);
-
 /**
  * @brief Find a block with a specific block type to be swapped with another block
  *

From e3cad45d6ff576432a3d21157da5f4d539022dc2 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Thu, 5 Dec 2024 17:12:52 -0500
Subject: [PATCH 25/39] update comments for pick_from_block and
 pick_from_highly_critical_block

---
 vpr/src/place/move_utils.cpp |  6 ++----
 vpr/src/place/move_utils.h   | 12 ++++++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/vpr/src/place/move_utils.cpp b/vpr/src/place/move_utils.cpp
index 601d2dea852..d44c3611eca 100644
--- a/vpr/src/place/move_utils.cpp
+++ b/vpr/src/place/move_utils.cpp
@@ -586,8 +586,8 @@ const std::vector<ClusterBlockId>& movable_blocks_per_type(const t_logical_block
 ClusterBlockId pick_from_block(const int logical_blk_type_index, vtr::RngContainer& rng) {
     const auto& place_ctx = g_vpr_ctx.placement();
 
-    // if logical block type is specified, pick the 'from' block from block of that type; otherwise,
-    // pick it from all blocks
+    // if logical block type is specified, pick the 'from' block from blocks of that type;
+    // otherwise, select it randomly from all blocks
     const auto& movable_blocks = (logical_blk_type_index < 0 )? place_ctx.movable_blocks : place_ctx.movable_blocks_per_type[logical_blk_type_index];
 
     if (movable_blocks.empty()) {
@@ -599,8 +599,6 @@ ClusterBlockId pick_from_block(const int logical_blk_type_index, vtr::RngContain
     return b_from;
 }
 
-//Pick a random highly critical block with a specified block type to be swapped with another random block.
-//If none is found return ClusterBlockId::INVALID()
 ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from,
                                                int& pin_from,
                                                const int logical_blk_type_index,
diff --git a/vpr/src/place/move_utils.h b/vpr/src/place/move_utils.h
index 2b3f8de0ce1..ea9a90cc18d 100644
--- a/vpr/src/place/move_utils.h
+++ b/vpr/src/place/move_utils.h
@@ -189,18 +189,22 @@ const std::vector<ClusterBlockId>& movable_blocks_per_type(const t_logical_block
 /**
  * @brief Find a block with a specific block type to be swapped with another block
  *
- *  @param logical_blk_type_index: the agent type of the moving block.
+ * @param logical_blk_type_index The logical type of the moving block. If a negative value is passed,
+ * the block is selected randomly from all movable blocks and not from a specific type.
+ * @param rng A random number generator used to select a random block.
  * 
  * @return BlockId of the selected block, ClusterBlockId::INVALID() if no block with specified block type found
  */
 ClusterBlockId pick_from_block(int logical_blk_type_index, vtr::RngContainer& rng);
 
 /**
- * @brief Find a block with a specific block type to be swapped with another block
+ * @brief Find a highly critical block with a specific block type to be swapped with another block.
  *
- *  @param logical_blk_type_index: the agent type of the moving block.
+ * @param logical_blk_type_index The logical type of the moving block. If a negative value is passed,
+ * the block is selected randomly from all movable blocks and not from a specific type.
+ * @param rng A random number generator used to select a random highly critical block.
  * 
- * @return BlockId of the selected block, ClusterBlockId::INVALID() if no block with specified block type found
+ * @return BlockId of the selected block, ClusterBlockId::INVALID() if no block with specified block type found.
  */
 ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from,
                                                int& pin_from,

From b76b41ec27101be25368a00bc5530bfc50e3ecdf Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Wed, 15 Jan 2025 17:41:21 -0500
Subject: [PATCH 26/39] move  PlacerSetupSlacks::update_setup_slacks() doxygen
 comment from .cpp to .h file

---
 vpr/src/place/timing/PlacerSetupSlacks.cpp | 10 ----------
 vpr/src/place/timing/PlacerSetupSlacks.h   | 13 +++++++++++--
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/vpr/src/place/timing/PlacerSetupSlacks.cpp b/vpr/src/place/timing/PlacerSetupSlacks.cpp
index 3a097a582ff..18df9ed66d2 100644
--- a/vpr/src/place/timing/PlacerSetupSlacks.cpp
+++ b/vpr/src/place/timing/PlacerSetupSlacks.cpp
@@ -14,16 +14,6 @@ PlacerSetupSlacks::PlacerSetupSlacks(const ClusteredNetlist& clb_nlist,
     , timing_place_setup_slacks_(make_net_pins_matrix(clb_nlist_, std::numeric_limits<float>::quiet_NaN())) {
 }
 
-/**
- * @brief Updated the setup slacks in the timing_place_setup_slacks_ data structure.
- *
- * If the setup slacks are not updated immediately after each time we call
- * timing_info->update(), then timing_info->pins_with_modified_setup_slack()
- * cannot accurately account for all the pins that need to be updated.
- *
- * In this case, `recompute_required` would be true, and we update all setup slacks
- * from scratch.
- */
 void PlacerSetupSlacks::update_setup_slacks() {
     // If update is not enabled, exit the routine.
     if (!update_enabled) {
diff --git a/vpr/src/place/timing/PlacerSetupSlacks.h b/vpr/src/place/timing/PlacerSetupSlacks.h
index 7ffc450e94b..5248bdebc70 100644
--- a/vpr/src/place/timing/PlacerSetupSlacks.h
+++ b/vpr/src/place/timing/PlacerSetupSlacks.h
@@ -56,10 +56,19 @@ class PlacerSetupSlacks {
      * @brief Updates setup slacks based on the atom netlist setup slacks provided
      *        by timing_info_.
      *
+     *  @note This function updates the setup slacks in the timing_place_setup_slacks_
+     *  data structure.
+     *
      * Should consistently call this method after the most recent timing analysis to
      * keep the setup slacks stored in this class in sync with the timing analyzer.
-     * If out of sync, then the setup slacks cannot be incrementally updated on
-     * during the next timing analysis iteration.
+     * If out of sync, then the setup slacks cannot be incrementally updated during
+     * the next timing analysis iteration.
+     *
+     * If the setup slacks are not updated immediately after each time we call
+     * timing_info->update(), then timing_info->pins_with_modified_setup_slack()
+     * cannot accurately account for all the pins that need to be updated.
+     * In this case, `recompute_required` would be true, and we update all setup slacks
+     * from scratch.
      */
     void update_setup_slacks();
 

From ec00c1e03eeb4ec4cb6afe769689bf2d681abae6 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Wed, 15 Jan 2025 19:00:28 -0500
Subject: [PATCH 27/39] add comments to pick_from_highly_critical_block()

---
 vpr/src/place/move_utils.cpp | 8 +++++---
 vpr/src/place/move_utils.h   | 6 ++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/vpr/src/place/move_utils.cpp b/vpr/src/place/move_utils.cpp
index d44c3611eca..6e79bdaac4d 100644
--- a/vpr/src/place/move_utils.cpp
+++ b/vpr/src/place/move_utils.cpp
@@ -623,12 +623,14 @@ ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from,
     std::pair<ClusterNetId, int> crit_pin = highly_crit_pins[rng.irand(highly_crit_pins.size() - 1)];
     ClusterBlockId b_from = cluster_ctx.clb_nlist.net_driver_block(crit_pin.first);
 
-    //Check if picked block type matches with the blk_type specified, and it is not fixed
-    //blk_type from propose move doesn't account for the EMPTY type
     auto b_from_type = cluster_ctx.clb_nlist.block_type(b_from);
+
+    // check if the type of the picked block matches with the specified block type
+    // when a block type is specified, i.e. when logical_blk_type_index >= 0
     if (b_from_type->index == logical_blk_type_index || logical_blk_type_index < 0) {
+        // ensure that the selected block is not fixed
         if (block_locs[b_from].is_fixed) {
-            return ClusterBlockId::INVALID(); //Block is fixed, cannot move
+            return ClusterBlockId::INVALID();   // a fixed block can't be moved
         }
 
         net_from = crit_pin.first;
diff --git a/vpr/src/place/move_utils.h b/vpr/src/place/move_utils.h
index ea9a90cc18d..ba93014297a 100644
--- a/vpr/src/place/move_utils.h
+++ b/vpr/src/place/move_utils.h
@@ -200,8 +200,14 @@ ClusterBlockId pick_from_block(int logical_blk_type_index, vtr::RngContainer& rn
 /**
  * @brief Find a highly critical block with a specific block type to be swapped with another block.
  *
+ * @param net_from The clustered net id of the critical connection of the block selected by this function.
+ * To be filled by this function.
+ * @param pin_from The pin id of the critical connection of the block selected by this function.
+ * To be filled by this function.
  * @param logical_blk_type_index The logical type of the moving block. If a negative value is passed,
  * the block is selected randomly from all movable blocks and not from a specific type.
+ * @param placer_state Used to access the current placement's info, e.g. block locations and if they are fixed.
+ * @param placer_criticalities Holds the clustered netlist connection criticalities.
  * @param rng A random number generator used to select a random highly critical block.
  * 
  * @return BlockId of the selected block, ClusterBlockId::INVALID() if no block with specified block type found.

From 9933fc6dfc3f3f0997c08f4387a3938b20817e84 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Thu, 16 Jan 2025 11:23:48 -0500
Subject: [PATCH 28/39] make a paragraph in the big picture comment of
 PlacerCriticalities more clear

---
 vpr/src/place/timing/PlacerCriticalities.cpp | 10 ----------
 vpr/src/place/timing/PlacerCriticalities.h   | 21 +++++++++++++-------
 2 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/vpr/src/place/timing/PlacerCriticalities.cpp b/vpr/src/place/timing/PlacerCriticalities.cpp
index 1f2e4f518e9..4cbf1ec66ec 100644
--- a/vpr/src/place/timing/PlacerCriticalities.cpp
+++ b/vpr/src/place/timing/PlacerCriticalities.cpp
@@ -13,16 +13,6 @@ PlacerCriticalities::PlacerCriticalities(const ClusteredNetlist& clb_nlist,
     , timing_place_crit_(make_net_pins_matrix(clb_nlist_, std::numeric_limits<float>::quiet_NaN())) {
 }
 
-/**
- * @brief Updated the criticalities in the timing_place_crit_ data structure.
- *
- * If the criticalities are not updated immediately after each time we call
- * timing_info->update(), then timing_info->pins_with_modified_setup_criticality()
- * cannot accurately account for all the pins that need to be updated. In this case,
- * `recompute_required` would be true, and we update all criticalities from scratch.
- *
- * If the criticality exponent has changed, we also need to update from scratch.
- */
 void PlacerCriticalities::update_criticalities(const PlaceCritParams& crit_params) {
     // If update is not enabled, exit the routine.
     if (!update_enabled) {
diff --git a/vpr/src/place/timing/PlacerCriticalities.h b/vpr/src/place/timing/PlacerCriticalities.h
index 161423dba6a..c134f9af056 100644
--- a/vpr/src/place/timing/PlacerCriticalities.h
+++ b/vpr/src/place/timing/PlacerCriticalities.h
@@ -33,14 +33,15 @@ struct PlaceCritParams {
  *
  * This process can be done incrementally, based on the modified connections/AtomPinIds
  * returned by SetupTimingInfo. However, the set returned only reflects the connections
- * changed by the last call to the timing info update.
+ * changed by the last call to the timing info update (update_setup() method of SetupTimingInfo).
  *
- * Therefore, if SetupTimingInfo is updated twice in succession without criticalities
- * getting updated (update_enabled = false), the returned set cannot account for all
- * the connections that have been modified. In this case, we flag `recompute_required`
- * as false, and we recompute the criticalities for every connection to ensure that
- * they are all up to date. Hence, each time update_setup_slacks_and_criticalities()
- * is called, we assign `recompute_required` the opposite value of `update_enabled`.
+ * Therefore, if SetupTimingInfo is updated twice in a row without criticalities
+ * getting updated after the first update of SetupTimingInfo (PlacerCriticalities::update_enabled = false),
+ * the returned set of modified connections/AtomPinIds by SetupTimingInfo after its second update does not
+ * account for all the connections that have been modified.
+ * To address this issue, whenever update_criticalities() is called with flag update_enabled = false,
+ * we don't update criticalities and set flag recompute_required to true to remember that criticalities
+ * need to be recomputed from scratch in the first call to update_criticalities() with update_enabled = true.
  *
  * This class also maps/transforms the modified atom connections/pins returned by the
  * timing info into modified clustered netlist connections/pins after calling
@@ -115,6 +116,12 @@ class PlacerCriticalities {
      * keep the criticalities stored in this class in sync with the timing analyzer.
      * If out of sync, then the criticalities cannot be incrementally updated on
      * during the next timing analysis iteration.
+     *
+     * If the criticalities are not updated immediately after each time we call
+     * timing_info->update(), then timing_info->pins_with_modified_setup_criticality()
+     * cannot accurately account for all the pins that need to be updated. In this case,
+     * `recompute_required` would be true, and we update all criticalities from scratch.
+     * If the criticality exponent has changed, we also need to update from scratch.
      */
     void update_criticalities(const PlaceCritParams& crit_params);
 

From 463dd2f4b97d9c5ab67af3a12ae720dd091de588 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Thu, 16 Jan 2025 11:39:17 -0500
Subject: [PATCH 29/39] added parameter list to the doxygen comment of
 PlacerCriticalities constructor

---
 vpr/src/place/timing/PlacerCriticalities.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/vpr/src/place/timing/PlacerCriticalities.h b/vpr/src/place/timing/PlacerCriticalities.h
index c134f9af056..b03bda4eb87 100644
--- a/vpr/src/place/timing/PlacerCriticalities.h
+++ b/vpr/src/place/timing/PlacerCriticalities.h
@@ -86,7 +86,16 @@ class PlacerCriticalities {
 
   public: //Lifetime
 
-    ///@brief Allocates space for the timing_place_crit_ data structure.
+    /**
+     * @brief Allocates space for the timing_place_crit_ data structure.
+     * @param clb_nlist Used to lookup and iterate clustered netlist connections.
+     * @param netlist_pin_lookup Used to lookup Atom/Clustered pins connected to a Clustered/Atom pin.
+     * @param timing_info Holds setup timing info.
+     *
+     * @note timing_info may be shared by multiple objects with different lifetimes.
+     * To ensure timing_info is destroyed only after all its user objects are destructed,
+     * each user object should hold a shared_ptr to it.
+     */
     PlacerCriticalities(const ClusteredNetlist& clb_nlist,
                         const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
                         std::shared_ptr<const SetupTimingInfo> timing_info);

From 174b9a4a1e357e5db175e5bf57462ae3e8bd286b Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Thu, 16 Jan 2025 12:47:58 -0500
Subject: [PATCH 30/39] move DeltaDelayModel::read and DeltaDelayModel::write
 to its own file

---
 .../timing/delay_model/delta_delay_model.cpp  | 87 +++++++++++++++++++
 .../delay_model/override_delay_model.cpp      | 67 --------------
 2 files changed, 87 insertions(+), 67 deletions(-)

diff --git a/vpr/src/place/timing/delay_model/delta_delay_model.cpp b/vpr/src/place/timing/delay_model/delta_delay_model.cpp
index f4e202e7106..e8d56b09516 100644
--- a/vpr/src/place/timing/delay_model/delta_delay_model.cpp
+++ b/vpr/src/place/timing/delay_model/delta_delay_model.cpp
@@ -3,6 +3,14 @@
 
 #include "compute_delta_delays_utils.h"
 
+#ifdef VTR_ENABLE_CAPNPROTO
+#    include "capnp/serialize.h"
+#    include "place_delay_model.capnp.h"
+#    include "ndmatrix_serdes.h"
+#    include "mmap_file.h"
+#    include "serdes_utils.h"
+#endif  // VTR_ENABLE_CAPNPROTO
+
 void DeltaDelayModel::compute(RouterDelayProfiler& route_profiler,
                               const t_placer_opts& placer_opts,
                               const t_router_opts& router_opts,
@@ -46,3 +54,82 @@ void DeltaDelayModel::dump_echo(std::string filepath) const {
     vtr::fclose(f);
 }
 
+void DeltaDelayModel::read(const std::string& file) {
+#ifndef VTR_ENABLE_CAPNPROTO
+    VPR_THROW(VPR_ERROR_PLACE,
+              "DeltaDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
+#else
+
+    // MmapFile object creates an mmap of the specified path, and will munmap
+    // when the object leaves scope.
+    MmapFile f(file);
+
+    /* Increase reader limit to 1G words to allow for large files. */
+    ::capnp::ReaderOptions opts = default_large_capnp_opts();
+
+    // FlatArrayMessageReader is used to read the message from the data array
+    // provided by MmapFile.
+    ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
+
+    // When reading capnproto files the Reader object to use is named
+    // <schema name>::Reader.
+    //
+    // Initially this object is an empty VprDeltaDelayModel.
+    VprDeltaDelayModel::Reader model;
+
+    // The reader.getRoot performs a cast from the generic capnproto to fit
+    // with the specified schema.
+    //
+    // Note that capnproto does not validate that the incoming data matches the
+    // schema.  If this property is required, some form of check would be
+    // required.
+    model = reader.getRoot<VprDeltaDelayModel>();
+
+    auto toFloat = [](float* out, const VprFloatEntry::Reader& in) -> void {
+        *out = in.getValue();
+    };
+
+    // ToNdMatrix is a generic function for converting a Matrix capnproto
+    // to a vtr::NdMatrix.
+    //
+    // The user must supply the matrix dimension (4 in this case), the source
+    // capnproto type (VprFloatEntry),
+    // target C++ type (float), and a function to convert from the source capnproto
+    // type to the target C++ type (toFloat).
+    //
+    // The second argument should be of type Matrix<X>::Reader where X is the
+    // capnproto element type.
+    ToNdMatrix<4, VprFloatEntry, float>(&delays_, model.getDelays(), toFloat);
+#endif
+}
+
+void DeltaDelayModel::write(const std::string& file) const {
+#ifndef VTR_ENABLE_CAPNPROTO
+    VPR_THROW(VPR_ERROR_PLACE,
+              "DeltaDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
+#else
+
+    // MallocMessageBuilder object is the generic capnproto message builder,
+    // using malloc for buffer allocation.
+    ::capnp::MallocMessageBuilder builder;
+
+    // initRoot<X> returns a X::Builder object that can be used to set the
+    // fields in the message.
+    auto model = builder.initRoot<VprDeltaDelayModel>();
+
+    auto fromFloat = [](VprFloatEntry::Builder* out, const float& in) -> void {
+        out->setValue(in);
+    };
+
+    // FromNdMatrix is a generic function for converting a vtr::NdMatrix to a
+    // Matrix message.  It is the mirror function of ToNdMatrix described in
+    // read above.
+    auto delay_values = model.getDelays();
+    FromNdMatrix<4, VprFloatEntry, float>(&delay_values, delays_, fromFloat);
+
+    // writeMessageToFile writes message to the specified file.
+    writeMessageToFile(file, &builder);
+#endif
+}
diff --git a/vpr/src/place/timing/delay_model/override_delay_model.cpp b/vpr/src/place/timing/delay_model/override_delay_model.cpp
index ed1e53fc58a..1135f85533b 100644
--- a/vpr/src/place/timing/delay_model/override_delay_model.cpp
+++ b/vpr/src/place/timing/delay_model/override_delay_model.cpp
@@ -204,19 +204,10 @@ void OverrideDelayModel::set_base_delay_model(std::unique_ptr<DeltaDelayModel> b
  * VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception instead.
  */
 #ifndef VTR_ENABLE_CAPNPROTO
-
 #    define DISABLE_ERROR                              \
         "is disable because VTR_ENABLE_CAPNPROTO=OFF." \
         "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable."
 
-void DeltaDelayModel::read(const std::string& /*file*/) {
-    VPR_THROW(VPR_ERROR_PLACE, "DeltaDelayModel::read " DISABLE_ERROR);
-}
-
-void DeltaDelayModel::write(const std::string& /*file*/) const {
-    VPR_THROW(VPR_ERROR_PLACE, "DeltaDelayModel::write " DISABLE_ERROR);
-}
-
 void OverrideDelayModel::read(const std::string& /*file*/) {
     VPR_THROW(VPR_ERROR_PLACE, "OverrideDelayModel::read " DISABLE_ERROR);
 }
@@ -237,64 +228,6 @@ static void FromFloat(VprFloatEntry::Builder* out, const float& in) {
     out->setValue(in);
 }
 
-void DeltaDelayModel::read(const std::string& file) {
-    // MmapFile object creates an mmap of the specified path, and will munmap
-    // when the object leaves scope.
-    MmapFile f(file);
-
-    /* Increase reader limit to 1G words to allow for large files. */
-    ::capnp::ReaderOptions opts = default_large_capnp_opts();
-
-    // FlatArrayMessageReader is used to read the message from the data array
-    // provided by MmapFile.
-    ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
-
-    // When reading capnproto files the Reader object to use is named
-    // <schema name>::Reader.
-    //
-    // Initially this object is an empty VprDeltaDelayModel.
-    VprDeltaDelayModel::Reader model;
-
-    // The reader.getRoot performs a cast from the generic capnproto to fit
-    // with the specified schema.
-    //
-    // Note that capnproto does not validate that the incoming data matches the
-    // schema.  If this property is required, some form of check would be
-    // required.
-    model = reader.getRoot<VprDeltaDelayModel>();
-
-    // ToNdMatrix is a generic function for converting a Matrix capnproto
-    // to a vtr::NdMatrix.
-    //
-    // The user must supply the matrix dimension (2 in this case), the source
-    // capnproto type (VprFloatEntry),
-    // target C++ type (flat), and a function to convert from the source capnproto
-    // type to the target C++ type (ToFloat).
-    //
-    // The second argument should be of type Matrix<X>::Reader where X is the
-    // capnproto element type.
-    ToNdMatrix<4, VprFloatEntry, float>(&delays_, model.getDelays(), ToFloat);
-}
-
-void DeltaDelayModel::write(const std::string& file) const {
-    // MallocMessageBuilder object is the generate capnproto message builder,
-    // using malloc for buffer allocation.
-    ::capnp::MallocMessageBuilder builder;
-
-    // initRoot<X> returns a X::Builder object that can be used to set the
-    // fields in the message.
-    auto model = builder.initRoot<VprDeltaDelayModel>();
-
-    // FromNdMatrix is a generic function for converting a vtr::NdMatrix to a
-    // Matrix message.  It is the mirror function of ToNdMatrix described in
-    // read above.
-    auto delay_values = model.getDelays();
-    FromNdMatrix<4, VprFloatEntry, float>(&delay_values, delays_, FromFloat);
-
-    // writeMessageToFile writes message to the specified file.
-    writeMessageToFile(file, &builder);
-}
-
 void OverrideDelayModel::read(const std::string& file) {
     MmapFile f(file);
 

From 5be891c4b65251f27e86f52d32196dc93821ee87 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Thu, 16 Jan 2025 12:54:59 -0500
Subject: [PATCH 31/39] move #ifndef VTR_ENABLE_CAPNPROTO to inside function
 defs instead of defining them multiple times

---
 .../delay_model/override_delay_model.cpp      | 54 ++++++++-----------
 .../timing/delay_model/simple_delay_model.cpp | 31 ++++-------
 2 files changed, 33 insertions(+), 52 deletions(-)

diff --git a/vpr/src/place/timing/delay_model/override_delay_model.cpp b/vpr/src/place/timing/delay_model/override_delay_model.cpp
index 1135f85533b..61acd2937b5 100644
--- a/vpr/src/place/timing/delay_model/override_delay_model.cpp
+++ b/vpr/src/place/timing/delay_model/override_delay_model.cpp
@@ -199,45 +199,25 @@ void OverrideDelayModel::set_base_delay_model(std::unique_ptr<DeltaDelayModel> b
     base_delay_model_ = std::move(base_delay_model_obj);
 }
 
-/**
- * When writing capnp targetted serialization, always allow compilation when
- * VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception instead.
- */
-#ifndef VTR_ENABLE_CAPNPROTO
-#    define DISABLE_ERROR                              \
-        "is disable because VTR_ENABLE_CAPNPROTO=OFF." \
-        "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable."
-
-void OverrideDelayModel::read(const std::string& /*file*/) {
-    VPR_THROW(VPR_ERROR_PLACE, "OverrideDelayModel::read " DISABLE_ERROR);
-}
-
-void OverrideDelayModel::write(const std::string& /*file*/) const {
-    VPR_THROW(VPR_ERROR_PLACE, "OverrideDelayModel::write " DISABLE_ERROR);
-}
-
-#else /* VTR_ENABLE_CAPNPROTO */
-
-static void ToFloat(float* out, const VprFloatEntry::Reader& in) {
-    // Getting a scalar field is always "get<field name>()".
-    *out = in.getValue();
-}
-
-static void FromFloat(VprFloatEntry::Builder* out, const float& in) {
-    // Setting a scalar field is always "set<field name>(value)".
-    out->setValue(in);
-}
-
 void OverrideDelayModel::read(const std::string& file) {
+#ifndef VTR_ENABLE_CAPNPROTO
+    VPR_THROW(VPR_ERROR_PLACE,
+          "OverrideDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
+          "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
+#else
     MmapFile f(file);
 
     /* Increase reader limit to 1G words to allow for large files. */
     ::capnp::ReaderOptions opts = default_large_capnp_opts();
     ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
 
+    auto toFloat = [](float* out, const VprFloatEntry::Reader& in) -> void {
+        *out = in.getValue();
+    };
+
     vtr::NdMatrix<float, 4> delays;
     auto model = reader.getRoot<VprOverrideDelayModel>();
-    ToNdMatrix<4, VprFloatEntry, float>(&delays, model.getDelays(), ToFloat);
+    ToNdMatrix<4, VprFloatEntry, float>(&delays, model.getDelays(), toFloat);
 
     base_delay_model_ = std::make_unique<DeltaDelayModel>(cross_layer_delay_, delays, is_flat_);
 
@@ -258,14 +238,24 @@ void OverrideDelayModel::read(const std::string& file) {
     }
 
     delay_overrides_ = vtr::make_flat_map2(std::move(overrides_arr));
+#endif
 }
 
 void OverrideDelayModel::write(const std::string& file) const {
+#ifndef VTR_ENABLE_CAPNPROTO
+    VPR_THROW(VPR_ERROR_PLACE,
+              "OverrideDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.\");
+#else
     ::capnp::MallocMessageBuilder builder;
     auto model = builder.initRoot<VprOverrideDelayModel>();
 
+    auto fromFloat = [](VprFloatEntry::Builder* out, const float& in) -> void {
+        out->setValue(in);
+    };
+
     auto delays = model.getDelays();
-    FromNdMatrix<4, VprFloatEntry, float>(&delays, base_delay_model_->delays(), FromFloat);
+    FromNdMatrix<4, VprFloatEntry, float>(&delays, base_delay_model_->delays(), fromFloat);
 
     // Non-scalar capnproto fields should be first initialized with
     // init<field  name>(count), and then accessed from the returned
@@ -285,6 +275,6 @@ void OverrideDelayModel::write(const std::string& file) const {
     }
 
     writeMessageToFile(file, &builder);
+#endif
 }
 
-#endif
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/simple_delay_model.cpp b/vpr/src/place/timing/delay_model/simple_delay_model.cpp
index 445c7e81847..1fcd86eca64 100644
--- a/vpr/src/place/timing/delay_model/simple_delay_model.cpp
+++ b/vpr/src/place/timing/delay_model/simple_delay_model.cpp
@@ -51,26 +51,12 @@ float SimpleDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pi
     return delays_[from_tile_idx][from_loc.layer_num][to_loc.layer_num][delta_x][delta_y];
 }
 
-/**
- * When writing capnp targetted serialization, always allow compilation when
- * VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception instead.
- */
+void SimpleDelayModel::read(const std::string& file) {
 #ifndef VTR_ENABLE_CAPNPROTO
-
-#    define DISABLE_ERROR                              \
-        "is disable because VTR_ENABLE_CAPNPROTO=OFF." \
-        "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable."
-
-void SimpleDelayModel::read(const std::string& /*file*/) {
-    VPR_THROW(VPR_ERROR_PLACE, "SimpleDelayModel::read " DISABLE_ERROR);
-}
-
-void SimpleDelayModel::write(const std::string& /*file*/) const {
-    VPR_THROW(VPR_ERROR_PLACE, "SimpleDelayModel::write " DISABLE_ERROR);
-}
+    VPR_THROW(VPR_ERROR_PLACE,
+              "SimpleDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.\");
 #else
-
-void SimpleDelayModel::read(const std::string& file) {
     // MmapFile object creates an mmap of the specified path, and will munmap
     // when the object leaves scope.
     MmapFile f(file);
@@ -111,9 +97,15 @@ void SimpleDelayModel::read(const std::string& file) {
     // The second argument should be of type Matrix<X>::Reader where X is the
     // capnproto element type.
     ToNdMatrix<5, VprFloatEntry, float>(&delays_, model.getDelays(), toFloat);
+#endif
 }
 
 void SimpleDelayModel::write(const std::string& file) const {
+#ifndef VTR_ENABLE_CAPNPROTO
+    VPR_THROW(VPR_ERROR_PLACE,
+              "SimpleDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.\");
+#else
     // MallocMessageBuilder object generates capnproto message builder,
     // using malloc for buffer allocation.
     ::capnp::MallocMessageBuilder builder;
@@ -134,6 +126,5 @@ void SimpleDelayModel::write(const std::string& file) const {
 
     // writeMessageToFile writes message to the specified file.
     writeMessageToFile(file, &builder);
+#endif
 }
-
-#endif
\ No newline at end of file

From 7ef8c397115004adbbc2687891395f6d7e43d1ea Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Thu, 16 Jan 2025 13:08:45 -0500
Subject: [PATCH 32/39] move delay_model directory from place/timing to under
 place/

---
 .../PlacementDelayModelCreator.cpp            |  80 ++
 .../delay_model/PlacementDelayModelCreator.h  |  30 +
 .../compute_delta_delays_utils.cpp            | 968 ++++++++++++++++++
 .../delay_model/compute_delta_delays_utils.h  |  56 +
 .../place/delay_model/delta_delay_model.cpp   | 135 +++
 vpr/src/place/delay_model/delta_delay_model.h |  47 +
 .../delay_model/override_delay_model.cpp      | 280 +++++
 .../place/delay_model/override_delay_model.h  | 112 ++
 .../place/delay_model/place_delay_model.cpp   |  78 ++
 vpr/src/place/delay_model/place_delay_model.h |  80 ++
 .../place/delay_model/simple_delay_model.cpp  | 130 +++
 .../place/delay_model/simple_delay_model.h    |  39 +
 .../delay_model/override_delay_model.cpp      |   2 +-
 .../timing/delay_model/simple_delay_model.cpp |   4 +-
 14 files changed, 2038 insertions(+), 3 deletions(-)
 create mode 100644 vpr/src/place/delay_model/PlacementDelayModelCreator.cpp
 create mode 100644 vpr/src/place/delay_model/PlacementDelayModelCreator.h
 create mode 100644 vpr/src/place/delay_model/compute_delta_delays_utils.cpp
 create mode 100644 vpr/src/place/delay_model/compute_delta_delays_utils.h
 create mode 100644 vpr/src/place/delay_model/delta_delay_model.cpp
 create mode 100644 vpr/src/place/delay_model/delta_delay_model.h
 create mode 100644 vpr/src/place/delay_model/override_delay_model.cpp
 create mode 100644 vpr/src/place/delay_model/override_delay_model.h
 create mode 100644 vpr/src/place/delay_model/place_delay_model.cpp
 create mode 100644 vpr/src/place/delay_model/place_delay_model.h
 create mode 100644 vpr/src/place/delay_model/simple_delay_model.cpp
 create mode 100644 vpr/src/place/delay_model/simple_delay_model.h

diff --git a/vpr/src/place/delay_model/PlacementDelayModelCreator.cpp b/vpr/src/place/delay_model/PlacementDelayModelCreator.cpp
new file mode 100644
index 00000000000..3482cd091e0
--- /dev/null
+++ b/vpr/src/place/delay_model/PlacementDelayModelCreator.cpp
@@ -0,0 +1,80 @@
+
+
+#include "PlacementDelayModelCreator.h"
+
+#include "place_delay_model.h"
+#include "simple_delay_model.h"
+#include "delta_delay_model.h"
+#include "override_delay_model.h"
+
+#include "vtr_time.h"
+#include "physical_types.h"
+#include "place_and_route.h"
+
+static int get_longest_segment_length(std::vector<t_segment_inf>& segment_inf) {
+    int length = 0;
+
+    for (const t_segment_inf& seg_info : segment_inf) {
+        if (seg_info.length > length) {
+            length = seg_info.length;
+        }
+    }
+
+    return length;
+}
+
+std::unique_ptr<PlaceDelayModel>
+PlacementDelayModelCreator::create_delay_model(const t_placer_opts& placer_opts,
+                                               const t_router_opts& router_opts,
+                                               const Netlist<>& net_list,
+                                               t_det_routing_arch* det_routing_arch,
+                                               std::vector<t_segment_inf>& segment_inf,
+                                               t_chan_width_dist chan_width_dist,
+                                               const std::vector<t_direct_inf>& directs,
+                                               bool is_flat) {
+    vtr::ScopedStartFinishTimer timer("Computing placement delta delay look-up");
+
+    t_chan_width chan_width = setup_chan_width(router_opts, chan_width_dist);
+
+    alloc_routing_structs(chan_width, router_opts, det_routing_arch, segment_inf, directs, is_flat);
+
+    const RouterLookahead* router_lookahead = get_cached_router_lookahead(*det_routing_arch,
+                                                                          router_opts.lookahead_type,
+                                                                          router_opts.write_router_lookahead,
+                                                                          router_opts.read_router_lookahead,
+                                                                          segment_inf,
+                                                                          is_flat);
+
+    RouterDelayProfiler route_profiler(net_list, router_lookahead, is_flat);
+
+    int longest_length = get_longest_segment_length(segment_inf);
+
+    // now setup and compute the actual arrays
+    std::unique_ptr<PlaceDelayModel> place_delay_model;
+    float min_cross_layer_delay = get_min_cross_layer_delay();
+
+    if (placer_opts.delay_model_type == PlaceDelayModelType::SIMPLE) {
+        place_delay_model = std::make_unique<SimpleDelayModel>();
+    } else if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA) {
+        place_delay_model = std::make_unique<DeltaDelayModel>(min_cross_layer_delay, is_flat);
+    } else if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA_OVERRIDE) {
+        place_delay_model = std::make_unique<OverrideDelayModel>(min_cross_layer_delay, is_flat);
+    } else {
+        VTR_ASSERT_MSG(false, "Invalid placer delay model");
+    }
+
+    if (placer_opts.read_placement_delay_lookup.empty()) {
+        place_delay_model->compute(route_profiler, placer_opts, router_opts, longest_length);
+    } else {
+        place_delay_model->read(placer_opts.read_placement_delay_lookup);
+    }
+
+    if (!placer_opts.write_placement_delay_lookup.empty()) {
+        place_delay_model->write(placer_opts.write_placement_delay_lookup);
+    }
+
+    // free all data structures that are no longer needed
+    free_routing_structs();
+
+    return place_delay_model;
+}
\ No newline at end of file
diff --git a/vpr/src/place/delay_model/PlacementDelayModelCreator.h b/vpr/src/place/delay_model/PlacementDelayModelCreator.h
new file mode 100644
index 00000000000..c92b67d4854
--- /dev/null
+++ b/vpr/src/place/delay_model/PlacementDelayModelCreator.h
@@ -0,0 +1,30 @@
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "netlist.h"
+
+class PlaceDelayModel;
+struct t_placer_opts;
+struct t_router_opts;
+struct t_det_routing_arch;
+struct t_segment_inf;
+struct t_chan_width_dist;
+struct t_direct_inf;
+
+class PlacementDelayModelCreator {
+  public:
+    // static-only utility class: the constructor is deleted so it cannot be instantiated
+    PlacementDelayModelCreator() = delete;
+
+    static std::unique_ptr<PlaceDelayModel> create_delay_model(const t_placer_opts& placer_opts,
+                                                               const t_router_opts& router_opts,
+                                                               const Netlist<>& net_list,
+                                                               t_det_routing_arch* det_routing_arch,
+                                                               std::vector<t_segment_inf>& segment_inf,
+                                                               t_chan_width_dist chan_width_dist,
+                                                               const std::vector<t_direct_inf>& directs,
+                                                               bool is_flat);
+};
diff --git a/vpr/src/place/delay_model/compute_delta_delays_utils.cpp b/vpr/src/place/delay_model/compute_delta_delays_utils.cpp
new file mode 100644
index 00000000000..725159406c0
--- /dev/null
+++ b/vpr/src/place/delay_model/compute_delta_delays_utils.cpp
@@ -0,0 +1,968 @@
+
+#include "compute_delta_delays_utils.h"
+
+#include "vtr_time.h"
+#include "vtr_math.h"
+#include "physical_types.h"
+#include "globals.h"
+#include "router_delay_profiling.h"
+
+/// Indicates the delta delay value has not been calculated
+static constexpr float UNINITIALIZED_DELTA = -1;
+/// Indicates delta delay from/to an EMPTY block
+static constexpr float EMPTY_DELTA = -2;
+/// Indicates there is no valid delta delay
+static constexpr float IMPOSSIBLE_DELTA = std::numeric_limits<float>::infinity();
+
+static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_profiler,
+                                                    const t_placer_opts& placer_opts,
+                                                    const t_router_opts& router_opts,
+                                                    bool measure_directconnect,
+                                                    size_t longest_length,
+                                                    bool is_flat);
+
+static void fix_empty_coordinates(vtr::NdMatrix<float, 4>& delta_delays);
+
+static void fill_impossible_coordinates(vtr::NdMatrix<float, 4>& delta_delays);
+
+static bool verify_delta_delays(const vtr::NdMatrix<float, 4>& delta_delays);
+
+static void generic_compute_matrix_iterative_astar(RouterDelayProfiler& route_profiler,
+                                                   vtr::Matrix<std::vector<float>>& matrix,
+                                                   int from_layer_num,
+                                                   int to_layer_num,
+                                                   int source_x,
+                                                   int source_y,
+                                                   int start_x,
+                                                   int start_y,
+                                                   int end_x,
+                                                   int end_y,
+                                                   const t_router_opts& router_opts,
+                                                   bool measure_directconnect,
+                                                   const std::set<std::string>& allowed_types,
+                                                   bool /*is_flat*/);
+
+static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& route_profiler,
+                                                      vtr::Matrix<std::vector<float>>& matrix,
+                                                      int from_layer_num,
+                                                      int to_layer_num,
+                                                      int source_x,
+                                                      int source_y,
+                                                      int start_x,
+                                                      int start_y,
+                                                      int end_x,
+                                                      int end_y,
+                                                      const t_router_opts& router_opts,
+                                                      bool measure_directconnect,
+                                                      const std::set<std::string>& allowed_types,
+                                                      bool is_flat);
+
+/**
+ * @brief Routes between a source and sink location to calculate the delay.
+ *
+ * This function computes the delay of a routed connection between a source and sink node
+ * specified by their coordinates and layers. It iterates over the best driver and sink pin
+ * classes to find a valid routing path and calculates the delay if a path exists.
+ *
+ * @param route_profiler Reference to the `RouterDelayProfiler` responsible for calculating routing delays.
+ * @param source_x The x-coordinate of the source location.
+ * @param source_y The y-coordinate of the source location.
+ * @param source_layer The layer index of the source node.
+ * @param sink_x The x-coordinate of the sink location.
+ * @param sink_y The y-coordinate of the sink location.
+ * @param sink_layer The layer index of the sink node.
+ * @param router_opts Routing options used for delay calculation.
+ * @param measure_directconnect If `true`, includes direct connect delays; otherwise, skips direct connections.
+ *
+ * @return The calculated routing delay. If routing fails, it returns `IMPOSSIBLE_DELTA`.
+ */
+static float route_connection_delay(RouterDelayProfiler& route_profiler,
+                                    int source_x,
+                                    int source_y,
+                                    int source_layer,
+                                    int sink_x,
+                                    int sink_y,
+                                    int sink_layer,
+                                    const t_router_opts& router_opts,
+                                    bool measure_directconnect);
+
+/**
+ * @brief Computes a reduced value from a vector of delay values using the specified reduction method.
+ *
+ * @param delays A reference to a vector of delay values. This vector may be modified
+ *               (e.g., sorted) depending on the reducer used.
+ * @param reducer The reduction method to be applied.
+ *
+ * @return The reduced delay value. If the input vector is empty, the function
+ *         returns `IMPOSSIBLE_DELTA`.
+ *
+ * @throws VPR_FATAL_ERROR if the reducer is unrecognized.
+ */
+static float delay_reduce(std::vector<float>& delays, e_reducer reducer);
+
+/**
+ * @brief Adds a delay value to a 2D matrix of delay vectors.
+ *
+ * Updates the delay vector at position (`delta_x`, `delta_y`) in the matrix.
+ * If the element contains only `EMPTY_DELTA`, it is replaced with the new delay;
+ * otherwise, the delay is appended to the vector.
+ *
+ * @param matrix A 2D matrix of delay vectors.
+ * @param delta_x The x-index in the matrix.
+ * @param delta_y The y-index in the matrix.
+ * @param delay The delay value to add.
+ */
+static void add_delay_to_matrix(vtr::Matrix<std::vector<float>>& matrix,
+                                int delta_x,
+                                int delta_y,
+                                float delay);
+
+/**
+ * @brief Computes the average delay for a routing span.
+ *
+ * This function calculates the average placement delay for a routing span starting from a
+ * given layer and spanning a region defined by delta x and delta y. It iteratively searches
+ * for valid delay values within an expanding neighborhood (starting from a distance of 1)
+ * around the specified delta offsets and layer, until valid values are found or
+ * the maximum search distance (`max_distance`) is reached.
+ *
+ * @param matrix A 4D matrix of delay values indexed by `[from_layer][to_layer][delta_x][delta_y]`.
+ * @param from_layer The starting layer index of the routing span.
+ * @param to_tile_loc A structure holding the delta offsets (`x` and `y`) and the target layer index (`layer_num`).
+ * @param max_distance The maximum neighborhood distance to search for valid delay values.
+ *
+ * @return The average of valid delay values within the search range. If no valid delays
+ *         are found up to the maximum distance, the function returns `IMPOSSIBLE_DELTA`.
+ *
+ * @note The function performs a Manhattan-distance-based neighborhood search around the target location.
+ */
+static float find_neighboring_average(vtr::NdMatrix<float, 4>& matrix,
+                                      int from_layer,
+                                      t_physical_tile_loc to_tile_loc,
+                                      int max_distance);
+
+/***************************************************************************************/
+
+static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_profiler,
+                                                    const t_placer_opts& placer_opts,
+                                                    const t_router_opts& router_opts,
+                                                    bool measure_directconnect,
+                                                    size_t longest_length,
+                                                    bool is_flat) {
+    // Samples delays from several source locations per layer pair, then reduces the
+    // samples of every (from_layer, to_layer, |dx|, |dy|) entry with delay_model_reducer.
+    const auto& device_ctx = g_vpr_ctx.device();
+    const auto& grid = device_ctx.grid;
+
+    const size_t num_layers = grid.get_num_layers();
+    const size_t device_width = grid.width();
+    const size_t device_height = grid.height();
+
+    /* To avoid edge effects we place the source at least 'longest_length' away
+     * from the device edge and route from there for all possible delta values < dimension
+     */
+
+    //   +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+    //   +                 |                       |               +
+    //   +        A        |           B           |       C       +
+    //   +                 |                       |               +
+    //   +-----------------\-----------------------.---------------+
+    //   +                 |                       |               +
+    //   +                 |                       |               +
+    //   +                 |                       |               +
+    //   +                 |                       |               +
+    //   +        D        |           E           |       F       +
+    //   +                 |                       |               +
+    //   +                 |                       |               +
+    //   +                 |                       |               +
+    //   +                 |                       |               +
+    //   +-----------------*-----------------------/---------------+
+    //   +                 |                       |               +
+    //   +        G        |           H           |       I       +
+    //   +                 |                       |               +
+    //   +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+    //
+    //   * = (low_x, low_y)
+    //   . = (high_x, high_y)
+    //   / = (high_x, low_y)
+    //   \ = (low_x, high_y)
+    //   + = device edge
+    const size_t mid_x = vtr::nint(device_width / 2);
+    const size_t mid_y = vtr::nint(device_height / 2);
+    const size_t low_x = std::min(longest_length, mid_x);
+    const size_t low_y = std::min(longest_length, mid_y);
+    const size_t high_x = (longest_length <= device_width)  ? std::max(device_width - longest_length, mid_x) : mid_x;
+    const size_t high_y = (longest_length <= device_height) ? std::max(device_height - longest_length, mid_y) : mid_y;
+
+    vtr::NdMatrix<float, 4> delta_delays({num_layers, num_layers, device_width, device_height});
+
+    std::set<std::string> allowed_types;
+    if (!placer_opts.allowed_tiles_for_delay_model.empty()) {
+        std::vector<std::string> allowed_types_vector = vtr::split(placer_opts.allowed_tiles_for_delay_model, ",");
+        allowed_types = std::set(allowed_types_vector.begin(), allowed_types_vector.end());
+    }
+
+    for (int from_layer_num = 0; from_layer_num < (int)num_layers; from_layer_num++) {
+        for (int to_layer_num = 0; to_layer_num < (int)num_layers; to_layer_num++) {
+            vtr::NdMatrix<std::vector<float>, 2> sampled_delta_delays({device_width, device_height});
+
+            // Find the lowest y location on the left edge with a non-empty block
+            int y = 0;
+            int x = 0;
+            t_physical_tile_type_ptr src_type = nullptr;
+            for (x = 0; x < (int)device_width; ++x) {
+                for (y = 0; y < (int)device_height; ++y) {
+                    t_physical_tile_type_ptr type = grid.get_physical_type({x, y, from_layer_num});
+
+                    if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
+                        // check if the tile type is among the allowed types
+                        if (!allowed_types.empty() && allowed_types.find(type->name) == allowed_types.end()) {
+                            continue;
+                        }
+                        src_type = type;
+                        break;
+                    }
+                }
+                if (src_type != nullptr) {
+                    break;
+                }
+            }
+            VTR_ASSERT(src_type != nullptr);
+
+            auto generic_compute_matrix = (placer_opts.place_delta_delay_matrix_calculation_method == e_place_delta_delay_algorithm::ASTAR_ROUTE) ? generic_compute_matrix_iterative_astar : generic_compute_matrix_dijkstra_expansion;
+
+#ifdef VERBOSE
+            VTR_LOG("Computing from lower left edge (%d,%d):\n", x, y);
+#endif
+            generic_compute_matrix(route_profiler, sampled_delta_delays,
+                                   from_layer_num, to_layer_num,
+                                   x, y,
+                                   x, y,
+                                   device_width - 1, device_height - 1,
+                                   router_opts,
+                                   measure_directconnect, allowed_types,
+                                   is_flat);
+
+            // Find the lowest x location on the bottom edge with a non-empty block
+            src_type = nullptr;
+            for (y = 0; y < (int)device_height; ++y) {
+                for (x = 0; x < (int)device_width; ++x) {
+                    t_physical_tile_type_ptr type = grid.get_physical_type({x, y, from_layer_num});
+
+                    if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
+                        // check if the tile type is among the allowed types
+                        if (!allowed_types.empty() && allowed_types.find(type->name) == allowed_types.end()) {
+                            continue;
+                        }
+                        src_type = type;
+                        break;
+                    }
+                }
+                if (src_type) {
+                    break;
+                }
+            }
+            VTR_ASSERT(src_type != nullptr);
+#ifdef VERBOSE
+            VTR_LOG("Computing from left bottom edge (%d,%d):\n", x, y);
+#endif
+            generic_compute_matrix(route_profiler, sampled_delta_delays,
+                                   from_layer_num, to_layer_num,
+                                   x, y,
+                                   x, y,
+                                   device_width - 1, device_height - 1,
+                                   router_opts,
+                                   measure_directconnect, allowed_types,
+                                   is_flat);
+
+            //Since the other delta delay values may have suffered from edge effects,
+            //we recalculate deltas within regions B, C, E, F
+#ifdef VERBOSE
+            VTR_LOG("Computing from low/low:\n");
+#endif
+            generic_compute_matrix(route_profiler, sampled_delta_delays,
+                                   from_layer_num, to_layer_num,
+                                   low_x, low_y,
+                                   low_x, low_y,
+                                   device_width - 1, device_height - 1,
+                                   router_opts,
+                                   measure_directconnect, allowed_types,
+                                   is_flat);
+
+            //Since the other delta delay values may have suffered from edge effects,
+            //we recalculate deltas within regions D, E, G, H
+#ifdef VERBOSE
+            VTR_LOG("Computing from high/high:\n");
+#endif
+            generic_compute_matrix(route_profiler, sampled_delta_delays,
+                                   from_layer_num, to_layer_num,
+                                   high_x, high_y,
+                                   0, 0,
+                                   high_x, high_y,
+                                   router_opts,
+                                   measure_directconnect, allowed_types,
+                                   is_flat);
+
+            //Since the other delta delay values may have suffered from edge effects,
+            //we recalculate deltas within regions A, B, D, E
+#ifdef VERBOSE
+            VTR_LOG("Computing from high/low:\n");
+#endif
+            generic_compute_matrix(route_profiler, sampled_delta_delays,
+                                   from_layer_num, to_layer_num,
+                                   high_x, low_y,
+                                   0, low_y,
+                                   high_x, device_height - 1,
+                                   router_opts,
+                                   measure_directconnect, allowed_types,
+                                   is_flat);
+
+            //Since the other delta delay values may have suffered from edge effects,
+            //we recalculate deltas within regions E, F, H, I
+#ifdef VERBOSE
+            VTR_LOG("Computing from low/high:\n");
+#endif
+            generic_compute_matrix(route_profiler, sampled_delta_delays,
+                                   from_layer_num, to_layer_num,
+                                   low_x, high_y,
+                                   low_x, 0,
+                                   device_width - 1, high_y,
+                                   router_opts,
+                                   measure_directconnect, allowed_types,
+                                   is_flat);
+            for (size_t dx = 0; dx < sampled_delta_delays.dim_size(0); ++dx) {
+                for (size_t dy = 0; dy < sampled_delta_delays.dim_size(1); ++dy) {
+                    delta_delays[from_layer_num][to_layer_num][dx][dy] = delay_reduce(sampled_delta_delays[dx][dy], placer_opts.delay_model_reducer);
+                }
+            }
+        }
+    }
+
+    return delta_delays;
+}
+
+static void fix_empty_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
+    // Set any empty deltas to the average of their neighbours
+    //
+    // Empty coordinates may occur if the sampling location happens to not have
+    // a connection at that location. However, a more thorough sampling likely
+    // would return a result, so we fill in the empty holes with a small
+    // neighbour average.
+    constexpr int kMaxAverageDistance = 2; // passed as max_distance to find_neighboring_average
+    for (int from_layer = 0; from_layer < (int)delta_delays.dim_size(0); ++from_layer) {
+        for (int to_layer = 0; to_layer < (int)delta_delays.dim_size(1); ++to_layer) {
+            for (int delta_x = 0; delta_x < (int)delta_delays.dim_size(2); ++delta_x) {
+                for (int delta_y = 0; delta_y < (int)delta_delays.dim_size(3); ++delta_y) {
+                    if (delta_delays[from_layer][to_layer][delta_x][delta_y] == EMPTY_DELTA) {
+                        delta_delays[from_layer][to_layer][delta_x][delta_y] =
+                            find_neighboring_average(delta_delays,
+                                                     from_layer,
+                                                     {delta_x, delta_y, to_layer},
+                                                     kMaxAverageDistance);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void fill_impossible_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
+    // Set any impossible deltas to the average of their neighbours
+    //
+    // Impossible coordinates may occur if an IPIN cannot be reached from the
+    // sampling OPIN.  This might occur if the IPIN or OPIN used for sampling
+    // is specialized, and therefore cannot be reached via the pins
+    // sampled.  Leaving this value in the delay matrix will result in invalid
+    // slacks if the delay matrix uses this value.
+    //
+    // A max average distance of 5 is used to provide increased effort in
+    // filling these gaps.  It is more important to have a poor prediction
+    // than an invalid value that causes a slack assertion.
+    constexpr int kMaxAverageDistance = 5;
+    for (int from_layer_num = 0; from_layer_num < (int)delta_delays.dim_size(0); ++from_layer_num) {
+        for (int to_layer_num = 0; to_layer_num < (int)delta_delays.dim_size(1); ++to_layer_num) {
+            for (int delta_x = 0; delta_x < (int)delta_delays.dim_size(2); ++delta_x) {
+                for (int delta_y = 0; delta_y < (int)delta_delays.dim_size(3); ++delta_y) {
+                    if (delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] == IMPOSSIBLE_DELTA) {
+                        delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] = find_neighboring_average(
+                            delta_delays, from_layer_num, {delta_x, delta_y, to_layer_num}, kMaxAverageDistance);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static bool verify_delta_delays(const vtr::NdMatrix<float, 4>& delta_delays) {
+    const auto& device_ctx = g_vpr_ctx.device();
+    const auto& grid = device_ctx.grid;
+    // Every entry must be non-negative; negative entries are reported through VPR_ERROR.
+    for (int from_layer_num = 0; from_layer_num < grid.get_num_layers(); ++from_layer_num) {
+        for (int to_layer_num = 0; to_layer_num < grid.get_num_layers(); ++to_layer_num) {
+            for (size_t x = 0; x < grid.width(); ++x) {
+                for (size_t y = 0; y < grid.height(); ++y) {
+                    float delta_delay = delta_delays[from_layer_num][to_layer_num][x][y];
+                    // The UNINITIALIZED_DELTA / EMPTY_DELTA sentinels are negative, so leftovers are caught here.
+                    if (delta_delay < 0.) {
+                        VPR_ERROR(VPR_ERROR_PLACE,
+                                  "Found invalid negative delay %g for delta [%d,%d,%zu,%zu]",
+                                  delta_delay, from_layer_num, to_layer_num, x, y);
+                    }
+                }
+            }
+        }
+    }
+
+    return true;
+}
+
+static void generic_compute_matrix_iterative_astar(RouterDelayProfiler& route_profiler,
+                                                   vtr::Matrix<std::vector<float>>& matrix,
+                                                   int from_layer_num,
+                                                   int to_layer_num,
+                                                   int source_x,
+                                                   int source_y,
+                                                   int start_x,
+                                                   int start_y,
+                                                   int end_x,
+                                                   int end_y,
+                                                   const t_router_opts& router_opts,
+                                                   bool measure_directconnect,
+                                                   const std::set<std::string>& allowed_types,
+                                                   bool /*is_flat*/) {
+    const auto& device_ctx = g_vpr_ctx.device();
+    // Route from the fixed source tile to every sink tile in the inclusive [start, end] window.
+    for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
+        for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
+            const int delta_x = abs(sink_x - source_x);
+            const int delta_y = abs(sink_y - source_y);
+            // Samples are stored under the absolute offset between source and sink.
+            t_physical_tile_type_ptr src_type = device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num});
+            t_physical_tile_type_ptr sink_type = device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num});
+
+            bool src_or_target_empty = (src_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE
+                                        || sink_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE);
+            // Only the source tile type is checked against allowed_types; an empty set allows every type.
+            bool is_allowed_type = allowed_types.empty() || allowed_types.find(src_type->name) != allowed_types.end();
+
+            if (src_or_target_empty || !is_allowed_type) {
+                if (matrix[delta_x][delta_y].empty()) {
+                    // Only set empty target if we don't already have a valid delta delay
+                    matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
+#ifdef VERBOSE
+                    VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
+                            "EMPTY",
+                            delta_x, delta_y,
+                            source_x, source_y,
+                            sink_x, sink_y);
+#endif
+                }
+            } else {
+                // Valid start/end
+                float delay = route_connection_delay(route_profiler,
+                                                     source_x,
+                                                     source_y,
+                                                     from_layer_num,
+                                                     sink_x,
+                                                     sink_y,
+                                                     to_layer_num,
+                                                     router_opts,
+                                                     measure_directconnect);
+
+#ifdef VERBOSE
+                VTR_LOG("Computed delay: %12g delta: %d,%d (src: %d,%d sink: %d,%d)\n",
+                        delay,
+                        delta_x, delta_y,
+                        source_x, source_y,
+                        sink_x, sink_y);
+#endif
+                if (matrix[delta_x][delta_y].size() == 1 && matrix[delta_x][delta_y][0] == EMPTY_DELTA) {
+                    // Overwrite empty delta
+                    matrix[delta_x][delta_y][0] = delay;
+                } else {
+                    // Collect delta
+                    matrix[delta_x][delta_y].push_back(delay);
+                }
+            }
+        }
+    }
+}
+
+static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& /*route_profiler*/,
+                                                      vtr::Matrix<std::vector<float>>& matrix,
+                                                      int from_layer_num,
+                                                      int to_layer_num,
+                                                      int source_x,
+                                                      int source_y,
+                                                      int start_x,
+                                                      int start_y,
+                                                      int end_x,
+                                                      int end_y,
+                                                      const t_router_opts& router_opts,
+                                                      bool measure_directconnect,
+                                                      const std::set<std::string>& allowed_types,
+                                                      bool is_flat) {
+    const auto& device_ctx = g_vpr_ctx.device();
+    // An empty or disallowed source tile cannot be sampled: mark untouched entries EMPTY_DELTA and return.
+    t_physical_tile_type_ptr src_type = device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num});
+    bool is_allowed_type = allowed_types.empty() || allowed_types.find(src_type->name) != allowed_types.end();
+    if (src_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE || !is_allowed_type) {
+        for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
+            for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
+                int delta_x = abs(sink_x - source_x);
+                int delta_y = abs(sink_y - source_y);
+
+                if (matrix[delta_x][delta_y].empty()) {
+                    //Only set empty target if we don't already have a valid delta delay
+                    matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
+#ifdef VERBOSE
+                    VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
+                            "EMPTY",
+                            delta_x, delta_y,
+                            source_x, source_y,
+                            sink_x, sink_y);
+#endif
+                }
+            }
+        }
+
+        return;
+    }
+    // Tracks which (delta_x, delta_y) entries have been resolved during this call.
+    vtr::Matrix<bool> found_matrix({matrix.dim_size(0), matrix.dim_size(1)}, false);
+    // Try the candidate driver classes in order; stop once one of them reaches every non-empty sink.
+    auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num}));
+    for (int driver_ptc : best_driver_ptcs) {
+        VTR_ASSERT(driver_ptc != OPEN);
+        RRNodeId source_rr_node = device_ctx.rr_graph.node_lookup().find_node(from_layer_num, source_x, source_y, SOURCE, driver_ptc);
+
+        VTR_ASSERT(source_rr_node != RRNodeId::INVALID());
+        auto delays = calculate_all_path_delays_from_rr_node(source_rr_node, router_opts, is_flat);
+
+        bool path_to_all_sinks = true;
+        for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
+            for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
+                int delta_x = abs(sink_x - source_x);
+                int delta_y = abs(sink_y - source_y);
+
+                if (found_matrix[delta_x][delta_y]) {
+                    continue;
+                }
+                // NOTE(review): an empty sink whose entry already holds samples is not marked found, so the final pass adds IMPOSSIBLE_DELTA for it - confirm intended.
+                t_physical_tile_type_ptr sink_type = device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num});
+                if (sink_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
+                    if (matrix[delta_x][delta_y].empty()) {
+                        // Only set empty target if we don't already have a valid delta delay
+                        matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
+#ifdef VERBOSE
+                        VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
+                                "EMPTY",
+                                delta_x, delta_y,
+                                source_x, source_y,
+                                sink_x, sink_y);
+#endif
+                        found_matrix[delta_x][delta_y] = true;
+                    }
+                } else {
+                    bool found_a_sink = false;
+                    auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num}));
+                    for (int sink_ptc : best_sink_ptcs) {
+                        VTR_ASSERT(sink_ptc != OPEN);
+                        RRNodeId sink_rr_node = device_ctx.rr_graph.node_lookup().find_node(to_layer_num, sink_x, sink_y, SINK, sink_ptc);
+
+                        if (sink_rr_node == RRNodeId::INVALID())
+                            continue;
+
+                        if (!measure_directconnect && directconnect_exists(source_rr_node, sink_rr_node)) {
+                            // Skip if we shouldn't measure direct connects and a direct connect exists
+                            continue;
+                        }
+
+                        if (std::isnan(delays[sink_rr_node])) {
+                            // This sink was not found
+                            continue;
+                        }
+
+#ifdef VERBOSE
+                        VTR_LOG("Computed delay: %12g delta: %d,%d (src: %d,%d sink: %d,%d)\n",
+                                delays[size_t(sink_rr_node)],
+                                delta_x, delta_y,
+                                source_x, source_y,
+                                sink_x, sink_y);
+#endif
+                        found_matrix[delta_x][delta_y] = true;
+
+                        add_delay_to_matrix(matrix, delta_x, delta_y, delays[sink_rr_node]);
+
+                        found_a_sink = true;
+                        break;
+                    }
+
+                    if (!found_a_sink) {
+                        path_to_all_sinks = false;
+                    }
+                }
+            }
+        }
+
+        if (path_to_all_sinks) {
+            break;
+        }
+    }
+    // Any entry still unresolved after all driver classes is recorded as IMPOSSIBLE_DELTA with a warning.
+    for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
+        for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
+            int delta_x = abs(sink_x - source_x);
+            int delta_y = abs(sink_y - source_y);
+            if (!found_matrix[delta_x][delta_y]) {
+                add_delay_to_matrix(matrix, delta_x, delta_y, IMPOSSIBLE_DELTA);
+                VTR_LOG_WARN("Unable to route between blocks at (%d,%d,%d) and (%d,%d,%d) to characterize delay (setting to %g)\n",
+                             source_x,
+                             source_y,
+                             from_layer_num,
+                             sink_x,
+                             sink_y,
+                             to_layer_num,
+                             IMPOSSIBLE_DELTA);
+            }
+        }
+    }
+}
+
+static float route_connection_delay(RouterDelayProfiler& route_profiler,
+                                    int source_x,
+                                    int source_y,
+                                    int source_layer,
+                                    int sink_x,
+                                    int sink_y,
+                                    int sink_layer,
+                                    const t_router_opts& router_opts,
+                                    bool measure_directconnect) {
+    // Routes from a SOURCE node at the source tile to a SINK node at the sink tile and returns the delay written by route_profiler.calculate_delay()
+
+    // Starts as IMPOSSIBLE_DELTA (a known value for debug purposes); NOTE(review): confirm calculate_delay() leaves it untouched when routing fails
+    float net_delay_value = IMPOSSIBLE_DELTA;
+
+    const auto& device_ctx = g_vpr_ctx.device();
+
+    bool successfully_routed = false;
+
+    // Candidate pin classes (ptc numbers) for the rr nodes to route between, tried in the order returned by get_best_classes()
+    auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid.get_physical_type({source_x, source_y, source_layer}));
+    auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid.get_physical_type({sink_x, sink_y, sink_layer}));
+
+    for (int driver_ptc : best_driver_ptcs) {
+        VTR_ASSERT(driver_ptc != OPEN);
+        RRNodeId source_rr_node = device_ctx.rr_graph.node_lookup().find_node(source_layer, source_x, source_y, SOURCE, driver_ptc);
+
+        VTR_ASSERT(source_rr_node != RRNodeId::INVALID());
+
+        for (int sink_ptc : best_sink_ptcs) {
+            VTR_ASSERT(sink_ptc != OPEN);
+            RRNodeId sink_rr_node = device_ctx.rr_graph.node_lookup().find_node(sink_layer, sink_x, sink_y, SINK, sink_ptc);
+
+            if (sink_rr_node == RRNodeId::INVALID())
+                continue;
+
+            if (!measure_directconnect && directconnect_exists(source_rr_node, sink_rr_node)) {
+                // Skip this source/sink pair: direct connects are not to be measured and one exists between these nodes
+                continue;
+            }
+
+            successfully_routed = route_profiler.calculate_delay(source_rr_node,
+                                                                 sink_rr_node,
+                                                                 router_opts,
+                                                                 &net_delay_value);
+
+            if (successfully_routed) break; // the first sink class that routes is used
+        }
+        if (successfully_routed) break; // no need to try further driver classes
+    }
+
+    if (!successfully_routed) {
+        VTR_LOG_WARN("Unable to route between blocks at (%d,%d,%d) and (%d,%d,%d) to characterize delay (setting to %g)\n",
+                     source_x, source_y, source_layer, sink_x, sink_y, sink_layer, net_delay_value);
+    }
+
+    return net_delay_value;
+}
+
+static float delay_reduce(std::vector<float>& delays, e_reducer reducer) { // Collapses the delay samples in 'delays' to one value using 'reducer'
+    if (delays.empty()) {
+        return IMPOSSIBLE_DELTA; // no samples to reduce
+    }
+
+    if (delays.size() == 1) {
+        return delays[0]; // a single sample is returned unchanged, whatever the reducer
+    }
+
+    VTR_ASSERT(delays.size() > 1);
+
+    float delay; // assigned by every recognized reducer branch below
+
+    if (reducer == e_reducer::MIN) {
+        auto itr = std::min_element(delays.begin(), delays.end());
+        delay = *itr;
+    } else if (reducer == e_reducer::MAX) {
+        auto itr = std::max_element(delays.begin(), delays.end());
+        delay = *itr;
+    } else if (reducer == e_reducer::MEDIAN) {
+        std::stable_sort(delays.begin(), delays.end()); // note: reorders the caller's vector in place
+        delay = vtr::median(delays.begin(), delays.end());
+    } else if (reducer == e_reducer::ARITHMEAN) {
+        delay = vtr::arithmean(delays.begin(), delays.end());
+    } else if (reducer == e_reducer::GEOMEAN) {
+        delay = vtr::geomean(delays.begin(), delays.end());
+    } else {
+        VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Unrecognized delta delay reducer");
+    }
+
+    return delay;
+}
+
+static void add_delay_to_matrix(vtr::Matrix<std::vector<float>>& matrix,
+                                int delta_x,
+                                int delta_y,
+                                float delay) {
+    if (matrix[delta_x][delta_y].size() == 1 && matrix[delta_x][delta_y][0] == EMPTY_DELTA) {
+        // The cell holds only the EMPTY_DELTA placeholder: replace it with this sample
+        matrix[delta_x][delta_y][0] = delay;
+    } else {
+        // Otherwise append, so the cell accumulates every sample recorded for this (delta_x, delta_y)
+        matrix[delta_x][delta_y].push_back(delay);
+    }
+}
+
+static float find_neighboring_average(vtr::NdMatrix<float, 4>& matrix,
+                                      int from_layer,
+                                      t_physical_tile_loc to_tile_loc,
+                                      int max_distance) {
+    float sum = 0.f;
+    int num_samples = 0;
+    const int endx = matrix.end_index(2);
+    const int endy = matrix.end_index(3);
+
+    const int x = to_tile_loc.x;
+    const int y = to_tile_loc.y;
+    const int to_layer = to_tile_loc.layer_num;
+
+    for (int distance = 1; distance <= max_distance; ++distance) { // widen the search radius one step at a time
+        for (int delx = x - distance; delx <= x + distance; delx++) {
+            for (int dely = y - distance; dely <= y + distance; dely++) {
+                // Skip cells whose Manhattan distance from (x, y) exceeds the current search radius
+                if (abs(delx - x) + abs(dely - y) > distance) {
+                    continue;
+                }
+
+                // Skip cells outside [0, endx) x [0, endy), and the centre cell (x, y) itself
+                if (delx < 0 || dely < 0 || delx >= endx || dely >= endy || (delx == x && dely == y)) {
+                    continue;
+                }
+
+                if (matrix[from_layer][to_layer][delx][dely] == EMPTY_DELTA || matrix[from_layer][to_layer][delx][dely] == IMPOSSIBLE_DELTA) {
+                    continue; // EMPTY_DELTA / IMPOSSIBLE_DELTA entries are excluded from the average
+                }
+
+                sum += matrix[from_layer][to_layer][delx][dely];
+                num_samples++;
+            }
+        }
+
+        if (num_samples != 0) { // return at the smallest radius that yielded any sample
+            return sum / (float)num_samples;
+        }
+    }
+
+    return IMPOSSIBLE_DELTA; // no usable neighbour found within max_distance
+}
+
+/***************************************************************************************/
+
+vtr::NdMatrix<float, 4> compute_delta_delay_model(RouterDelayProfiler& route_profiler,
+                                                  const t_placer_opts& placer_opts,
+                                                  const t_router_opts& router_opts,
+                                                  bool measure_directconnect,
+                                                  int longest_length,
+                                                  bool is_flat) {
+    vtr::ScopedStartFinishTimer timer("Computing delta delays");
+    vtr::NdMatrix<float, 4> delta_delays = compute_delta_delays(route_profiler,
+                                                                placer_opts,
+                                                                router_opts,
+                                                                measure_directconnect,
+                                                                longest_length,
+                                                                is_flat);
+
+    const size_t num_elements = delta_delays.size();
+
+    // Replace every UNINITIALIZED_DELTA entry with IMPOSSIBLE_DELTA before the post-processing passes below
+    for (size_t i = 0; i < num_elements; i++) {
+        if (delta_delays.get(i) == UNINITIALIZED_DELTA) {
+            delta_delays.get(i) = IMPOSSIBLE_DELTA;
+        }
+    }
+
+    fix_empty_coordinates(delta_delays);
+
+    fill_impossible_coordinates(delta_delays);
+
+    verify_delta_delays(delta_delays);
+
+    return delta_delays;
+}
+
+//Finds a SOURCE/SINK RR node pair for measuring the delay of the given direct specification; returns false if no matching grid location is found
+bool find_direct_connect_sample_locations(const t_direct_inf* direct,
+                                          t_physical_tile_type_ptr from_type,
+                                          int from_pin,
+                                          int from_pin_class,
+                                          t_physical_tile_type_ptr to_type,
+                                          int to_pin,
+                                          int to_pin_class,
+                                          RRNodeId& out_src_node,
+                                          RRNodeId& out_sink_node) {
+    VTR_ASSERT(from_type != nullptr);
+    VTR_ASSERT(to_type != nullptr);
+
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& grid = device_ctx.grid;
+    const auto& node_lookup = device_ctx.rr_graph.node_lookup();
+
+    //Search the grid for an instance of from/to blocks which satisfies this direct connect's offsets,
+    //and which has the appropriate pins
+    int from_x = -1;
+    int from_y = -1;
+    int from_sub_tile = -1;
+    int to_x = 0, to_y = 0, to_sub_tile = 0;
+    bool found = false;
+    int found_layer_num = -1;
+    //TODO: For now this function assumes the from/to blocks are on the same die (i.e. share the same layer number)
+    for (int layer_num = 0; layer_num < grid.get_num_layers() && !found; ++layer_num) {
+        for (int x = 0; x < (int)grid.width() && !found; ++x) {
+            to_x = x + direct->x_offset;
+            if (to_x < 0 || to_x >= (int)grid.width()) continue;
+
+            for (int y = 0; y < (int)grid.height() && !found; ++y) {
+                if (grid.get_physical_type({x, y, layer_num}) != from_type) continue;
+
+                //Check that the from pin exists at this from location
+                //(with multi-width/height blocks pins may not exist at all locations)
+                bool from_pin_found = false;
+                if (direct->from_side != NUM_2D_SIDES) {
+                    RRNodeId from_pin_rr = node_lookup.find_node(layer_num, x, y, OPIN, from_pin, direct->from_side);
+                    from_pin_found = from_pin_rr.is_valid();
+                } else {
+                    from_pin_found = !(node_lookup.find_nodes_at_all_sides(layer_num, x, y, OPIN, from_pin).empty());
+                }
+                if (!from_pin_found) continue;
+
+                to_y = y + direct->y_offset;
+
+                if (to_y < 0 || to_y >= (int)grid.height()) continue;
+                if (grid.get_physical_type({to_x, to_y, layer_num}) != to_type) continue;
+
+                //Check that the to pin exists at this to location
+                //(with multi-width/height blocks pins may not exist at all locations)
+                bool to_pin_found = false;
+                if (direct->to_side != NUM_2D_SIDES) {
+                    RRNodeId to_pin_rr = node_lookup.find_node(layer_num, to_x, to_y, IPIN, to_pin, direct->to_side);
+                    to_pin_found = (to_pin_rr != RRNodeId::INVALID());
+                } else {
+                    to_pin_found = !(node_lookup.find_nodes_at_all_sides(layer_num, to_x, to_y, IPIN, to_pin).empty());
+                }
+                if (!to_pin_found) continue;
+
+                for (int sub_tile_num = 0; sub_tile_num < from_type->capacity; ++sub_tile_num) {
+                    to_sub_tile = sub_tile_num + direct->sub_tile_offset;
+
+                    if (to_sub_tile < 0 || to_sub_tile >= to_type->capacity) continue;
+
+                    found = true;
+                    found_layer_num = layer_num;
+                    from_x = x;
+                    from_y = y;
+                    from_sub_tile = sub_tile_num;
+
+                    break; // the first legal sub-tile pairing is used; the '!found' loop conditions end the outer loops
+                }
+            }
+        }
+    }
+
+    if (!found) {
+        return false;
+    }
+
+    //Now have a legal instance of this direct connect
+    VTR_ASSERT(grid.get_physical_type({from_x, from_y, found_layer_num}) == from_type);
+    VTR_ASSERT(from_sub_tile < from_type->capacity);
+
+    VTR_ASSERT(grid.get_physical_type({to_x, to_y, found_layer_num}) == to_type);
+    VTR_ASSERT(to_sub_tile < to_type->capacity);
+
+    VTR_ASSERT(from_x + direct->x_offset == to_x);
+    VTR_ASSERT(from_y + direct->y_offset == to_y);
+    VTR_ASSERT(from_sub_tile + direct->sub_tile_offset == to_sub_tile);
+
+    // Look up the SOURCE/SINK RR nodes of the given pin classes at the found locations and hand them back through the out parameters
+    {
+        RRNodeId src_rr_candidate = node_lookup.find_node(found_layer_num, from_x, from_y, SOURCE, from_pin_class);
+        VTR_ASSERT(src_rr_candidate);
+        out_src_node = src_rr_candidate;
+    }
+
+    {
+        RRNodeId sink_rr_candidate = node_lookup.find_node(found_layer_num, to_x, to_y, SINK, to_pin_class);
+        VTR_ASSERT(sink_rr_candidate);
+        out_sink_node = sink_rr_candidate;
+    }
+
+    return true;
+}
+
+std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type) {
+    std::vector<int> best_classes;
+
+    //Record any non-zero Fc pins
+    //
+    //Note that we track non-zero Fc pins, since certain Fc overrides
+    //may apply to only a subset of wire types. This ensures we record
+    //which pins can potentially connect to global routing.
+    std::unordered_set<int> non_zero_fc_pins;
+    for (const t_fc_specification& fc_spec : type->fc_specs) {
+        if (fc_spec.fc_value == 0) continue;
+
+        non_zero_fc_pins.insert(fc_spec.pins.begin(), fc_spec.pins.end());
+    }
+
+    // Collect the indices of all classes of the requested pin type which connect to general routing
+    for (int i = 0; i < (int)type->class_inf.size(); i++) {
+        if (type->class_inf[i].type == pintype) {
+            //Look for at least one pin in this class that is neither ignored nor zero-Fc
+            bool any_pins_connect_to_general_routing = false;
+            for (int ipin = 0; ipin < type->class_inf[i].num_pins; ++ipin) {
+                int pin = type->class_inf[i].pinlist[ipin];
+                //If the pin isn't ignored, and has a non-zero Fc to some general
+                //routing the class is suitable for delay profiling
+                if (!type->is_ignored_pin[pin] && non_zero_fc_pins.count(pin)) {
+                    any_pins_connect_to_general_routing = true;
+                    break;
+                }
+            }
+
+            // Skip if the pin class doesn't connect to general routing
+            if (!any_pins_connect_to_general_routing) continue;
+
+            // Record candidate class
+            best_classes.push_back(i);
+        }
+    }
+
+    // Sort classes so the largest pin class is first; stable_sort keeps index order among classes with equal pin counts
+    auto cmp_class = [&](int lhs, int rhs) {
+        return type->class_inf[lhs].num_pins > type->class_inf[rhs].num_pins;
+    };
+
+    std::stable_sort(best_classes.begin(), best_classes.end(), cmp_class);
+
+    return best_classes;
+}
\ No newline at end of file
diff --git a/vpr/src/place/delay_model/compute_delta_delays_utils.h b/vpr/src/place/delay_model/compute_delta_delays_utils.h
new file mode 100644
index 00000000000..71ac632b149
--- /dev/null
+++ b/vpr/src/place/delay_model/compute_delta_delays_utils.h
@@ -0,0 +1,56 @@
+
+#pragma once
+
+#include "vtr_ndmatrix.h"
+#include "physical_types.h"
+#include "rr_graph_fwd.h"
+
+struct t_placer_opts;
+struct t_router_opts;
+class RouterDelayProfiler;
+
+vtr::NdMatrix<float, 4> compute_delta_delay_model(RouterDelayProfiler& route_profiler,
+                                                  const t_placer_opts& placer_opts,
+                                                  const t_router_opts& router_opts,
+                                                  bool measure_directconnect,
+                                                  int longest_length,
+                                                  bool is_flat);
+
+bool find_direct_connect_sample_locations(const t_direct_inf* direct,
+                                          t_physical_tile_type_ptr from_type,
+                                          int from_pin,
+                                          int from_pin_class,
+                                          t_physical_tile_type_ptr to_type,
+                                          int to_pin,
+                                          int to_pin_class,
+                                          RRNodeId& out_src_node,
+                                          RRNodeId& out_sink_node);
+
+/**
+ * @brief Identifies the best pin classes for delay calculation based on pin count and connectivity.
+ *
+ * This function selects pin classes of a specified type (`pintype`) from a physical tile type (`type`)
+ * that are suitable for delay calculations. It prioritizes pin classes with the largest number of pins
+ * that connect to general routing, ensuring commonly used pins are chosen for delay profiling.
+ *
+ * @param pintype The type of pins to filter.
+ * @param type Pointer to the physical tile type containing pin and class information.
+ *
+ * @return A vector of indices representing the selected pin classes. The classes are sorted
+ *         in descending order based on the number of pins they contain.
+ *
+ * @details
+ * - A pin class is eligible if its type matches `pintype` and it contains at least one pin
+ *   that connects to general routing (non-zero Fc).
+ * - Non-zero Fc pins are determined by inspecting the tile's `fc_specs`.
+ * - Classes are sorted so that the class with the largest number of pins appears first.
+ *   If multiple classes have the same pin count, their order depends on their initial appearance
+ *   in the architecture file.
+ *
+ * @note
+ * - Pins explicitly marked as ignored in `type->is_ignored_pin` are excluded.
+ * - The function ensures stability in sorting, preserving the input order for classes
+ *   with the same number of pins.
+ */
+
+std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type);
\ No newline at end of file
diff --git a/vpr/src/place/delay_model/delta_delay_model.cpp b/vpr/src/place/delay_model/delta_delay_model.cpp
new file mode 100644
index 00000000000..e8d56b09516
--- /dev/null
+++ b/vpr/src/place/delay_model/delta_delay_model.cpp
@@ -0,0 +1,135 @@
+
+#include "delta_delay_model.h"
+
+#include "compute_delta_delays_utils.h"
+
+#ifdef VTR_ENABLE_CAPNPROTO
+#    include "capnp/serialize.h"
+#    include "place_delay_model.capnp.h"
+#    include "ndmatrix_serdes.h"
+#    include "mmap_file.h"
+#    include "serdes_utils.h"
+#endif  // VTR_ENABLE_CAPNPROTO
+
+void DeltaDelayModel::compute(RouterDelayProfiler& route_profiler,
+                              const t_placer_opts& placer_opts,
+                              const t_router_opts& router_opts,
+                              int longest_length) {
+    delays_ = compute_delta_delay_model(route_profiler, // result replaces the stored delay matrix
+                                        placer_opts,
+                                        router_opts,
+                                        /*measure_directconnect=*/true, // unlike OverrideDelayModel::compute, which passes false
+                                        longest_length,
+                                        is_flat_);
+}
+
+float DeltaDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pin*/,
+                             const t_physical_tile_loc& to_loc, int /*to_pin*/) const {
+    int delta_x = std::abs(from_loc.x - to_loc.x); // lookup uses absolute offsets, so direction is ignored
+    int delta_y = std::abs(from_loc.y - to_loc.y);
+
+    return delays_[from_loc.layer_num][to_loc.layer_num][delta_x][delta_y]; // indexed [from layer][to layer][|dx|][|dy|]; pin arguments are unused
+}
+
+void DeltaDelayModel::dump_echo(std::string filepath) const { // Writes the delay matrix to 'filepath' as text tables, one per (from layer, to layer) pair
+    FILE* f = vtr::fopen(filepath.c_str(), "w");
+    fprintf(f, "         "); // leading pad, written once before the per-layer tables
+    for (size_t from_layer_num = 0; from_layer_num < delays_.dim_size(0); ++from_layer_num) {
+        for (size_t to_layer_num = 0; to_layer_num < delays_.dim_size(1); ++to_layer_num) {
+            fprintf(f, " %9zu", from_layer_num); // NOTE(review): only the from-layer index is printed; to_layer_num never appears in the output
+            fprintf(f, "\n");
+            for (size_t dx = 0; dx < delays_.dim_size(2); ++dx) { // header row listing every dx
+                fprintf(f, " %9zu", dx);
+            }
+            fprintf(f, "\n");
+            for (size_t dy = 0; dy < delays_.dim_size(3); ++dy) { // one row per dy: the dy value, then the delay for each dx
+                fprintf(f, "%9zu", dy);
+                for (size_t dx = 0; dx < delays_.dim_size(2); ++dx) {
+                    fprintf(f, " %9.2e", delays_[from_layer_num][to_layer_num][dx][dy]);
+                }
+                fprintf(f, "\n");
+            }
+        }
+    }
+    vtr::fclose(f);
+}
+
+void DeltaDelayModel::read(const std::string& file) { // Loads delays_ from a Cap'n Proto file; throws if built without VTR_ENABLE_CAPNPROTO
+#ifndef VTR_ENABLE_CAPNPROTO
+    VPR_THROW(VPR_ERROR_PLACE,
+              "DeltaDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
+#else
+
+    // MmapFile object creates an mmap of the specified path, and will munmap
+    // when the object leaves scope.
+    MmapFile f(file);
+
+    /* Increase reader limit to 1G words to allow for large files. */
+    ::capnp::ReaderOptions opts = default_large_capnp_opts();
+
+    // FlatArrayMessageReader is used to read the message from the data array
+    // provided by MmapFile.
+    ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
+
+    // When reading capnproto files the Reader object to use is named
+    // <schema name>::Reader.
+    //
+    // Initially this object is an empty VprDeltaDelayModel.
+    VprDeltaDelayModel::Reader model;
+
+    // The reader.getRoot performs a cast from the generic capnproto to fit
+    // with the specified schema.
+    //
+    // Note that capnproto does not validate that the incoming data matches the
+    // schema.  If this property is required, some form of check would be
+    // required.
+    model = reader.getRoot<VprDeltaDelayModel>();
+
+    auto toFloat = [](float* out, const VprFloatEntry::Reader& in) -> void {
+        *out = in.getValue();
+    };
+
+    // ToNdMatrix is a generic function for converting a Matrix capnproto
+    // to a vtr::NdMatrix.
+    //
+    // The user must supply the matrix dimension (4 in this case), the source
+    // capnproto type (VprFloatEntry),
+    // target C++ type (float), and a function to convert from the source capnproto
+    // type to the target C++ type (toFloat).
+    //
+    // The second argument should be of type Matrix<X>::Reader where X is the
+    // capnproto element type.
+    ToNdMatrix<4, VprFloatEntry, float>(&delays_, model.getDelays(), toFloat);
+#endif
+}
+
+void DeltaDelayModel::write(const std::string& file) const { // Serializes delays_ to a Cap'n Proto file; throws if built without VTR_ENABLE_CAPNPROTO
+#ifndef VTR_ENABLE_CAPNPROTO
+    VPR_THROW(VPR_ERROR_PLACE,
+              "DeltaDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
+#else
+
+    // MallocMessageBuilder object is the generic capnproto message builder,
+    // using malloc for buffer allocation.
+    ::capnp::MallocMessageBuilder builder;
+
+    // initRoot<X> returns a X::Builder object that can be used to set the
+    // fields in the message.
+    auto model = builder.initRoot<VprDeltaDelayModel>();
+
+    auto fromFloat = [](VprFloatEntry::Builder* out, const float& in) -> void {
+        out->setValue(in);
+    };
+
+    // FromNdMatrix is a generic function for converting a vtr::NdMatrix to a
+    // Matrix message.  It is the mirror function of ToNdMatrix described in
+    // read above.
+    auto delay_values = model.getDelays();
+    FromNdMatrix<4, VprFloatEntry, float>(&delay_values, delays_, fromFloat);
+
+    // writeMessageToFile writes message to the specified file.
+    writeMessageToFile(file, &builder);
+#endif
+}
diff --git a/vpr/src/place/delay_model/delta_delay_model.h b/vpr/src/place/delay_model/delta_delay_model.h
new file mode 100644
index 00000000000..c3ae0d83cf7
--- /dev/null
+++ b/vpr/src/place/delay_model/delta_delay_model.h
@@ -0,0 +1,47 @@
+
+#pragma once
+
+#include "place_delay_model.h"
+
+/**
+ * @class DeltaDelayModel
+ *
+ * @brief A simple delay model based on the distance (delta) between block locations.
+ */
+class DeltaDelayModel : public PlaceDelayModel {
+  public:
+    DeltaDelayModel(float min_cross_layer_delay, // leaves delays_ default-constructed
+                    bool is_flat)
+        : cross_layer_delay_(min_cross_layer_delay)
+        , is_flat_(is_flat) {}
+
+    DeltaDelayModel(float min_cross_layer_delay,
+                    vtr::NdMatrix<float, 4> delta_delays, // taken by value and moved into delays_
+                    bool is_flat)
+        : delays_(std::move(delta_delays))
+        , cross_layer_delay_(min_cross_layer_delay)
+        , is_flat_(is_flat) {}
+
+    void compute(RouterDelayProfiler& router,
+                 const t_placer_opts& placer_opts,
+                 const t_router_opts& router_opts,
+                 int longest_length) override;
+
+    float delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const override;
+
+    void dump_echo(std::string filepath) const override;
+
+    void read(const std::string& file) override;
+    void write(const std::string& file) const override;
+
+    const vtr::NdMatrix<float, 4>& delays() const { // read-only access to the stored delay matrix
+        return delays_;
+    }
+
+  private:
+    vtr::NdMatrix<float, 4> delays_; // [0..num_layers-1][0..num_layers-1][0..max_dx][0..max_dy], i.e. [from_layer][to_layer][dx][dy]
+    float cross_layer_delay_; // set from the constructors' min_cross_layer_delay argument
+
+    /// Indicates whether the router is a two-stage or run-flat
+    bool is_flat_;
+};
\ No newline at end of file
diff --git a/vpr/src/place/delay_model/override_delay_model.cpp b/vpr/src/place/delay_model/override_delay_model.cpp
new file mode 100644
index 00000000000..61acd2937b5
--- /dev/null
+++ b/vpr/src/place/delay_model/override_delay_model.cpp
@@ -0,0 +1,280 @@
+
+#include "override_delay_model.h"
+
+#include "compute_delta_delays_utils.h"
+
+#ifdef VTR_ENABLE_CAPNPROTO
+#    include "capnp/serialize.h"
+#    include "place_delay_model.capnp.h"
+#    include "ndmatrix_serdes.h"
+#    include "mmap_file.h"
+#    include "serdes_utils.h"
+#endif  // VTR_ENABLE_CAPNPROTO
+
+void OverrideDelayModel::compute(RouterDelayProfiler& route_profiler,
+                                 const t_placer_opts& placer_opts,
+                                 const t_router_opts& router_opts,
+                                 int longest_length) {
+    auto delays = compute_delta_delay_model(route_profiler,
+                                            placer_opts,
+                                            router_opts,
+                                            /*measure_directconnect=*/false,
+                                            longest_length,
+                                            is_flat_);
+    // Step 1: build the base delta model. The matrix is moved (not deep-copied) and is_flat_ is forwarded exactly as read() does.
+    base_delay_model_ = std::make_unique<DeltaDelayModel>(cross_layer_delay_, std::move(delays), is_flat_);
+    // Step 2: measure every architecture direct connection and record it as an override on top of the base model.
+    compute_override_delay_model_(route_profiler, router_opts);
+}
+
+void OverrideDelayModel::compute_override_delay_model_(RouterDelayProfiler& route_profiler,
+                                                       const t_router_opts& router_opts) {
+    const auto& device_ctx = g_vpr_ctx.device();
+    t_router_opts router_opts2 = router_opts; // private copy: the caller's options are left untouched
+    router_opts2.astar_fac = 0.f; // NOTE(review): presumably switches off the A* heuristic for the profiling routes -- confirm
+    router_opts2.astar_offset = 0.f;
+
+    // Walk every direct connection declared by the architecture, measure its delay and register it as an override
+    for (int idirect = 0; idirect < (int)device_ctx.arch->directs.size(); ++idirect) {
+        const t_direct_inf* direct = &device_ctx.arch->directs[idirect];
+
+        InstPort from_port = parse_inst_port(direct->from_pin);
+        InstPort to_port = parse_inst_port(direct->to_pin);
+
+        t_physical_tile_type_ptr from_type = find_tile_type_by_name(from_port.instance_name(), device_ctx.physical_tile_types);
+        t_physical_tile_type_ptr to_type = find_tile_type_by_name(to_port.instance_name(), device_ctx.physical_tile_types);
+
+        int num_conns = from_port.port_high_index() - from_port.port_low_index() + 1;
+        VTR_ASSERT_MSG(num_conns == to_port.port_high_index() - to_port.port_low_index() + 1, "Directs must have the same size to/from");
+
+        //We now walk through all the connections associated with the current direct specification, measure
+        //their delay and specify that value as an override in the delay model.
+        //
+        //Note that we need to check every connection in the direct to cover the case where the pins are not
+        //equivalent.
+        //
+        //However, if the from/to ports are equivalent we could end up sampling the same RR SOURCE/SINK
+        //paths multiple times (wasting CPU time) -- we avoid this by recording the sampled paths in
+        //sampled_rr_pairs and skipping them if they occur multiple times.
+        int missing_instances = 0; // bits of this direct for which no sample locations were found
+        int missing_paths = 0; // bits of this direct for which the profiler found no routing path
+        std::set<std::pair<RRNodeId, RRNodeId>> sampled_rr_pairs;
+        for (int iconn = 0; iconn < num_conns; ++iconn) {
+            //Find the associated pins
+            int from_pin = from_type->find_pin(from_port.port_name(), from_port.port_low_index() + iconn);
+            int to_pin = to_type->find_pin(to_port.port_name(), to_port.port_low_index() + iconn);
+
+            VTR_ASSERT(from_pin != OPEN);
+            VTR_ASSERT(to_pin != OPEN);
+
+            int from_pin_class = from_type->find_pin_class(from_port.port_name(), from_port.port_low_index() + iconn, DRIVER);
+            VTR_ASSERT(from_pin_class != OPEN);
+
+            int to_pin_class = to_type->find_pin_class(to_port.port_name(), to_port.port_low_index() + iconn, RECEIVER);
+            VTR_ASSERT(to_pin_class != OPEN);
+
+            bool found_sample_points;
+            RRNodeId src_rr, sink_rr;
+            found_sample_points = find_direct_connect_sample_locations(direct, from_type, from_pin, from_pin_class, to_type, to_pin, to_pin_class, src_rr, sink_rr);
+
+            if (!found_sample_points) {
+                ++missing_instances;
+                continue;
+            }
+
+            //If some of the source/sink ports are logically equivalent we may have already
+            //sampled the associated source/sink pair and don't need to do so again
+            if (sampled_rr_pairs.count({src_rr, sink_rr})) continue;
+
+            float direct_connect_delay = std::numeric_limits<float>::quiet_NaN(); // NaN placeholder; only consumed below when a path was found
+            bool found_routing_path = route_profiler.calculate_delay(src_rr, sink_rr, router_opts2, &direct_connect_delay);
+
+            if (found_routing_path) {
+                set_delay_override(from_type->index, from_pin_class, to_type->index, to_pin_class, direct->x_offset, direct->y_offset, direct_connect_delay);
+            } else {
+                ++missing_paths;
+            }
+
+            //Record that we've sampled this pair of source and sink nodes (whether or not a path was found)
+            sampled_rr_pairs.insert({src_rr, sink_rr});
+        }
+
+        VTR_LOGV_WARN(missing_instances > 0, "Found no delta delay for %d bits of inter-block direct connect '%s' (no instances of this direct found)\n", missing_instances, direct->name.c_str());
+        VTR_LOGV_WARN(missing_paths > 0, "Found no delta delay for %d bits of inter-block direct connect '%s' (no routing path found)\n", missing_paths, direct->name.c_str());
+    }
+}
+
+const DeltaDelayModel* OverrideDelayModel::base_delay_model() const {
+    return base_delay_model_.get(); // non-owning view; null until compute(), read() or set_base_delay_model() has run
+}
+
+float OverrideDelayModel::delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const {
+    // Look up the physical tile types occupying the two endpoints.
+    const auto& device_ctx = g_vpr_ctx.device();
+    const auto& grid = device_ctx.grid;
+    t_physical_tile_type_ptr src_tile = grid.get_physical_type(from_loc);
+    t_physical_tile_type_ptr dst_tile = grid.get_physical_type(to_loc);
+
+    // The signed displacement is part of the key on purpose: an override
+    // may differ between the +delta and -delta directions, so no abs() here.
+    const int dx = to_loc.x - from_loc.x;
+    const int dy = to_loc.y - from_loc.y;
+
+    // Assemble the key: (tile type, pin class) for each end plus the displacement.
+    t_override key;
+    key.from_type = src_tile->index;
+    key.from_class = src_tile->pin_class[from_pin];
+    key.to_type = dst_tile->index;
+    key.to_class = dst_tile->pin_class[to_pin];
+    key.delta_x = dx;
+    key.delta_y = dy;
+
+    // An explicit override, when present, takes precedence.
+    const auto hit = delay_overrides_.find(key);
+    if (hit != delay_overrides_.end()) {
+        return hit->second;
+    }
+
+    // Otherwise defer to the distance-based base model; base_delay_model_
+    // must have been set up by compute(), read() or set_base_delay_model().
+    return base_delay_model_->delay(from_loc, from_pin, to_loc, to_pin);
+}
+
+void OverrideDelayModel::set_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y, float delay_val) {
+    // Pack the arguments into the map key.
+    t_override key;
+    key.from_type = from_type;
+    key.from_class = from_class;
+    key.to_type = to_type;
+    key.to_class = to_class;
+    key.delta_x = delta_x;
+    key.delta_y = delta_y;
+    // When the key is already stored the insert is reported as not performed, so the old delay is overwritten by hand.
+    auto outcome = delay_overrides_.insert(std::make_pair(key, delay_val));
+    const bool key_was_present = !outcome.second;
+    if (key_was_present) outcome.first->second = delay_val;
+}
+
+void OverrideDelayModel::dump_echo(std::string filepath) const {
+    // The base model writes its own section first; the overrides are appended below it.
+    base_delay_model_->dump_echo(filepath);
+
+    const auto& tile_types = g_vpr_ctx.device().physical_tile_types;
+    FILE* echo_file = vtr::fopen(filepath.c_str(), "a");
+
+    fprintf(echo_file, "\n");
+    fprintf(echo_file, "# Delay Overrides\n");
+    for (const auto& entry : delay_overrides_) {
+        const t_override& key = entry.first;
+        fprintf(echo_file, "from_type: %s to_type: %s from_pin_class: %d to_pin_class: %d delta_x: %d delta_y: %d -> delay: %g\n",
+                tile_types[key.from_type].name.c_str(),
+                tile_types[key.to_type].name.c_str(),
+                key.from_class,
+                key.to_class,
+                key.delta_x,
+                key.delta_y,
+                entry.second);
+    }
+
+    vtr::fclose(echo_file);
+}
+
+float OverrideDelayModel::get_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y) const {
+    t_override wanted;
+    wanted.from_type = from_type;
+    wanted.from_class = from_class;
+    wanted.to_type = to_type;
+    wanted.to_class = to_class;
+    wanted.delta_x = delta_x;
+    wanted.delta_y = delta_y;
+    // Unlike delay(), there is no fall-back to the base model here: a missing override is reported as an error.
+    const auto found = delay_overrides_.find(wanted);
+    if (found == delay_overrides_.end()) {
+        VPR_THROW(VPR_ERROR_PLACE, "Key not found.");
+    }
+    return found->second;
+}
+
+void OverrideDelayModel::set_base_delay_model(std::unique_ptr<DeltaDelayModel> base_delay_model_obj) {
+    base_delay_model_ = std::move(base_delay_model_obj); // takes ownership; any previously held base model is destroyed
+}
+
+void OverrideDelayModel::read(const std::string& file) {
+#ifndef VTR_ENABLE_CAPNPROTO
+    VPR_THROW(VPR_ERROR_PLACE,
+              "OverrideDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
+#else
+    MmapFile f(file);
+    // Everything below decodes the mapped Cap'n Proto message into the base model and the override table.
+    /* Increase reader limit to 1G words to allow for large files. */
+    ::capnp::ReaderOptions opts = default_large_capnp_opts();
+    ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
+
+    auto toFloat = [](float* out, const VprFloatEntry::Reader& in) -> void {
+        *out = in.getValue();
+    };
+
+    vtr::NdMatrix<float, 4> delays;
+    auto model = reader.getRoot<VprOverrideDelayModel>();
+    ToNdMatrix<4, VprFloatEntry, float>(&delays, model.getDelays(), toFloat);
+    // `delays` is not used again, so hand it to the base model instead of deep-copying the whole matrix.
+    base_delay_model_ = std::make_unique<DeltaDelayModel>(cross_layer_delay_, std::move(delays), is_flat_);
+
+    // Reading non-scalar capnproto fields is roughly equivalent to using
+    // a std::vector of the field type.  Actual type is capnp::List<X>::Reader.
+    auto overrides = model.getDelayOverrides();
+    std::vector<std::pair<t_override, float> > overrides_arr(overrides.size());
+    for (size_t i = 0; i < overrides.size(); ++i) {
+        const auto& elem = overrides[i];
+        overrides_arr[i].first.from_type = elem.getFromType();
+        overrides_arr[i].first.to_type = elem.getToType();
+        overrides_arr[i].first.from_class = elem.getFromClass();
+        overrides_arr[i].first.to_class = elem.getToClass();
+        overrides_arr[i].first.delta_x = elem.getDeltaX();
+        overrides_arr[i].first.delta_y = elem.getDeltaY();
+
+        overrides_arr[i].second = elem.getDelay();
+    }
+    // Replace (not merge with) any overrides currently held by this object.
+    delay_overrides_ = vtr::make_flat_map2(std::move(overrides_arr));
+#endif
+}
+
+void OverrideDelayModel::write(const std::string& file) const {
+#ifndef VTR_ENABLE_CAPNPROTO
+    VPR_THROW(VPR_ERROR_PLACE,
+              "OverrideDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
+#else
+    ::capnp::MallocMessageBuilder builder;
+    auto model = builder.initRoot<VprOverrideDelayModel>();
+    // Serialises the base model's delay matrix followed by one list entry per delay override.
+    auto fromFloat = [](VprFloatEntry::Builder* out, const float& in) -> void {
+        out->setValue(in);
+    };
+    // base_delay_model_ must already be set: its delays() matrix is dereferenced below.
+    auto delays = model.getDelays();
+    FromNdMatrix<4, VprFloatEntry, float>(&delays, base_delay_model_->delays(), fromFloat);
+
+    // Non-scalar capnproto fields should be first initialized with
+    // init<field  name>(count), and then accessed from the returned
+    // std::vector-like Builder object (specifically capnp::List<X>::Builder).
+    auto overrides = model.initDelayOverrides(delay_overrides_.size());
+    auto dst_iter = overrides.begin();
+    for (const auto& src : delay_overrides_) {
+        auto elem = *dst_iter++;
+        elem.setFromType(src.first.from_type);
+        elem.setToType(src.first.to_type);
+        elem.setFromClass(src.first.from_class);
+        elem.setToClass(src.first.to_class);
+        elem.setDeltaX(src.first.delta_x);
+        elem.setDeltaY(src.first.delta_y);
+
+        elem.setDelay(src.second);
+    }
+
+    writeMessageToFile(file, &builder);
+#endif
+}
+
diff --git a/vpr/src/place/delay_model/override_delay_model.h b/vpr/src/place/delay_model/override_delay_model.h
new file mode 100644
index 00000000000..5965261c272
--- /dev/null
+++ b/vpr/src/place/delay_model/override_delay_model.h
@@ -0,0 +1,112 @@
+
+#pragma once
+
+#include "place_delay_model.h"
+#include "delta_delay_model.h"
+
+class OverrideDelayModel : public PlaceDelayModel { // A DeltaDelayModel base plus explicit per-connection delay overrides
+  public:
+    OverrideDelayModel(float min_cross_layer_delay,
+                       bool is_flat)
+        : cross_layer_delay_(min_cross_layer_delay)
+        , is_flat_(is_flat) {}
+
+    void compute(RouterDelayProfiler& route_profiler,
+                 const t_placer_opts& placer_opts,
+                 const t_router_opts& router_opts,
+                 int longest_length) override;
+
+    /**
+     * @brief Returns the override delay stored for the endpoints' (tile type, pin class) pair and signed
+     * (delta_x, delta_y) displacement, or the base model's delay when no such override exists
+     */
+    float delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const override;
+
+    void dump_echo(std::string filepath) const override;
+
+    void read(const std::string& file) override;
+    void write(const std::string& file) const override;
+
+  public: //Accessors and mutators
+    void set_base_delay_model(std::unique_ptr<DeltaDelayModel> base_delay_model);
+    const DeltaDelayModel* base_delay_model() const;
+    float get_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y) const;
+    void set_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y, float delay);
+
+  private:
+    std::unique_ptr<DeltaDelayModel> base_delay_model_; // Owned base model; null until compute(), read() or set_base_delay_model() runs
+    /// Minimum delay of cross-layer connections
+    float cross_layer_delay_;
+
+    /// Indicates whether the router is a two-stage or run-flat
+    bool is_flat_;
+    /// Measures each architecture direct connection and stores its delay in delay_overrides_.
+    void compute_override_delay_model_(RouterDelayProfiler& router,
+                                       const t_router_opts& router_opts);
+
+    /**
+     * @brief Structure that allows delays to be queried from the delay model.
+     *
+     * Delay is calculated given the origin physical tile, the origin
+     * pin, the destination physical tile, and the destination pin.
+     * This structure encapsulates all of this information.
+     *
+     *   @param from_type, to_type
+     *              Physical tile index (for easy array access)
+     *   @param from_class, to_class
+     *              The class that the pins belongs to.
+     *   @param delta_x, delta_y
+     *              The signed horizontal and vertical displacement
+     *              between two physical tiles.
+     */
+    struct t_override {
+        short from_type;
+        short to_type;
+        short from_class;
+        short to_class;
+        short delta_x;
+        short delta_y;
+
+        /**
+         * @brief Comparison operator designed for performance.
+         *
+         * Operator< is important since t_override serves as the key into the
+         * map structure delay_overrides_. A default comparison operator would
+         * not be inlined by the compiler.
+         *
+         * A combination of ALWAYS_INLINE attribute and std::lexicographical_compare
+         * is required for operator< to be inlined by compiler. Proper inlining of
+         * the function reduces place time by around 5%.
+         *
+         * For more information: https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1225
+         */
+        friend ALWAYS_INLINE bool operator<(const t_override& lhs, const t_override& rhs) {
+            const short* left = reinterpret_cast<const short*>(&lhs); // views the struct as a flat array of shorts (layout checked by the static_asserts below)
+            const short* right = reinterpret_cast<const short*>(&rhs);
+            constexpr size_t NUM_T_OVERRIDE_MEMBERS = sizeof(t_override) / sizeof(short);
+            return std::lexicographical_compare(left, left + NUM_T_OVERRIDE_MEMBERS, right, right + NUM_T_OVERRIDE_MEMBERS);
+        }
+    };
+
+    /**
+     * @brief Map data structure that returns delay values according to
+     *        specific delay model queries.
+     *
+     * Delay model queries are provided by the t_override structure, which
+     * encapsulates the information regarding the origin and the destination.
+     */
+    vtr::flat_map2<t_override, float> delay_overrides_;
+
+    /**
+     * operator< treats memory layout of t_override as an array of short.
+     * This requires all members of t_override are shorts and there is no
+     * padding between members of t_override.
+     */
+    static_assert(sizeof(t_override) == sizeof(t_override::from_type) + sizeof(t_override::to_type) + sizeof(t_override::from_class) + sizeof(t_override::to_class) + sizeof(t_override::delta_x) + sizeof(t_override::delta_y), "Expect t_override to have a memory layout equivalent to an array of short (no padding)");
+    static_assert(sizeof(t_override::from_type) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::to_type) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::from_class) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::to_class) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::delta_x) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::delta_y) == sizeof(short), "Expect all t_override data members to be shorts");
+};
\ No newline at end of file
diff --git a/vpr/src/place/delay_model/place_delay_model.cpp b/vpr/src/place/delay_model/place_delay_model.cpp
new file mode 100644
index 00000000000..04267e0e5f1
--- /dev/null
+++ b/vpr/src/place/delay_model/place_delay_model.cpp
@@ -0,0 +1,78 @@
+/**
+ * @file place_delay_model.cpp
+ * @brief This file implements all the class methods and individual
+ *        routines related to the placer delay model.
+ */
+
+#include "place_delay_model.h"
+
+#include "globals.h"
+#include "router_lookahead_map.h"
+#include "placer_state.h"
+#include "vpr_error.h"
+
+/**
+ * @brief Returns the delay of one point to point connection (driver of net_id to its pin ipin).
+ *
+ * Only estimate delay for signals routed through the inter-block routing network.
+ * TODO: How should we compute the delay for globals? "Global signals are assumed to have zero delay."
+ */
+float comp_td_single_connection_delay(const PlaceDelayModel* delay_model,
+                                      const vtr::vector_map<ClusterBlockId, t_block_loc>& block_locs,
+                                      ClusterNetId net_id,
+                                      int ipin) {
+    const auto& cluster_ctx = g_vpr_ctx.clustering();
+
+    float delay_source_to_sink = 0.;
+
+    if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {
+        ClusterPinId source_pin = cluster_ctx.clb_nlist.net_driver(net_id);
+        ClusterPinId sink_pin = cluster_ctx.clb_nlist.net_pin(net_id, ipin);
+
+        ClusterBlockId source_block = cluster_ctx.clb_nlist.pin_block(source_pin);
+        ClusterBlockId sink_block = cluster_ctx.clb_nlist.pin_block(sink_pin);
+
+        int source_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(source_pin);
+        int sink_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(sink_pin);
+
+        t_pl_loc source_block_loc = block_locs[source_block].loc;
+        t_pl_loc sink_block_loc = block_locs[sink_block].loc;
+
+        /**
+         * This heuristic only considers delta_x and delta_y, a much better
+         * heuristic would be to create a more comprehensive lookup table.
+         *
+         * In particular this approach does not accurately capture the effect
+         * of fast carry-chain connections.
+         */
+        delay_source_to_sink = delay_model->delay({source_block_loc.x, source_block_loc.y, source_block_loc.layer}, source_block_ipin,
+                                                  {sink_block_loc.x, sink_block_loc.y, sink_block_loc.layer}, sink_block_ipin);
+        if (delay_source_to_sink < 0) {
+            VPR_ERROR(VPR_ERROR_PLACE,
+                      "in comp_td_single_connection_delay: Bad delay_source_to_sink value %g from %s (at %d,%d,%d) to %s (at %d,%d,%d)\n"
+                      "in comp_td_single_connection_delay: Delay is less than 0\n",
+                      delay_source_to_sink, // must come first: it feeds the leading %g of the format string
+                      block_type_pin_index_to_name(physical_tile_type(source_block_loc), source_block_ipin, false).c_str(),
+                      source_block_loc.x, source_block_loc.y, source_block_loc.layer,
+                      block_type_pin_index_to_name(physical_tile_type(sink_block_loc), sink_block_ipin, false).c_str(),
+                      sink_block_loc.x, sink_block_loc.y, sink_block_loc.layer);
+        }
+    }
+
+    return (delay_source_to_sink);
+}
+
+///@brief Refreshes every point to point delay stored in the placer's `connection_delay` matrix.
+void comp_td_connection_delays(const PlaceDelayModel* delay_model,
+                               PlacerState& placer_state) {
+    const auto& netlist = g_vpr_ctx.clustering().clb_nlist;
+    auto& net_sink_delays = placer_state.mutable_timing().connection_delay;
+    auto& locs = placer_state.block_locs();
+    // Pin index 0 of each net is skipped (presumably the driver); every remaining pin gets a delay entry.
+    for (ClusterNetId net : netlist.nets()) {
+        const size_t pin_count = netlist.net_pins(net).size();
+        for (size_t sink = 1; sink < pin_count; ++sink) {
+            net_sink_delays[net][sink] = comp_td_single_connection_delay(delay_model, locs, net, sink);
+        }
+    }
+}
diff --git a/vpr/src/place/delay_model/place_delay_model.h b/vpr/src/place/delay_model/place_delay_model.h
new file mode 100644
index 00000000000..27c89591071
--- /dev/null
+++ b/vpr/src/place/delay_model/place_delay_model.h
@@ -0,0 +1,80 @@
+/**
+ * @file place_delay_model.h
+ * @brief This file contains all the class and function declarations related to
+ *        the placer delay model. For implementations, see place_delay_model.cpp.
+ */
+
+#pragma once
+
+#include "vtr_ndmatrix.h"
+#include "vtr_flat_map.h"
+#include "vpr_types.h"
+#include "router_delay_profiling.h"
+
+#ifndef __has_attribute
+#    define __has_attribute(x) 0 // Fallback for compilers without __has_attribute: every attribute is reported as unsupported.
+#endif
+// ALWAYS_INLINE: strongest inlining request available for the compiler/build type, falling back to plain `inline`.
+#if defined(COMPILER_GCC) && defined(NDEBUG)
+#    define ALWAYS_INLINE inline __attribute__((__always_inline__))
+#elif defined(COMPILER_MSVC) && defined(NDEBUG)
+#    define ALWAYS_INLINE __forceinline
+#elif __has_attribute(always_inline)
+#    define ALWAYS_INLINE __attribute__((always_inline)) // clang; NOTE(review): not gated on NDEBUG and carries no `inline` keyword, unlike the other branches
+#else
+#    define ALWAYS_INLINE inline
+#endif
+
+///@brief Forward declarations (PlaceDelayModel is defined further down in this header).
+class PlaceDelayModel;
+class PlacerState;
+
+///@brief Returns the delay from the driver of `net_id` to its pin `ipin` as estimated by `delay_model`; ignored nets yield 0.
+float comp_td_single_connection_delay(const PlaceDelayModel* delay_model,
+                                      const vtr::vector_map<ClusterBlockId, t_block_loc>& block_locs,
+                                      ClusterNetId net_id,
+                                      int ipin);
+
+///@brief Recompute all point to point delays, updating the `connection_delay` matrix held by `placer_state`.
+void comp_td_connection_delays(const PlaceDelayModel* delay_model,
+                               PlacerState& placer_state);
+
+///@brief Abstract interface to a placement delay model (implemented by DeltaDelayModel, OverrideDelayModel and SimpleDelayModel).
+class PlaceDelayModel {
+  public:
+    virtual ~PlaceDelayModel() = default; // virtual so implementations can be destroyed through a PlaceDelayModel pointer
+
+    ///@brief Computes place delay model. Implementations may ignore some arguments (SimpleDelayModel only names route_profiler).
+    virtual void compute(RouterDelayProfiler& route_profiler,
+                         const t_placer_opts& placer_opts,
+                         const t_router_opts& router_opts,
+                         int longest_length)
+        = 0;
+
+    /**
+     * @brief Returns the delay estimate between the specified block pins.
+     * from_loc/to_loc locate the two tiles; from_pin/to_pin are pin indices on them (not every model uses the pins).
+     * Either compute or read methods must be invoked before invoking delay.
+     */
+    virtual float delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const = 0;
+
+    ///@brief Dumps the delay model to an echo file. The path is taken by value (overrides name the parameter `filepath`).
+    virtual void dump_echo(std::string filename) const = 0;
+
+    /**
+     * @brief Write place delay model to specified file.
+     *
+     * May be unimplemented, in which case method should throw an exception.
+     */
+    virtual void write(const std::string& file) const = 0;
+
+    /**
+     * @brief Read place delay model from specified file.
+     *
+     * May be unimplemented, in which case method should throw an exception.
+     */
+    virtual void read(const std::string& file) = 0;
+};
+
+
+
diff --git a/vpr/src/place/delay_model/simple_delay_model.cpp b/vpr/src/place/delay_model/simple_delay_model.cpp
new file mode 100644
index 00000000000..1fcd86eca64
--- /dev/null
+++ b/vpr/src/place/delay_model/simple_delay_model.cpp
@@ -0,0 +1,130 @@
+
+#include "simple_delay_model.h"
+
+#ifdef VTR_ENABLE_CAPNPROTO
+#    include "capnp/serialize.h"
+#    include "place_delay_model.capnp.h"
+#    include "ndmatrix_serdes.h"
+#    include "mmap_file.h"
+#    include "serdes_utils.h"
+#endif  // VTR_ENABLE_CAPNPROTO
+
+void SimpleDelayModel::compute(RouterDelayProfiler& route_profiler,
+                               const t_placer_opts& /*placer_opts*/,
+                               const t_router_opts& /*router_opts*/,
+                               int /*longest_length*/) {
+    const auto& device_ctx = g_vpr_ctx.device();
+    const auto& grid = device_ctx.grid;
+    const size_t tile_type_count = device_ctx.physical_tile_types.size();
+    const size_t layer_count = grid.get_num_layers();
+    const size_t grid_width = grid.width();
+    const size_t grid_height = grid.height();
+
+    // The table is indexed [physical tile type][source layer][sink layer][dx][dy];
+    // both layers are kept (rather than a layer delta) so each layer pair has its own entries.
+    delays_ = vtr::NdMatrix<float, 5>({tile_type_count,
+                                       layer_count,
+                                       layer_count,
+                                       grid_width,
+                                       grid_height});
+
+    // Fill every cell with the minimum delay reported by the router delay profiler.
+    for (size_t tile_type = 0; tile_type < tile_type_count; ++tile_type) {
+        for (size_t src_layer = 0; src_layer < layer_count; ++src_layer) {
+            for (size_t dst_layer = 0; dst_layer < layer_count; ++dst_layer) {
+                for (size_t dx = 0; dx < grid_width; ++dx) {
+                    for (size_t dy = 0; dy < grid_height; ++dy) {
+                        const float cell_delay = route_profiler.get_min_delay(tile_type, src_layer, dst_layer, dx, dy);
+                        delays_[tile_type][src_layer][dst_layer][dx][dy] = cell_delay;
+                    }
+                }
+            }
+        }
+    }
+}
+
+float SimpleDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const {
+    // Only the magnitude of the displacement is used; the source tile's type selects the table slice.
+    const int src_tile_type = g_vpr_ctx.device().grid.get_physical_type(from_loc)->index;
+    const int abs_dx = std::abs(from_loc.x - to_loc.x);
+    const int abs_dy = std::abs(from_loc.y - to_loc.y);
+    return delays_[src_tile_type][from_loc.layer_num][to_loc.layer_num][abs_dx][abs_dy];
+}
+
+void SimpleDelayModel::read(const std::string& file) {
+#ifndef VTR_ENABLE_CAPNPROTO
+    VPR_THROW(VPR_ERROR_PLACE,
+              "SimpleDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
+#else
+    // MmapFile object creates an mmap of the specified path, and will munmap
+    // when the object leaves scope.
+    MmapFile f(file);
+
+    /* Increase reader limit to 1G words to allow for large files. */
+    ::capnp::ReaderOptions opts = default_large_capnp_opts();
+
+    // FlatArrayMessageReader is used to read the message from the data array
+    // provided by MmapFile.
+    ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
+
+    // When reading capnproto files the Reader object to use is named
+    // <schema name>::Reader.
+    //
+    // Initially this object is an empty VprDeltaDelayModel.
+    VprDeltaDelayModel::Reader model;
+
+    // The reader.getRoot performs a cast from the generic capnproto to fit
+    // with the specified schema.
+    //
+    // Note that capnproto does not validate that the incoming data matches the
+    // schema.  If this property is required, some form of check would be
+    // required.
+    model = reader.getRoot<VprDeltaDelayModel>();
+
+    auto toFloat = [](float* out, const VprFloatEntry::Reader& in) -> void {
+        *out = in.getValue();
+    };
+
+    // ToNdMatrix is a generic function for converting a Matrix capnproto
+    // to a vtr::NdMatrix.
+    //
+    // The user must supply the matrix dimension (5 in this case), the source
+    // capnproto type (VprFloatEntry),
+    // target C++ type (float), and a function to convert from the source capnproto
+    // type to the target C++ type (toFloat).
+    //
+    // The second argument should be of type Matrix<X>::Reader where X is the
+    // capnproto element type.
+    ToNdMatrix<5, VprFloatEntry, float>(&delays_, model.getDelays(), toFloat);
+#endif
+}
+
+void SimpleDelayModel::write(const std::string& file) const {
+#ifndef VTR_ENABLE_CAPNPROTO
+    VPR_THROW(VPR_ERROR_PLACE,
+              "SimpleDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
+#else
+    // MallocMessageBuilder object generates capnproto message builder,
+    // using malloc for buffer allocation.
+    ::capnp::MallocMessageBuilder builder;
+
+    // initRoot<X> returns a X::Builder object that can be used to set the
+    // fields in the message.
+    auto model = builder.initRoot<VprDeltaDelayModel>();
+
+    auto fromFloat = [](VprFloatEntry::Builder* out, const float& in) -> void {
+        out->setValue(in);
+    };
+
+    // FromNdMatrix is a generic function for converting a vtr::NdMatrix to a
+    // Matrix message.  It is the mirror function of ToNdMatrix described in
+    // read above.
+    auto delay_values = model.getDelays();
+    FromNdMatrix<5, VprFloatEntry, float>(&delay_values, delays_, fromFloat);
+
+    // writeMessageToFile writes message to the specified file.
+    writeMessageToFile(file, &builder);
+#endif
+}
diff --git a/vpr/src/place/delay_model/simple_delay_model.h b/vpr/src/place/delay_model/simple_delay_model.h
new file mode 100644
index 00000000000..25dce08c4fc
--- /dev/null
+++ b/vpr/src/place/delay_model/simple_delay_model.h
@@ -0,0 +1,39 @@
+
+#pragma once
+
+#include "place_delay_model.h"
+
+/**
+ * @class SimpleDelayModel
+ * @brief A simple delay model based on the information stored in router lookahead
+ * This is in contrast to other placement delay models that get the cost of getting from one location to another by running the router
+ */
+class SimpleDelayModel : public PlaceDelayModel {
+  public:
+    SimpleDelayModel() {}
+
+    /// @brief Use the information in the router lookahead to fill the delay matrix instead of running the router
+    void compute(RouterDelayProfiler& router,
+                 const t_placer_opts& placer_opts,
+                 const t_router_opts& router_opts,
+                 int longest_length) override;
+
+    float delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const override;
+
+    void dump_echo(std::string /*filepath*/) const override {}
+
+    void read(const std::string& /*file*/) override;
+    void write(const std::string& /*file*/) const override;
+
+  private:
+    /**
+     * @brief The matrix to store the minimum delay between different points on different layers.
+     *
+     *The matrix used to store delay information is a 5D matrix. This data structure stores the minimum delay for each tile type on each layer to other layers
+     *for each dx and dy. We decided to separate the delay for each physical type on each die to accommodate cases where the connectivity of a physical type differs
+     *on each layer. Additionally, instead of using d_layer, we distinguish between the destination layer to handle scenarios where connectivity between layers
+     *is not uniform. For example, if the number of inter-layer connections between layer 1 and 2 differs from the number of connections between layer 0 and 1.
+     *One might argue that this variability could also occur for dx and dy. However, we are operating under the assumption that the FPGA fabric architecture is regular.
+     */
+    vtr::NdMatrix<float, 5> delays_; // [0..num_physical_type-1][0..num_layers-1][0..num_layers-1][0..max_dx][0..max_dy]
+};
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/override_delay_model.cpp b/vpr/src/place/timing/delay_model/override_delay_model.cpp
index 61acd2937b5..6cbb2c7f654 100644
--- a/vpr/src/place/timing/delay_model/override_delay_model.cpp
+++ b/vpr/src/place/timing/delay_model/override_delay_model.cpp
@@ -245,7 +245,7 @@ void OverrideDelayModel::write(const std::string& file) const {
 #ifndef VTR_ENABLE_CAPNPROTO
     VPR_THROW(VPR_ERROR_PLACE,
               "OverrideDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.\");
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
 #else
     ::capnp::MallocMessageBuilder builder;
     auto model = builder.initRoot<VprOverrideDelayModel>();
diff --git a/vpr/src/place/timing/delay_model/simple_delay_model.cpp b/vpr/src/place/timing/delay_model/simple_delay_model.cpp
index 1fcd86eca64..dac18890366 100644
--- a/vpr/src/place/timing/delay_model/simple_delay_model.cpp
+++ b/vpr/src/place/timing/delay_model/simple_delay_model.cpp
@@ -55,7 +55,7 @@ void SimpleDelayModel::read(const std::string& file) {
 #ifndef VTR_ENABLE_CAPNPROTO
     VPR_THROW(VPR_ERROR_PLACE,
               "SimpleDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.\");
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
 #else
     // MmapFile object creates an mmap of the specified path, and will munmap
     // when the object leaves scope.
@@ -104,7 +104,7 @@ void SimpleDelayModel::write(const std::string& file) const {
 #ifndef VTR_ENABLE_CAPNPROTO
     VPR_THROW(VPR_ERROR_PLACE,
               "SimpleDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.\");
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
 #else
     // MallocMessageBuilder object generates capnproto message builder,
     // using malloc for buffer allocation.

From 94cfd6ff2a1403e6315608cd405fe2973bc09820 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Sun, 19 Jan 2025 17:07:35 -0500
Subject: [PATCH 33/39] delete duplicate files

---
 .../PlacementDelayModelCreator.cpp            |  80 --
 .../delay_model/PlacementDelayModelCreator.h  |  30 -
 .../compute_delta_delays_utils.cpp            | 968 ------------------
 .../delay_model/compute_delta_delays_utils.h  |  56 -
 .../timing/delay_model/delta_delay_model.cpp  | 135 ---
 .../timing/delay_model/delta_delay_model.h    |  47 -
 .../delay_model/override_delay_model.cpp      | 280 -----
 .../timing/delay_model/override_delay_model.h | 112 --
 .../timing/delay_model/place_delay_model.cpp  |  78 --
 .../timing/delay_model/place_delay_model.h    |  80 --
 .../timing/delay_model/simple_delay_model.cpp | 130 ---
 .../timing/delay_model/simple_delay_model.h   |  39 -
 12 files changed, 2035 deletions(-)
 delete mode 100644 vpr/src/place/timing/delay_model/PlacementDelayModelCreator.cpp
 delete mode 100644 vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h
 delete mode 100644 vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
 delete mode 100644 vpr/src/place/timing/delay_model/compute_delta_delays_utils.h
 delete mode 100644 vpr/src/place/timing/delay_model/delta_delay_model.cpp
 delete mode 100644 vpr/src/place/timing/delay_model/delta_delay_model.h
 delete mode 100644 vpr/src/place/timing/delay_model/override_delay_model.cpp
 delete mode 100644 vpr/src/place/timing/delay_model/override_delay_model.h
 delete mode 100644 vpr/src/place/timing/delay_model/place_delay_model.cpp
 delete mode 100644 vpr/src/place/timing/delay_model/place_delay_model.h
 delete mode 100644 vpr/src/place/timing/delay_model/simple_delay_model.cpp
 delete mode 100644 vpr/src/place/timing/delay_model/simple_delay_model.h

diff --git a/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.cpp b/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.cpp
deleted file mode 100644
index 3482cd091e0..00000000000
--- a/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-
-
-#include "PlacementDelayModelCreator.h"
-
-#include "place_delay_model.h"
-#include "simple_delay_model.h"
-#include "delta_delay_model.h"
-#include "override_delay_model.h"
-
-#include "vtr_time.h"
-#include "physical_types.h"
-#include "place_and_route.h"
-
-static int get_longest_segment_length(std::vector<t_segment_inf>& segment_inf) {
-    int length = 0;
-
-    for (const t_segment_inf& seg_info : segment_inf) {
-        if (seg_info.length > length) {
-            length = seg_info.length;
-        }
-    }
-
-    return length;
-}
-
-std::unique_ptr<PlaceDelayModel>
-PlacementDelayModelCreator::create_delay_model(const t_placer_opts& placer_opts,
-                                               const t_router_opts& router_opts,
-                                               const Netlist<>& net_list,
-                                               t_det_routing_arch* det_routing_arch,
-                                               std::vector<t_segment_inf>& segment_inf,
-                                               t_chan_width_dist chan_width_dist,
-                                               const std::vector<t_direct_inf>& directs,
-                                               bool is_flat) {
-    vtr::ScopedStartFinishTimer timer("Computing placement delta delay look-up");
-
-    t_chan_width chan_width = setup_chan_width(router_opts, chan_width_dist);
-
-    alloc_routing_structs(chan_width, router_opts, det_routing_arch, segment_inf, directs, is_flat);
-
-    const RouterLookahead* router_lookahead = get_cached_router_lookahead(*det_routing_arch,
-                                                                          router_opts.lookahead_type,
-                                                                          router_opts.write_router_lookahead,
-                                                                          router_opts.read_router_lookahead,
-                                                                          segment_inf,
-                                                                          is_flat);
-
-    RouterDelayProfiler route_profiler(net_list, router_lookahead, is_flat);
-
-    int longest_length = get_longest_segment_length(segment_inf);
-
-    // now setup and compute the actual arrays
-    std::unique_ptr<PlaceDelayModel> place_delay_model;
-    float min_cross_layer_delay = get_min_cross_layer_delay();
-
-    if (placer_opts.delay_model_type == PlaceDelayModelType::SIMPLE) {
-        place_delay_model = std::make_unique<SimpleDelayModel>();
-    } else if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA) {
-        place_delay_model = std::make_unique<DeltaDelayModel>(min_cross_layer_delay, is_flat);
-    } else if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA_OVERRIDE) {
-        place_delay_model = std::make_unique<OverrideDelayModel>(min_cross_layer_delay, is_flat);
-    } else {
-        VTR_ASSERT_MSG(false, "Invalid placer delay model");
-    }
-
-    if (placer_opts.read_placement_delay_lookup.empty()) {
-        place_delay_model->compute(route_profiler, placer_opts, router_opts, longest_length);
-    } else {
-        place_delay_model->read(placer_opts.read_placement_delay_lookup);
-    }
-
-    if (!placer_opts.write_placement_delay_lookup.empty()) {
-        place_delay_model->write(placer_opts.write_placement_delay_lookup);
-    }
-
-    // free all data structures that are no longer needed
-    free_routing_structs();
-
-    return place_delay_model;
-}
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h b/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h
deleted file mode 100644
index c92b67d4854..00000000000
--- a/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h
+++ /dev/null
@@ -1,30 +0,0 @@
-
-#pragma once
-
-#include <memory>
-#include <vector>
-
-#include "netlist.h"
-
-class PlaceDelayModel;
-struct t_placer_opts;
-struct t_router_opts;
-struct t_det_routing_arch;
-struct t_segment_inf;
-struct t_chan_width_dist;
-struct t_direct_inf;
-
-class PlacementDelayModelCreator {
-  public:
-    // nothing to do in the constructor
-    PlacementDelayModelCreator() = delete;
-
-    static std::unique_ptr<PlaceDelayModel> create_delay_model(const t_placer_opts& placer_opts,
-                                                               const t_router_opts& router_opts,
-                                                               const Netlist<>& net_list,
-                                                               t_det_routing_arch* det_routing_arch,
-                                                               std::vector<t_segment_inf>& segment_inf,
-                                                               t_chan_width_dist chan_width_dist,
-                                                               const std::vector<t_direct_inf>& directs,
-                                                               bool is_flat);
-};
diff --git a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
deleted file mode 100644
index 725159406c0..00000000000
--- a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp
+++ /dev/null
@@ -1,968 +0,0 @@
-
-#include "compute_delta_delays_utils.h"
-
-#include "vtr_time.h"
-#include "vtr_math.h"
-#include "physical_types.h"
-#include "globals.h"
-#include "router_delay_profiling.h"
-
-/// Indicates the delta delay value has not been calculated
-static constexpr float UNINITIALIZED_DELTA = -1;
-/// Indicates delta delay from/to an EMPTY block
-static constexpr float EMPTY_DELTA = -2;
-/// Indicates there is no valid delta delay
-static constexpr float IMPOSSIBLE_DELTA = std::numeric_limits<float>::infinity();
-
-static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_profiler,
-                                                    const t_placer_opts& palcer_opts,
-                                                    const t_router_opts& router_opts,
-                                                    bool measure_directconnect,
-                                                    size_t longest_length,
-                                                    bool is_flat);
-
-static void fix_empty_coordinates(vtr::NdMatrix<float, 4>& delta_delays);
-
-static void fill_impossible_coordinates(vtr::NdMatrix<float, 4>& delta_delays);
-
-static bool verify_delta_delays(const vtr::NdMatrix<float, 4>& delta_delays);
-
-static void generic_compute_matrix_iterative_astar(RouterDelayProfiler& route_profiler,
-                                                   vtr::Matrix<std::vector<float>>& matrix,
-                                                   int from_layer_num,
-                                                   int to_layer_num,
-                                                   int source_x,
-                                                   int source_y,
-                                                   int start_x,
-                                                   int start_y,
-                                                   int end_x,
-                                                   int end_y,
-                                                   const t_router_opts& router_opts,
-                                                   bool measure_directconnect,
-                                                   const std::set<std::string>& allowed_types,
-                                                   bool /*is_flat*/);
-
-static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& route_profiler,
-                                                      vtr::Matrix<std::vector<float>>& matrix,
-                                                      int from_layer_num,
-                                                      int to_layer_num,
-                                                      int source_x,
-                                                      int source_y,
-                                                      int start_x,
-                                                      int start_y,
-                                                      int end_x,
-                                                      int end_y,
-                                                      const t_router_opts& router_opts,
-                                                      bool measure_directconnect,
-                                                      const std::set<std::string>& allowed_types,
-                                                      bool is_flat);
-
-/**
- * @brief Routes between a source and sink location to calculate the delay.
- *
- * This function computes the delay of a routed connection between a source and sink node
- * specified by their coordinates and layers. It iterates over the best driver and sink pin
- * classes to find a valid routing path and calculates the delay if a path exists.
- *
- * @param route_profiler Reference to the `RouterDelayProfiler` responsible for calculating routing delays.
- * @param source_x The x-coordinate of the source location.
- * @param source_y The y-coordinate of the source location.
- * @param source_layer The layer index of the source node.
- * @param sink_x The x-coordinate of the sink location.
- * @param sink_y The y-coordinate of the sink location.
- * @param sink_layer The layer index of the sink node.
- * @param router_opts Routing options used for delay calculation.
- * @param measure_directconnect If `true`, includes direct connect delays; otherwise, skips direct connections.
- *
- * @return The calculated routing delay. If routing fails, it returns `IMPOSSIBLE_DELTA`.
- */
-static float route_connection_delay(RouterDelayProfiler& route_profiler,
-                                    int source_x,
-                                    int source_y,
-                                    int source_layer,
-                                    int sink_x,
-                                    int sink_y,
-                                    int sink_layer,
-                                    const t_router_opts& router_opts,
-                                    bool measure_directconnect);
-
-/**
- * @brief Computes a reduced value from a vector of delay values using the specified reduction method.
- *
- * @param delays A reference to a vector of delay values. This vector may be modified
- *               (e.g., sorted) depending on the reducer used.
- * @param reducer The reduction method to be applied.
- *
- * @return The reduced delay value. If the input vector is empty, the function
- *         returns `IMPOSSIBLE_DELTA`.
- *
- * @throws VPR_FATAL_ERROR if the reducer is unrecognized.
- */
-static float delay_reduce(std::vector<float>& delays, e_reducer reducer);
-
-/**
- * @brief Adds a delay value to a 2D matrix of delay vectors.
- *
- * Updates the delay vector at position (`delta_x`, `delta_y`) in the matrix.
- * If the element contains only `EMPTY_DELTA`, it is replaced with the new delay;
- * otherwise, the delay is appended to the vector.
- *
- * @param matrix A 2D matrix of delay vectors.
- * @param delta_x The x-index in the matrix.
- * @param delta_y The y-index in the matrix.
- * @param delay The delay value to add.
- */
-static void add_delay_to_matrix(vtr::Matrix<std::vector<float>>& matrix,
-                                int delta_x,
-                                int delta_y,
-                                float delay);
-
-/**
- * @brief Computes the average delay for a routing span.
- *
- * This function calculates the average placement delay for a routing span starting from a
- * given layer and spanning a region defined by delta x and delta y. It iteratively searches
- * for valid delay values within an expanding neighborhood  (starting from a distance of 1)
- * around the specified delta offsets and layer, until valid  values are found or
- * the maximum search distance (`max_distance`) is reached.
- *
- * @param matrix A 4D matrix of delay values indexed by `[from_layer][to_layer][delta_x][delta_y]`.
- * @param from_layer The starting layer index of the routing span.
- * @param to_tile_loc A structure holding the delta offsets (`x` and `y`) and the target layer index (`layer_num`).
- * @param max_distance The maximum neighborhood distance to search for valid delay values.
- *
- * @return The average of valid delay values within the search range. If no valid delays
- *         are found up to the maximum distance, the function returns `IMPOSSIBLE_DELTA`.
- *
- * @note The function performs a Manhattan-distance-based neighborhood search around the target location.
- */
-static float find_neighboring_average(vtr::NdMatrix<float, 4>& matrix,
-                                      int from_layer,
-                                      t_physical_tile_loc to_tile_loc,
-                                      int max_distance);
-
-/***************************************************************************************/
-
-static vtr::NdMatrix<float, 4> compute_delta_delays(RouterDelayProfiler& route_profiler,
-                                                    const t_placer_opts& placer_opts,
-                                                    const t_router_opts& router_opts,
-                                                    bool measure_directconnect,
-                                                    size_t longest_length,
-                                                    bool is_flat) {
-
-
-    const auto& device_ctx = g_vpr_ctx.device();
-    const auto& grid = device_ctx.grid;
-
-    const size_t num_layers = grid.get_num_layers();
-    const size_t device_width = grid.width();
-    const size_t device_height = grid.height();
-
-    /* To avoid edge effects we place the source at least 'longest_length' away
-     * from the device edge and route from there for all possible delta values < dimension
-     */
-
-    //   +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-    //   +                 |                       |               +
-    //   +        A        |           B           |       C       +
-    //   +                 |                       |               +
-    //   +-----------------\-----------------------.---------------+
-    //   +                 |                       |               +
-    //   +                 |                       |               +
-    //   +                 |                       |               +
-    //   +                 |                       |               +
-    //   +        D        |           E           |       F       +
-    //   +                 |                       |               +
-    //   +                 |                       |               +
-    //   +                 |                       |               +
-    //   +                 |                       |               +
-    //   +-----------------*-----------------------/---------------+
-    //   +                 |                       |               +
-    //   +        G        |           H           |       I       +
-    //   +                 |                       |               +
-    //   +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-    //
-    //   * = (low_x, low_y)
-    //   . = (high_x, high_y)
-    //   / = (high_x, low_y)
-    //   \ = (low_x, high_y)
-    //   + = device edge
-    const size_t mid_x = vtr::nint(device_width / 2);
-    const size_t mid_y = vtr::nint(device_height / 2);
-    const size_t low_x = std::min(longest_length, mid_x);
-    const size_t low_y = std::min(longest_length, mid_y);
-    const size_t high_x = (longest_length <= device_width)  ? std::max(device_width - longest_length, mid_x) : mid_x;
-    const size_t high_y = (longest_length <= device_height) ? std::max(device_width - longest_length, mid_y) : mid_y;
-
-    vtr::NdMatrix<float, 4> delta_delays({num_layers, num_layers, device_width, device_height});
-
-    std::set<std::string> allowed_types;
-    if (!placer_opts.allowed_tiles_for_delay_model.empty()) {
-        std::vector<std::string> allowed_types_vector = vtr::split(placer_opts.allowed_tiles_for_delay_model, ",");
-        allowed_types = std::set(allowed_types_vector.begin(), allowed_types_vector.end());
-    }
-
-    for (int from_layer_num = 0; from_layer_num < (int)num_layers; from_layer_num++) {
-        for (int to_layer_num = 0; to_layer_num < (int)num_layers; to_layer_num++) {
-            vtr::NdMatrix<std::vector<float>, 2> sampled_delta_delays({device_width, device_height});
-
-            // Find the lowest y location on the left edge with a non-empty block
-            int y = 0;
-            int x = 0;
-            t_physical_tile_type_ptr src_type = nullptr;
-            for (x = 0; x < (int)device_width; ++x) {
-                for (y = 0; y < (int)device_height; ++y) {
-                    t_physical_tile_type_ptr type = grid.get_physical_type({x, y, from_layer_num});
-
-                    if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
-                        // check if the tile type is among the allowed types
-                        if (!allowed_types.empty() && allowed_types.find(type->name) == allowed_types.end()) {
-                            continue;
-                        }
-                        src_type = type;
-                        break;
-                    }
-                }
-                if (src_type != nullptr) {
-                    break;
-                }
-            }
-            VTR_ASSERT(src_type != nullptr);
-
-            auto generic_compute_matrix = (placer_opts.place_delta_delay_matrix_calculation_method == e_place_delta_delay_algorithm::ASTAR_ROUTE) ? generic_compute_matrix_iterative_astar : generic_compute_matrix_dijkstra_expansion;
-
-#ifdef VERBOSE
-            VTR_LOG("Computing from lower left edge (%d,%d):\n", x, y);
-#endif
-            generic_compute_matrix(route_profiler, sampled_delta_delays,
-                                   from_layer_num, to_layer_num,
-                                   x, y,
-                                   x, y,
-                                   device_width - 1, device_height - 1,
-                                   router_opts,
-                                   measure_directconnect, allowed_types,
-                                   is_flat);
-
-            // Find the lowest x location on the bottom edge with a non-empty block
-            src_type = nullptr;
-            for (y = 0; y < (int)device_height; ++y) {
-                for (x = 0; x < (int)device_width; ++x) {
-                    t_physical_tile_type_ptr type = grid.get_physical_type({x, y, from_layer_num});
-
-                    if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
-                        // check if the tile type is among the allowed types
-                        if (!allowed_types.empty() && allowed_types.find(type->name) == allowed_types.end()) {
-                            continue;
-                        }
-                        src_type = type;
-                        break;
-                    }
-                }
-                if (src_type) {
-                    break;
-                }
-            }
-            VTR_ASSERT(src_type != nullptr);
-#ifdef VERBOSE
-            VTR_LOG("Computing from left bottom edge (%d,%d):\n", x, y);
-#endif
-            generic_compute_matrix(route_profiler, sampled_delta_delays,
-                                   from_layer_num, to_layer_num,
-                                   x, y,
-                                   x, y,
-                                   device_width - 1, device_height - 1,
-                                   router_opts,
-                                   measure_directconnect, allowed_types,
-                                   is_flat);
-
-            //Since the other delta delay values may have suffered from edge effects,
-            //we recalculate deltas within regions B, C, E, F
-#ifdef VERBOSE
-            VTR_LOG("Computing from low/low:\n");
-#endif
-            generic_compute_matrix(route_profiler, sampled_delta_delays,
-                                   from_layer_num, to_layer_num,
-                                   low_x, low_y,
-                                   low_x, low_y,
-                                   device_width - 1, device_height - 1,
-                                   router_opts,
-                                   measure_directconnect, allowed_types,
-                                   is_flat);
-
-            //Since the other delta delay values may have suffered from edge effects,
-            //we recalculate deltas within regions D, E, G, H
-#ifdef VERBOSE
-            VTR_LOG("Computing from high/high:\n");
-#endif
-            generic_compute_matrix(route_profiler, sampled_delta_delays,
-                                   from_layer_num, to_layer_num,
-                                   high_x, high_y,
-                                   0, 0,
-                                   high_x, high_y,
-                                   router_opts,
-                                   measure_directconnect, allowed_types,
-                                   is_flat);
-
-            //Since the other delta delay values may have suffered from edge effects,
-            //we recalculate deltas within regions A, B, D, E
-#ifdef VERBOSE
-            VTR_LOG("Computing from high/low:\n");
-#endif
-            generic_compute_matrix(route_profiler, sampled_delta_delays,
-                                   from_layer_num, to_layer_num,
-                                   high_x, low_y,
-                                   0, low_y,
-                                   high_x, device_height - 1,
-                                   router_opts,
-                                   measure_directconnect, allowed_types,
-                                   is_flat);
-
-            //Since the other delta delay values may have suffered from edge effects,
-            //we recalculate deltas within regions E, F, H, I
-#ifdef VERBOSE
-            VTR_LOG("Computing from low/high:\n");
-#endif
-            generic_compute_matrix(route_profiler, sampled_delta_delays,
-                                   from_layer_num, to_layer_num,
-                                   low_x, high_y,
-                                   low_x, 0,
-                                   device_width - 1, high_y,
-                                   router_opts,
-                                   measure_directconnect, allowed_types,
-                                   is_flat);
-            for (size_t dx = 0; dx < sampled_delta_delays.dim_size(0); ++dx) {
-                for (size_t dy = 0; dy < sampled_delta_delays.dim_size(1); ++dy) {
-                    delta_delays[from_layer_num][to_layer_num][dx][dy] = delay_reduce(sampled_delta_delays[dx][dy], placer_opts.delay_model_reducer);
-                }
-            }
-        }
-    }
-
-    return delta_delays;
-}
-
-static void fix_empty_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
-    // Set any empty delta's to the average of its neighbours
-    //
-    // Empty coordinates may occur if the sampling location happens to not have
-    // a connection at that location. However, a more thorough sampling likely
-    // would return a result, so we fill in the empty holes with a small
-    // neighbour average.
-    constexpr int kMaxAverageDistance = 2;
-    for (int from_layer = 0; from_layer < (int)delta_delays.dim_size(0); ++from_layer) {
-        for (int to_layer = 0; to_layer < (int)delta_delays.dim_size(1); ++to_layer) {
-            for (int delta_x = 0; delta_x < (int)delta_delays.dim_size(2); ++delta_x) {
-                for (int delta_y = 0; delta_y < (int)delta_delays.dim_size(3); ++delta_y) {
-                    if (delta_delays[from_layer][to_layer][delta_x][delta_y] == EMPTY_DELTA) {
-                        delta_delays[from_layer][to_layer][delta_x][delta_y] =
-                            find_neighboring_average(delta_delays,
-                                                     from_layer,
-                                                     {delta_x, delta_y, to_layer},
-                                                     kMaxAverageDistance);
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void fill_impossible_coordinates(vtr::NdMatrix<float, 4>& delta_delays) {
-    // Set any impossible delta's to the average of its neighbours
-    //
-    // Impossible coordinates may occur if an IPIN cannot be reached from the
-    // sampling OPIN.  This might occur if the IPIN or OPIN used for sampling
-    // is specialized, and therefore cannot be reached via the by the pins
-    // sampled.  Leaving this value in the delay matrix will result in invalid
-    // slacks if the delay matrix uses this value.
-    //
-    // A max average distance of 5 is used to provide increased effort in
-    // filling these gaps.  It is more important to have a poor predication,
-    // than an invalid value and causing a slack assertion.
-    constexpr int kMaxAverageDistance = 5;
-    for (int from_layer_num = 0; from_layer_num < (int)delta_delays.dim_size(0); ++from_layer_num) {
-        for (int to_layer_num = 0; to_layer_num < (int)delta_delays.dim_size(1); ++to_layer_num) {
-            for (int delta_x = 0; delta_x < (int)delta_delays.dim_size(2); ++delta_x) {
-                for (int delta_y = 0; delta_y < (int)delta_delays.dim_size(3); ++delta_y) {
-                    if (delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] == IMPOSSIBLE_DELTA) {
-                        delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] = find_neighboring_average(
-                            delta_delays, from_layer_num, {delta_x, delta_y, to_layer_num}, kMaxAverageDistance);
-                    }
-                }
-            }
-        }
-    }
-}
-
-static bool verify_delta_delays(const vtr::NdMatrix<float, 4>& delta_delays) {
-    const auto& device_ctx = g_vpr_ctx.device();
-    const auto& grid = device_ctx.grid;
-
-    for (int from_layer_num = 0; from_layer_num < grid.get_num_layers(); ++from_layer_num) {
-        for (int to_layer_num = 0; to_layer_num < grid.get_num_layers(); ++to_layer_num) {
-            for (size_t x = 0; x < grid.width(); ++x) {
-                for (size_t y = 0; y < grid.height(); ++y) {
-                    float delta_delay = delta_delays[from_layer_num][to_layer_num][x][y];
-
-                    if (delta_delay < 0.) {
-                        VPR_ERROR(VPR_ERROR_PLACE,
-                                  "Found invalid negative delay %g for delta [%d,%d,%d,%d]",
-                                  delta_delay, from_layer_num, to_layer_num, x, y);
-                    }
-                }
-            }
-        }
-    }
-
-    return true;
-}
-
-static void generic_compute_matrix_iterative_astar(RouterDelayProfiler& route_profiler,
-                                                   vtr::Matrix<std::vector<float>>& matrix,
-                                                   int from_layer_num,
-                                                   int to_layer_num,
-                                                   int source_x,
-                                                   int source_y,
-                                                   int start_x,
-                                                   int start_y,
-                                                   int end_x,
-                                                   int end_y,
-                                                   const t_router_opts& router_opts,
-                                                   bool measure_directconnect,
-                                                   const std::set<std::string>& allowed_types,
-                                                   bool /*is_flat*/) {
-    const auto& device_ctx = g_vpr_ctx.device();
-
-    for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
-        for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
-            const int delta_x = abs(sink_x - source_x);
-            const int delta_y = abs(sink_y - source_y);
-
-            t_physical_tile_type_ptr src_type = device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num});
-            t_physical_tile_type_ptr sink_type = device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num});
-
-            bool src_or_target_empty = (src_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE
-                                        || sink_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE);
-
-            bool is_allowed_type = allowed_types.empty() || allowed_types.find(src_type->name) != allowed_types.end();
-
-            if (src_or_target_empty || !is_allowed_type) {
-                if (matrix[delta_x][delta_y].empty()) {
-                    // Only set empty target if we don't already have a valid delta delay
-                    matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
-#ifdef VERBOSE
-                    VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
-                            "EMPTY",
-                            delta_x, delta_y,
-                            source_x, source_y,
-                            sink_x, sink_y);
-#endif
-                }
-            } else {
-                // Valid start/end
-                float delay = route_connection_delay(route_profiler,
-                                                     source_x,
-                                                     source_y,
-                                                     from_layer_num,
-                                                     sink_x,
-                                                     sink_y,
-                                                     to_layer_num,
-                                                     router_opts,
-                                                     measure_directconnect);
-
-#ifdef VERBOSE
-                VTR_LOG("Computed delay: %12g delta: %d,%d (src: %d,%d sink: %d,%d)\n",
-                        delay,
-                        delta_x, delta_y,
-                        source_x, source_y,
-                        sink_x, sink_y);
-#endif
-                if (matrix[delta_x][delta_y].size() == 1 && matrix[delta_x][delta_y][0] == EMPTY_DELTA) {
-                    // Overwrite empty delta
-                    matrix[delta_x][delta_y][0] = delay;
-                } else {
-                    // Collect delta
-                    matrix[delta_x][delta_y].push_back(delay);
-                }
-            }
-        }
-    }
-}
-
-static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& /*route_profiler*/,
-                                                      vtr::Matrix<std::vector<float>>& matrix,
-                                                      int from_layer_num,
-                                                      int to_layer_num,
-                                                      int source_x,
-                                                      int source_y,
-                                                      int start_x,
-                                                      int start_y,
-                                                      int end_x,
-                                                      int end_y,
-                                                      const t_router_opts& router_opts,
-                                                      bool measure_directconnect,
-                                                      const std::set<std::string>& allowed_types,
-                                                      bool is_flat) {
-    const auto& device_ctx = g_vpr_ctx.device();
-
-    t_physical_tile_type_ptr src_type = device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num});
-    bool is_allowed_type = allowed_types.empty() || allowed_types.find(src_type->name) != allowed_types.end();
-    if (src_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE || !is_allowed_type) {
-        for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
-            for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
-                int delta_x = abs(sink_x - source_x);
-                int delta_y = abs(sink_y - source_y);
-
-                if (matrix[delta_x][delta_y].empty()) {
-                    //Only set empty target if we don't already have a valid delta delay
-                    matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
-#ifdef VERBOSE
-                    VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
-                            "EMPTY",
-                            delta_x, delta_y,
-                            source_x, source_y,
-                            sink_x, sink_y);
-#endif
-                }
-            }
-        }
-
-        return;
-    }
-
-    vtr::Matrix<bool> found_matrix({matrix.dim_size(0), matrix.dim_size(1)}, false);
-
-    auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num}));
-    for (int driver_ptc : best_driver_ptcs) {
-        VTR_ASSERT(driver_ptc != OPEN);
-        RRNodeId source_rr_node = device_ctx.rr_graph.node_lookup().find_node(from_layer_num, source_x, source_y, SOURCE, driver_ptc);
-
-        VTR_ASSERT(source_rr_node != RRNodeId::INVALID());
-        auto delays = calculate_all_path_delays_from_rr_node(source_rr_node, router_opts, is_flat);
-
-        bool path_to_all_sinks = true;
-        for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
-            for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
-                int delta_x = abs(sink_x - source_x);
-                int delta_y = abs(sink_y - source_y);
-
-                if (found_matrix[delta_x][delta_y]) {
-                    continue;
-                }
-
-                t_physical_tile_type_ptr sink_type = device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num});
-                if (sink_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE) {
-                    if (matrix[delta_x][delta_y].empty()) {
-                        // Only set empty target if we don't already have a valid delta delay
-                        matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
-#ifdef VERBOSE
-                        VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
-                                "EMPTY",
-                                delta_x, delta_y,
-                                source_x, source_y,
-                                sink_x, sink_y);
-#endif
-                        found_matrix[delta_x][delta_y] = true;
-                    }
-                } else {
-                    bool found_a_sink = false;
-                    auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num}));
-                    for (int sink_ptc : best_sink_ptcs) {
-                        VTR_ASSERT(sink_ptc != OPEN);
-                        RRNodeId sink_rr_node = device_ctx.rr_graph.node_lookup().find_node(to_layer_num, sink_x, sink_y, SINK, sink_ptc);
-
-                        if (sink_rr_node == RRNodeId::INVALID())
-                            continue;
-
-                        if (!measure_directconnect && directconnect_exists(source_rr_node, sink_rr_node)) {
-                            // Skip if we shouldn't measure direct connects and a direct connect exists
-                            continue;
-                        }
-
-                        if (std::isnan(delays[sink_rr_node])) {
-                            // This sink was not found
-                            continue;
-                        }
-
-#ifdef VERBOSE
-                        VTR_LOG("Computed delay: %12g delta: %d,%d (src: %d,%d sink: %d,%d)\n",
-                                delays[size_t(sink_rr_node)],
-                                delta_x, delta_y,
-                                source_x, source_y,
-                                sink_x, sink_y);
-#endif
-                        found_matrix[delta_x][delta_y] = true;
-
-                        add_delay_to_matrix(matrix, delta_x, delta_y, delays[sink_rr_node]);
-
-                        found_a_sink = true;
-                        break;
-                    }
-
-                    if (!found_a_sink) {
-                        path_to_all_sinks = false;
-                    }
-                }
-            }
-        }
-
-        if (path_to_all_sinks) {
-            break;
-        }
-    }
-
-    for (int sink_x = start_x; sink_x <= end_x; sink_x++) {
-        for (int sink_y = start_y; sink_y <= end_y; sink_y++) {
-            int delta_x = abs(sink_x - source_x);
-            int delta_y = abs(sink_y - source_y);
-            if (!found_matrix[delta_x][delta_y]) {
-                add_delay_to_matrix(matrix, delta_x, delta_y, IMPOSSIBLE_DELTA);
-                VTR_LOG_WARN("Unable to route between blocks at (%d,%d,%d) and (%d,%d,%d) to characterize delay (setting to %g)\n",
-                             source_x,
-                             source_y,
-                             from_layer_num,
-                             sink_x,
-                             sink_y,
-                             to_layer_num,
-                             IMPOSSIBLE_DELTA);
-            }
-        }
-    }
-}
-
-static float route_connection_delay(RouterDelayProfiler& route_profiler,
-                                    int source_x,
-                                    int source_y,
-                                    int source_layer,
-                                    int sink_x,
-                                    int sink_y,
-                                    int sink_layer,
-                                    const t_router_opts& router_opts,
-                                    bool measure_directconnect) {
-    //Routes between the source and sink locations and calculates the delay
-
-    // set to known value for debug purposes
-    float net_delay_value = IMPOSSIBLE_DELTA;
-
-    const auto& device_ctx = g_vpr_ctx.device();
-
-    bool successfully_routed = false;
-
-    // Get the rr nodes to route between
-    auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid.get_physical_type({source_x, source_y, source_layer}));
-    auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid.get_physical_type({sink_x, sink_y, sink_layer}));
-
-    for (int driver_ptc : best_driver_ptcs) {
-        VTR_ASSERT(driver_ptc != OPEN);
-        RRNodeId source_rr_node = device_ctx.rr_graph.node_lookup().find_node(source_layer, source_x, source_y, SOURCE, driver_ptc);
-
-        VTR_ASSERT(source_rr_node != RRNodeId::INVALID());
-
-        for (int sink_ptc : best_sink_ptcs) {
-            VTR_ASSERT(sink_ptc != OPEN);
-            RRNodeId sink_rr_node = device_ctx.rr_graph.node_lookup().find_node(sink_layer, sink_x, sink_y, SINK, sink_ptc);
-
-            if (sink_rr_node == RRNodeId::INVALID())
-                continue;
-
-            if (!measure_directconnect && directconnect_exists(source_rr_node, sink_rr_node)) {
-                // Skip if we shouldn't measure direct connects and a direct connect exists
-                continue;
-            }
-
-            successfully_routed = route_profiler.calculate_delay(source_rr_node,
-                                                                 sink_rr_node,
-                                                                 router_opts,
-                                                                 &net_delay_value);
-
-            if (successfully_routed) break;
-        }
-        if (successfully_routed) break;
-    }
-
-    if (!successfully_routed) {
-        VTR_LOG_WARN("Unable to route between blocks at (%d,%d,%d) and (%d,%d,%d) to characterize delay (setting to %g)\n",
-                     source_x, source_y, source_layer, sink_x, sink_y, sink_layer, net_delay_value);
-    }
-
-    return net_delay_value;
-}
-
-static float delay_reduce(std::vector<float>& delays, e_reducer reducer) {
-    if (delays.empty()) {
-        return IMPOSSIBLE_DELTA;
-    }
-
-    if (delays.size() == 1) {
-        return delays[0];
-    }
-
-    VTR_ASSERT(delays.size() > 1);
-
-    float delay;
-
-    if (reducer == e_reducer::MIN) {
-        auto itr = std::min_element(delays.begin(), delays.end());
-        delay = *itr;
-    } else if (reducer == e_reducer::MAX) {
-        auto itr = std::max_element(delays.begin(), delays.end());
-        delay = *itr;
-    } else if (reducer == e_reducer::MEDIAN) {
-        std::stable_sort(delays.begin(), delays.end());
-        delay = vtr::median(delays.begin(), delays.end());
-    } else if (reducer == e_reducer::ARITHMEAN) {
-        delay = vtr::arithmean(delays.begin(), delays.end());
-    } else if (reducer == e_reducer::GEOMEAN) {
-        delay = vtr::geomean(delays.begin(), delays.end());
-    } else {
-        VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Unrecognized delta delay reducer");
-    }
-
-    return delay;
-}
-
-static void add_delay_to_matrix(vtr::Matrix<std::vector<float>>& matrix,
-                                int delta_x,
-                                int delta_y,
-                                float delay) {
-    if (matrix[delta_x][delta_y].size() == 1 && matrix[delta_x][delta_y][0] == EMPTY_DELTA) {
-        // Overwrite empty delta
-        matrix[delta_x][delta_y][0] = delay;
-    } else {
-        // Collect delta
-        matrix[delta_x][delta_y].push_back(delay);
-    }
-}
-
-static float find_neighboring_average(vtr::NdMatrix<float, 4>& matrix,
-                                      int from_layer,
-                                      t_physical_tile_loc to_tile_loc,
-                                      int max_distance) {
-    float sum = 0.f;
-    int num_samples = 0;
-    const int endx = matrix.end_index(2);
-    const int endy = matrix.end_index(3);
-
-    const int x = to_tile_loc.x;
-    const int y = to_tile_loc.y;
-    const int to_layer = to_tile_loc.layer_num;
-
-    for (int distance = 1; distance <= max_distance; ++distance) {
-        for (int delx = x - distance; delx <= x + distance; delx++) {
-            for (int dely = y - distance; dely <= y + distance; dely++) {
-                // Check distance constraint
-                if (abs(delx - x) + abs(dely - y) > distance) {
-                    continue;
-                }
-
-                //check out of bounds
-                if (delx < 0 || dely < 0 || delx >= endx || dely >= endy || (delx == x && dely == y)) {
-                    continue;
-                }
-
-                if (matrix[from_layer][to_layer][delx][dely] == EMPTY_DELTA || matrix[from_layer][to_layer][delx][dely] == IMPOSSIBLE_DELTA) {
-                    continue;
-                }
-
-                sum += matrix[from_layer][to_layer][delx][dely];
-                num_samples++;
-            }
-        }
-
-        if (num_samples != 0) {
-            return sum / (float)num_samples;
-        }
-    }
-
-    return IMPOSSIBLE_DELTA;
-}
-
-/***************************************************************************************/
-
-vtr::NdMatrix<float, 4> compute_delta_delay_model(RouterDelayProfiler& route_profiler,
-                                                  const t_placer_opts& placer_opts,
-                                                  const t_router_opts& router_opts,
-                                                  bool measure_directconnect,
-                                                  int longest_length,
-                                                  bool is_flat) {
-    vtr::ScopedStartFinishTimer timer("Computing delta delays");
-    vtr::NdMatrix<float, 4> delta_delays = compute_delta_delays(route_profiler,
-                                                                placer_opts,
-                                                                router_opts,
-                                                                measure_directconnect,
-                                                                longest_length,
-                                                                is_flat);
-
-    const size_t num_elements = delta_delays.size();
-
-    // set uninitialized elements to infinity
-    for (size_t i = 0; i < num_elements; i++) {
-        if (delta_delays.get(i) == UNINITIALIZED_DELTA) {
-            delta_delays.get(i) = IMPOSSIBLE_DELTA;
-        }
-    }
-
-    fix_empty_coordinates(delta_delays);
-
-    fill_impossible_coordinates(delta_delays);
-
-    verify_delta_delays(delta_delays);
-
-    return delta_delays;
-}
-
-//Finds a src_rr and sink_rr appropriate for measuring the delay of the current direct specification
-bool find_direct_connect_sample_locations(const t_direct_inf* direct,
-                                          t_physical_tile_type_ptr from_type,
-                                          int from_pin,
-                                          int from_pin_class,
-                                          t_physical_tile_type_ptr to_type,
-                                          int to_pin,
-                                          int to_pin_class,
-                                          RRNodeId& out_src_node,
-                                          RRNodeId& out_sink_node) {
-    VTR_ASSERT(from_type != nullptr);
-    VTR_ASSERT(to_type != nullptr);
-
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& grid = device_ctx.grid;
-    const auto& node_lookup = device_ctx.rr_graph.node_lookup();
-
-    //Search the grid for an instance of from/to blocks which satisfy this direct connect offsets,
-    //and which has the appropriate pins
-    int from_x = -1;
-    int from_y = -1;
-    int from_sub_tile = -1;
-    int to_x = 0, to_y = 0, to_sub_tile = 0;
-    bool found = false;
-    int found_layer_num = -1;
-    //TODO: Function *FOR NOW* assumes that from/to blocks are at same die and have a same layer nums
-    for (int layer_num = 0; layer_num < grid.get_num_layers() && !found; ++layer_num) {
-        for (int x = 0; x < (int)grid.width() && !found; ++x) {
-            to_x = x + direct->x_offset;
-            if (to_x < 0 || to_x >= (int)grid.width()) continue;
-
-            for (int y = 0; y < (int)grid.height() && !found; ++y) {
-                if (grid.get_physical_type({x, y, layer_num}) != from_type) continue;
-
-                //Check that the from pin exists at this from location
-                //(with multi-width/height blocks pins may not exist at all locations)
-                bool from_pin_found = false;
-                if (direct->from_side != NUM_2D_SIDES) {
-                    RRNodeId from_pin_rr = node_lookup.find_node(layer_num, x, y, OPIN, from_pin, direct->from_side);
-                    from_pin_found = from_pin_rr.is_valid();
-                } else {
-                    from_pin_found = !(node_lookup.find_nodes_at_all_sides(layer_num, x, y, OPIN, from_pin).empty());
-                }
-                if (!from_pin_found) continue;
-
-                to_y = y + direct->y_offset;
-
-                if (to_y < 0 || to_y >= (int)grid.height()) continue;
-                if (grid.get_physical_type({to_x, to_y, layer_num}) != to_type) continue;
-
-                //Check that the from pin exists at this from location
-                //(with multi-width/height blocks pins may not exist at all locations)
-                bool to_pin_found = false;
-                if (direct->to_side != NUM_2D_SIDES) {
-                    RRNodeId to_pin_rr = node_lookup.find_node(layer_num, to_x, to_y, IPIN, to_pin, direct->to_side);
-                    to_pin_found = (to_pin_rr != RRNodeId::INVALID());
-                } else {
-                    to_pin_found = !(node_lookup.find_nodes_at_all_sides(layer_num, to_x, to_y, IPIN, to_pin).empty());
-                }
-                if (!to_pin_found) continue;
-
-                for (int sub_tile_num = 0; sub_tile_num < from_type->capacity; ++sub_tile_num) {
-                    to_sub_tile = sub_tile_num + direct->sub_tile_offset;
-
-                    if (to_sub_tile < 0 || to_sub_tile >= to_type->capacity) continue;
-
-                    found = true;
-                    found_layer_num = layer_num;
-                    from_x = x;
-                    from_y = y;
-                    from_sub_tile = sub_tile_num;
-
-                    break;
-                }
-            }
-        }
-    }
-
-    if (!found) {
-        return false;
-    }
-
-    //Now have a legal instance of this direct connect
-    VTR_ASSERT(grid.get_physical_type({from_x, from_y, found_layer_num}) == from_type);
-    VTR_ASSERT(from_sub_tile < from_type->capacity);
-
-    VTR_ASSERT(grid.get_physical_type({to_x, to_y, found_layer_num}) == to_type);
-    VTR_ASSERT(to_sub_tile < to_type->capacity);
-
-    VTR_ASSERT(from_x + direct->x_offset == to_x);
-    VTR_ASSERT(from_y + direct->y_offset == to_y);
-    VTR_ASSERT(from_sub_tile + direct->sub_tile_offset == to_sub_tile);
-
-    // Find a source/sink RR node associated with the pins of the direct
-    {
-        RRNodeId src_rr_candidate = node_lookup.find_node(found_layer_num, from_x, from_y, SOURCE, from_pin_class);
-        VTR_ASSERT(src_rr_candidate);
-        out_src_node = src_rr_candidate;
-    }
-
-    {
-        RRNodeId sink_rr_candidate = node_lookup.find_node(found_layer_num, to_x, to_y, SINK, to_pin_class);
-        VTR_ASSERT(sink_rr_candidate);
-        out_sink_node = sink_rr_candidate;
-    }
-
-    return true;
-}
-
-std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type) {
-    std::vector<int> best_classes;
-
-    //Record any non-zero Fc pins
-    //
-    //Note that we track non-zero Fc pins, since certain Fc overrides
-    //may apply to only a subset of wire types. This ensures we record
-    //which pins can potentially connect to global routing.
-    std::unordered_set<int> non_zero_fc_pins;
-    for (const t_fc_specification& fc_spec : type->fc_specs) {
-        if (fc_spec.fc_value == 0) continue;
-
-        non_zero_fc_pins.insert(fc_spec.pins.begin(), fc_spec.pins.end());
-    }
-
-    // Collect all classes of matching type which connect to general routing
-    for (int i = 0; i < (int)type->class_inf.size(); i++) {
-        if (type->class_inf[i].type == pintype) {
-            //Check whether all pins in this class are ignored or have zero fc
-            bool any_pins_connect_to_general_routing = false;
-            for (int ipin = 0; ipin < type->class_inf[i].num_pins; ++ipin) {
-                int pin = type->class_inf[i].pinlist[ipin];
-                //If the pin isn't ignored, and has a non-zero Fc to some general
-                //routing the class is suitable for delay profiling
-                if (!type->is_ignored_pin[pin] && non_zero_fc_pins.count(pin)) {
-                    any_pins_connect_to_general_routing = true;
-                    break;
-                }
-            }
-
-            // Skip if the pin class doesn't connect to general routing
-            if (!any_pins_connect_to_general_routing) continue;
-
-            // Record candidate class
-            best_classes.push_back(i);
-        }
-    }
-
-    // Sort classes so the largest pin class is first
-    auto cmp_class = [&](int lhs, int rhs) {
-        return type->class_inf[lhs].num_pins > type->class_inf[rhs].num_pins;
-    };
-
-    std::stable_sort(best_classes.begin(), best_classes.end(), cmp_class);
-
-    return best_classes;
-}
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.h b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.h
deleted file mode 100644
index 71ac632b149..00000000000
--- a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.h
+++ /dev/null
@@ -1,56 +0,0 @@
-
-#pragma once
-
-#include "vtr_ndmatrix.h"
-#include "physical_types.h"
-#include "rr_graph_fwd.h"
-
-struct t_placer_opts;
-struct t_router_opts;
-class RouterDelayProfiler;
-
-vtr::NdMatrix<float, 4> compute_delta_delay_model(RouterDelayProfiler& route_profiler,
-                                                  const t_placer_opts& placer_opts,
-                                                  const t_router_opts& router_opts,
-                                                  bool measure_directconnect,
-                                                  int longest_length,
-                                                  bool is_flat);
-
-bool find_direct_connect_sample_locations(const t_direct_inf* direct,
-                                          t_physical_tile_type_ptr from_type,
-                                          int from_pin,
-                                          int from_pin_class,
-                                          t_physical_tile_type_ptr to_type,
-                                          int to_pin,
-                                          int to_pin_class,
-                                          RRNodeId& out_src_node,
-                                          RRNodeId& out_sink_node);
-
-/**
- * @brief Identifies the best pin classes for delay calculation based on pin count and connectivity.
- *
- * This function selects pin classes of a specified type (`pintype`) from a physical tile type (`type`)
- * that are suitable for delay calculations. It prioritizes pin classes with the largest number of pins
- * that connect to general routing, ensuring commonly used pins are chosen for delay profiling.
- *
- * @param pintype The type of pins to filter.
- * @param type Pointer to the physical tile type containing pin and class information.
- *
- * @return A vector of indices representing the selected pin classes. The classes are sorted
- *         in descending order based on the number of pins they contain.
- *
- * @details
- * - A pin class is eligible if its type matches `pintype` and it contains at least one pin
- *   that connects to general routing (non-zero Fc).
- * - Non-zero Fc pins are determined by inspecting the tile's `fc_specs`.
- * - Classes are sorted so that the class with the largest number of pins appears first.
- *   If multiple classes have the same pin count, their order depends on their initial appearance
- *   in the architecture file.
- *
- * @note
- * - Pins explicitly marked as ignored in `type->is_ignored_pin` are excluded.
- * - The function ensures stability in sorting, preserving the input order for classes
- *   with the same number of pins.
- */
-
-std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type);
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/delta_delay_model.cpp b/vpr/src/place/timing/delay_model/delta_delay_model.cpp
deleted file mode 100644
index e8d56b09516..00000000000
--- a/vpr/src/place/timing/delay_model/delta_delay_model.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-
-#include "delta_delay_model.h"
-
-#include "compute_delta_delays_utils.h"
-
-#ifdef VTR_ENABLE_CAPNPROTO
-#    include "capnp/serialize.h"
-#    include "place_delay_model.capnp.h"
-#    include "ndmatrix_serdes.h"
-#    include "mmap_file.h"
-#    include "serdes_utils.h"
-#endif  // VTR_ENABLE_CAPNPROTO
-
-void DeltaDelayModel::compute(RouterDelayProfiler& route_profiler,
-                              const t_placer_opts& placer_opts,
-                              const t_router_opts& router_opts,
-                              int longest_length) {
-    delays_ = compute_delta_delay_model(route_profiler,
-                                        placer_opts,
-                                        router_opts,
-                                        /*measure_directconnect=*/true,
-                                        longest_length,
-                                        is_flat_);
-}
-
-float DeltaDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pin*/,
-                             const t_physical_tile_loc& to_loc, int /*to_pin*/) const {
-    int delta_x = std::abs(from_loc.x - to_loc.x);
-    int delta_y = std::abs(from_loc.y - to_loc.y);
-
-    return delays_[from_loc.layer_num][to_loc.layer_num][delta_x][delta_y];
-}
-
-void DeltaDelayModel::dump_echo(std::string filepath) const {
-    FILE* f = vtr::fopen(filepath.c_str(), "w");
-    fprintf(f, "         ");
-    for (size_t from_layer_num = 0; from_layer_num < delays_.dim_size(0); ++from_layer_num) {
-        for (size_t to_layer_num = 0; to_layer_num < delays_.dim_size(1); ++to_layer_num) {
-            fprintf(f, " %9zu", from_layer_num);
-            fprintf(f, "\n");
-            for (size_t dx = 0; dx < delays_.dim_size(2); ++dx) {
-                fprintf(f, " %9zu", dx);
-            }
-            fprintf(f, "\n");
-            for (size_t dy = 0; dy < delays_.dim_size(3); ++dy) {
-                fprintf(f, "%9zu", dy);
-                for (size_t dx = 0; dx < delays_.dim_size(2); ++dx) {
-                    fprintf(f, " %9.2e", delays_[from_layer_num][to_layer_num][dx][dy]);
-                }
-                fprintf(f, "\n");
-            }
-        }
-    }
-    vtr::fclose(f);
-}
-
-void DeltaDelayModel::read(const std::string& file) {
-#ifndef VTR_ENABLE_CAPNPROTO
-    VPR_THROW(VPR_ERROR_PLACE,
-              "OverrideDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.";
-#else
-
-    // MmapFile object creates an mmap of the specified path, and will munmap
-    // when the object leaves scope.
-    MmapFile f(file);
-
-    /* Increase reader limit to 1G words to allow for large files. */
-    ::capnp::ReaderOptions opts = default_large_capnp_opts();
-
-    // FlatArrayMessageReader is used to read the message from the data array
-    // provided by MmapFile.
-    ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
-
-    // When reading capnproto files the Reader object to use is named
-    // <schema name>::Reader.
-    //
-    // Initially this object is an empty VprDeltaDelayModel.
-    VprDeltaDelayModel::Reader model;
-
-    // The reader.getRoot performs a cast from the generic capnproto to fit
-    // with the specified schema.
-    //
-    // Note that capnproto does not validate that the incoming data matches the
-    // schema.  If this property is required, some form of check would be
-    // required.
-    model = reader.getRoot<VprDeltaDelayModel>();
-
-    auto toFloat = [](float* out, const VprFloatEntry::Reader& in) -> void {
-        *out = in.getValue();
-    };
-
-    // ToNdMatrix is a generic function for converting a Matrix capnproto
-    // to a vtr::NdMatrix.
-    //
-    // The user must supply the matrix dimension (2 in this case), the source
-    // capnproto type (VprFloatEntry),
-    // target C++ type (flat), and a function to convert from the source capnproto
-    // type to the target C++ type (ToFloat).
-    //
-    // The second argument should be of type Matrix<X>::Reader where X is the
-    // capnproto element type.
-    ToNdMatrix<4, VprFloatEntry, float>(&delays_, model.getDelays(), toFloat);
-#endif
-}
-
-void DeltaDelayModel::write(const std::string& file) const {
-#ifndef VTR_ENABLE_CAPNPROTO
-    VPR_THROW(VPR_ERROR_PLACE,
-              "DeltaDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.";
-#else
-
-    // MallocMessageBuilder object is the generate capnproto message builder,
-    // using malloc for buffer allocation.
-    ::capnp::MallocMessageBuilder builder;
-
-    // initRoot<X> returns a X::Builder object that can be used to set the
-    // fields in the message.
-    auto model = builder.initRoot<VprDeltaDelayModel>();
-
-    auto fromFloat = [](VprFloatEntry::Builder* out, const float& in) -> void {
-        out->setValue(in);
-    };
-
-    // FromNdMatrix is a generic function for converting a vtr::NdMatrix to a
-    // Matrix message.  It is the mirror function of ToNdMatrix described in
-    // read above.
-    auto delay_values = model.getDelays();
-    FromNdMatrix<4, VprFloatEntry, float>(&delay_values, delays_, fromFloat);
-
-    // writeMessageToFile writes message to the specified file.
-    writeMessageToFile(file, &builder);
-#endif
-}
diff --git a/vpr/src/place/timing/delay_model/delta_delay_model.h b/vpr/src/place/timing/delay_model/delta_delay_model.h
deleted file mode 100644
index c3ae0d83cf7..00000000000
--- a/vpr/src/place/timing/delay_model/delta_delay_model.h
+++ /dev/null
@@ -1,47 +0,0 @@
-
-#pragma once
-
-#include "place_delay_model.h"
-
-/**
- * @class DeltaDelayModel
- *
- * @brief A simple delay model based on the distance (delta) between block locations.
- */
-class DeltaDelayModel : public PlaceDelayModel {
-  public:
-    DeltaDelayModel(float min_cross_layer_delay,
-                    bool is_flat)
-        : cross_layer_delay_(min_cross_layer_delay)
-        , is_flat_(is_flat) {}
-
-    DeltaDelayModel(float min_cross_layer_delay,
-                    vtr::NdMatrix<float, 4> delta_delays,
-                    bool is_flat)
-        : delays_(std::move(delta_delays))
-        , cross_layer_delay_(min_cross_layer_delay)
-        , is_flat_(is_flat) {}
-
-    void compute(RouterDelayProfiler& router,
-                 const t_placer_opts& placer_opts,
-                 const t_router_opts& router_opts,
-                 int longest_length) override;
-
-    float delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const override;
-
-    void dump_echo(std::string filepath) const override;
-
-    void read(const std::string& file) override;
-    void write(const std::string& file) const override;
-
-    const vtr::NdMatrix<float, 4>& delays() const {
-        return delays_;
-    }
-
-  private:
-    vtr::NdMatrix<float, 4> delays_; // [0..num_layers-1][0..max_dx][0..max_dy]
-    float cross_layer_delay_;
-
-    /// Indicates whether the router is a two-stage or run-flat
-    bool is_flat_;
-};
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/override_delay_model.cpp b/vpr/src/place/timing/delay_model/override_delay_model.cpp
deleted file mode 100644
index 6cbb2c7f654..00000000000
--- a/vpr/src/place/timing/delay_model/override_delay_model.cpp
+++ /dev/null
@@ -1,280 +0,0 @@
-
-#include "override_delay_model.h"
-
-#include "compute_delta_delays_utils.h"
-
-#ifdef VTR_ENABLE_CAPNPROTO
-#    include "capnp/serialize.h"
-#    include "place_delay_model.capnp.h"
-#    include "ndmatrix_serdes.h"
-#    include "mmap_file.h"
-#    include "serdes_utils.h"
-#endif  // VTR_ENABLE_CAPNPROTO
-
-void OverrideDelayModel::compute(RouterDelayProfiler& route_profiler,
-                                 const t_placer_opts& placer_opts,
-                                 const t_router_opts& router_opts,
-                                 int longest_length) {
-    auto delays = compute_delta_delay_model(route_profiler,
-                                            placer_opts,
-                                            router_opts,
-                                            /*measure_directconnect=*/false,
-                                            longest_length,
-                                            is_flat_);
-
-    base_delay_model_ = std::make_unique<DeltaDelayModel>(cross_layer_delay_, delays, false);
-
-    compute_override_delay_model_(route_profiler, router_opts);
-}
-
-void OverrideDelayModel::compute_override_delay_model_(RouterDelayProfiler& route_profiler,
-                                                       const t_router_opts& router_opts) {
-    const auto& device_ctx = g_vpr_ctx.device();
-    t_router_opts router_opts2 = router_opts;
-    router_opts2.astar_fac = 0.f;
-    router_opts2.astar_offset = 0.f;
-
-    // Look at all the direct connections that exist, and add overrides to delay model
-    for (int idirect = 0; idirect < (int)device_ctx.arch->directs.size(); ++idirect) {
-        const t_direct_inf* direct = &device_ctx.arch->directs[idirect];
-
-        InstPort from_port = parse_inst_port(direct->from_pin);
-        InstPort to_port = parse_inst_port(direct->to_pin);
-
-        t_physical_tile_type_ptr from_type = find_tile_type_by_name(from_port.instance_name(), device_ctx.physical_tile_types);
-        t_physical_tile_type_ptr to_type = find_tile_type_by_name(to_port.instance_name(), device_ctx.physical_tile_types);
-
-        int num_conns = from_port.port_high_index() - from_port.port_low_index() + 1;
-        VTR_ASSERT_MSG(num_conns == to_port.port_high_index() - to_port.port_low_index() + 1, "Directs must have the same size to/from");
-
-        //We now walk through all the connections associated with the current direct specification, measure
-        //their delay and specify that value as an override in the delay model.
-        //
-        //Note that we need to check every connection in the direct to cover the case where the pins are not
-        //equivalent.
-        //
-        //However, if the from/to ports are equivalent we could end up sampling the same RR SOURCE/SINK
-        //paths multiple times (wasting CPU time) -- we avoid this by recording the sampled paths in
-        //sampled_rr_pairs and skipping them if they occur multiple times.
-        int missing_instances = 0;
-        int missing_paths = 0;
-        std::set<std::pair<RRNodeId, RRNodeId>> sampled_rr_pairs;
-        for (int iconn = 0; iconn < num_conns; ++iconn) {
-            //Find the associated pins
-            int from_pin = from_type->find_pin(from_port.port_name(), from_port.port_low_index() + iconn);
-            int to_pin = to_type->find_pin(to_port.port_name(), to_port.port_low_index() + iconn);
-
-            VTR_ASSERT(from_pin != OPEN);
-            VTR_ASSERT(to_pin != OPEN);
-
-            int from_pin_class = from_type->find_pin_class(from_port.port_name(), from_port.port_low_index() + iconn, DRIVER);
-            VTR_ASSERT(from_pin_class != OPEN);
-
-            int to_pin_class = to_type->find_pin_class(to_port.port_name(), to_port.port_low_index() + iconn, RECEIVER);
-            VTR_ASSERT(to_pin_class != OPEN);
-
-            bool found_sample_points;
-            RRNodeId src_rr, sink_rr;
-            found_sample_points = find_direct_connect_sample_locations(direct, from_type, from_pin, from_pin_class, to_type, to_pin, to_pin_class, src_rr, sink_rr);
-
-            if (!found_sample_points) {
-                ++missing_instances;
-                continue;
-            }
-
-            //If some of the source/sink ports are logically equivalent we may have already
-            //sampled the associated source/sink pair and don't need to do so again
-            if (sampled_rr_pairs.count({src_rr, sink_rr})) continue;
-
-            float direct_connect_delay = std::numeric_limits<float>::quiet_NaN();
-            bool found_routing_path = route_profiler.calculate_delay(src_rr, sink_rr, router_opts2, &direct_connect_delay);
-
-            if (found_routing_path) {
-                set_delay_override(from_type->index, from_pin_class, to_type->index, to_pin_class, direct->x_offset, direct->y_offset, direct_connect_delay);
-            } else {
-                ++missing_paths;
-            }
-
-            //Record that we've sampled this pair of source and sink nodes
-            sampled_rr_pairs.insert({src_rr, sink_rr});
-        }
-
-        VTR_LOGV_WARN(missing_instances > 0, "Found no delta delay for %d bits of inter-block direct connect '%s' (no instances of this direct found)\n", missing_instances, direct->name.c_str());
-        VTR_LOGV_WARN(missing_paths > 0, "Found no delta delay for %d bits of inter-block direct connect '%s' (no routing path found)\n", missing_paths, direct->name.c_str());
-    }
-}
-
-const DeltaDelayModel* OverrideDelayModel::base_delay_model() const {
-    return base_delay_model_.get();
-}
-
-float OverrideDelayModel::delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const {
-    // First check to if there is an override delay value
-    const auto& device_ctx = g_vpr_ctx.device();
-    const auto& grid = device_ctx.grid;
-
-    t_physical_tile_type_ptr from_type_ptr = grid.get_physical_type(from_loc);
-    t_physical_tile_type_ptr to_type_ptr = grid.get_physical_type(to_loc);
-
-    t_override override_key;
-    override_key.from_type = from_type_ptr->index;
-    override_key.from_class = from_type_ptr->pin_class[from_pin];
-    override_key.to_type = to_type_ptr->index;
-    override_key.to_class = to_type_ptr->pin_class[to_pin];
-
-    //Delay overrides may be different for +/- delta so do not use
-    //an absolute delta for the look-up
-    override_key.delta_x = to_loc.x - from_loc.x;
-    override_key.delta_y = to_loc.y - from_loc.y;
-
-    float delay_val = std::numeric_limits<float>::quiet_NaN();
-    auto override_iter = delay_overrides_.find(override_key);
-    if (override_iter != delay_overrides_.end()) {
-        //Found an override
-        delay_val = override_iter->second;
-    } else {
-        //Fall back to the base delay model if no override was found
-        delay_val = base_delay_model_->delay(from_loc, from_pin, to_loc, to_pin);
-    }
-
-    return delay_val;
-}
-
-void OverrideDelayModel::set_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y, float delay_val) {
-    t_override override_key;
-    override_key.from_type = from_type;
-    override_key.from_class = from_class;
-    override_key.to_type = to_type;
-    override_key.to_class = to_class;
-    override_key.delta_x = delta_x;
-    override_key.delta_y = delta_y;
-
-    auto res = delay_overrides_.insert(std::make_pair(override_key, delay_val));
-    if (!res.second) {                 //Key already exists
-        res.first->second = delay_val; //Overwrite existing delay
-    }
-}
-
-void OverrideDelayModel::dump_echo(std::string filepath) const {
-    base_delay_model_->dump_echo(filepath);
-
-    FILE* f = vtr::fopen(filepath.c_str(), "a");
-
-    fprintf(f, "\n");
-    fprintf(f, "# Delay Overrides\n");
-    auto& device_ctx = g_vpr_ctx.device();
-    for (auto kv : delay_overrides_) {
-        auto override_key = kv.first;
-        float delay_val = kv.second;
-        fprintf(f, "from_type: %s to_type: %s from_pin_class: %d to_pin_class: %d delta_x: %d delta_y: %d -> delay: %g\n",
-                device_ctx.physical_tile_types[override_key.from_type].name.c_str(),
-                device_ctx.physical_tile_types[override_key.to_type].name.c_str(),
-                override_key.from_class,
-                override_key.to_class,
-                override_key.delta_x,
-                override_key.delta_y,
-                delay_val);
-    }
-
-    vtr::fclose(f);
-}
-
-float OverrideDelayModel::get_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y) const {
-    t_override key;
-    key.from_type = from_type;
-    key.from_class = from_class;
-    key.to_type = to_type;
-    key.to_class = to_class;
-    key.delta_x = delta_x;
-    key.delta_y = delta_y;
-
-    auto iter = delay_overrides_.find(key);
-    if (iter == delay_overrides_.end()) {
-        VPR_THROW(VPR_ERROR_PLACE, "Key not found.");
-    }
-    return iter->second;
-}
-
-void OverrideDelayModel::set_base_delay_model(std::unique_ptr<DeltaDelayModel> base_delay_model_obj) {
-    base_delay_model_ = std::move(base_delay_model_obj);
-}
-
-void OverrideDelayModel::read(const std::string& file) {
-#ifndef VTR_ENABLE_CAPNPROTO
-    VPR_THROW(VPR_ERROR_PLACE,
-          "OverrideDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-          "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
-#else
-    MmapFile f(file);
-
-    /* Increase reader limit to 1G words to allow for large files. */
-    ::capnp::ReaderOptions opts = default_large_capnp_opts();
-    ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
-
-    auto toFloat = [](float* out, const VprFloatEntry::Reader& in) -> void {
-        *out = in.getValue();
-    };
-
-    vtr::NdMatrix<float, 4> delays;
-    auto model = reader.getRoot<VprOverrideDelayModel>();
-    ToNdMatrix<4, VprFloatEntry, float>(&delays, model.getDelays(), toFloat);
-
-    base_delay_model_ = std::make_unique<DeltaDelayModel>(cross_layer_delay_, delays, is_flat_);
-
-    // Reading non-scalar capnproto fields is roughly equivilant to using
-    // a std::vector of the field type.  Actual type is capnp::List<X>::Reader.
-    auto overrides = model.getDelayOverrides();
-    std::vector<std::pair<t_override, float> > overrides_arr(overrides.size());
-    for (size_t i = 0; i < overrides.size(); ++i) {
-        const auto& elem = overrides[i];
-        overrides_arr[i].first.from_type = elem.getFromType();
-        overrides_arr[i].first.to_type = elem.getToType();
-        overrides_arr[i].first.from_class = elem.getFromClass();
-        overrides_arr[i].first.to_class = elem.getToClass();
-        overrides_arr[i].first.delta_x = elem.getDeltaX();
-        overrides_arr[i].first.delta_y = elem.getDeltaY();
-
-        overrides_arr[i].second = elem.getDelay();
-    }
-
-    delay_overrides_ = vtr::make_flat_map2(std::move(overrides_arr));
-#endif
-}
-
-void OverrideDelayModel::write(const std::string& file) const {
-#ifndef VTR_ENABLE_CAPNPROTO
-    VPR_THROW(VPR_ERROR_PLACE,
-              "OverrideDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
-#else
-    ::capnp::MallocMessageBuilder builder;
-    auto model = builder.initRoot<VprOverrideDelayModel>();
-
-    auto fromFloat = [](VprFloatEntry::Builder* out, const float& in) -> void {
-        out->setValue(in);
-    };
-
-    auto delays = model.getDelays();
-    FromNdMatrix<4, VprFloatEntry, float>(&delays, base_delay_model_->delays(), fromFloat);
-
-    // Non-scalar capnproto fields should be first initialized with
-    // init<field  name>(count), and then accessed from the returned
-    // std::vector-like Builder object (specifically capnp::List<X>::Builder).
-    auto overrides = model.initDelayOverrides(delay_overrides_.size());
-    auto dst_iter = overrides.begin();
-    for (const auto& src : delay_overrides_) {
-        auto elem = *dst_iter++;
-        elem.setFromType(src.first.from_type);
-        elem.setToType(src.first.to_type);
-        elem.setFromClass(src.first.from_class);
-        elem.setToClass(src.first.to_class);
-        elem.setDeltaX(src.first.delta_x);
-        elem.setDeltaY(src.first.delta_y);
-
-        elem.setDelay(src.second);
-    }
-
-    writeMessageToFile(file, &builder);
-#endif
-}
-
diff --git a/vpr/src/place/timing/delay_model/override_delay_model.h b/vpr/src/place/timing/delay_model/override_delay_model.h
deleted file mode 100644
index 5965261c272..00000000000
--- a/vpr/src/place/timing/delay_model/override_delay_model.h
+++ /dev/null
@@ -1,112 +0,0 @@
-
-#pragma once
-
-#include "place_delay_model.h"
-#include "delta_delay_model.h"
-
-class OverrideDelayModel : public PlaceDelayModel {
-  public:
-    OverrideDelayModel(float min_cross_layer_delay,
-                       bool is_flat)
-        : cross_layer_delay_(min_cross_layer_delay)
-        , is_flat_(is_flat) {}
-
-    void compute(RouterDelayProfiler& route_profiler,
-                 const t_placer_opts& placer_opts,
-                 const t_router_opts& router_opts,
-                 int longest_length) override;
-
-    /**
-     * @brief returns delay from the specified (x,y) to the specified (x,y) with both endpoints on layer_num and the
-     * specified from and to pins
-     */
-    float delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const override;
-
-    void dump_echo(std::string filepath) const override;
-
-    void read(const std::string& file) override;
-    void write(const std::string& file) const override;
-
-  public: //Mutators
-    void set_base_delay_model(std::unique_ptr<DeltaDelayModel> base_delay_model);
-    const DeltaDelayModel* base_delay_model() const;
-    float get_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y) const;
-    void set_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y, float delay);
-
-  private:
-    std::unique_ptr<DeltaDelayModel> base_delay_model_;
-    /// Minimum delay of cross-layer connections
-    float cross_layer_delay_;
-
-    /// Indicates whether the router is a two-stage or run-flat
-    bool is_flat_;
-
-    void compute_override_delay_model_(RouterDelayProfiler& router,
-                                       const t_router_opts& router_opts);
-
-    /**
-     * @brief Structure that allows delays to be queried from the delay model.
-     *
-     * Delay is calculated given the origin physical tile, the origin
-     * pin, the destination physical tile, and the destination pin.
-     * This structure encapsulates all these information.
-     *
-     *   @param from_type, to_type
-     *              Physical tile index (for easy array access)
-     *   @param from_class, to_class
-     *              The class that the pins belongs to.
-     *   @param to_x, to_y
-     *              The horizontal and vertical displacement
-     *              between two physical tiles.
-     */
-    struct t_override {
-        short from_type;
-        short to_type;
-        short from_class;
-        short to_class;
-        short delta_x;
-        short delta_y;
-
-        /**
-         * @brief Comparison operator designed for performance.
-         *
-         * Operator< is important since t_override serves as the key into the
-         * map structure delay_overrides_. A default comparison operator would
-         * not be inlined by the compiler.
-         *
-         * A combination of ALWAYS_INLINE attribute and std::lexicographical_compare
-         * is required for operator< to be inlined by compiler. Proper inlining of
-         * the function reduces place time by around 5%.
-         *
-         * For more information: https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1225
-         */
-        friend ALWAYS_INLINE bool operator<(const t_override& lhs, const t_override& rhs) {
-            const short* left = reinterpret_cast<const short*>(&lhs);
-            const short* right = reinterpret_cast<const short*>(&rhs);
-            constexpr size_t NUM_T_OVERRIDE_MEMBERS = sizeof(t_override) / sizeof(short);
-            return std::lexicographical_compare(left, left + NUM_T_OVERRIDE_MEMBERS, right, right + NUM_T_OVERRIDE_MEMBERS);
-        }
-    };
-
-    /**
-     * @brief Map data structure that returns delay values according to
-     *        specific delay model queries.
-     *
-     * Delay model queries are provided by the t_override structure, which
-     * encapsulates the information regarding the origin and the destination.
-     */
-    vtr::flat_map2<t_override, float> delay_overrides_;
-
-    /**
-     * operator< treats memory layout of t_override as an array of short.
-     * This requires all members of t_override are shorts and there is no
-     * padding between members of t_override.
-     */
-    static_assert(sizeof(t_override) == sizeof(t_override::from_type) + sizeof(t_override::to_type) + sizeof(t_override::from_class) + sizeof(t_override::to_class) + sizeof(t_override::delta_x) + sizeof(t_override::delta_y), "Expect t_override to have a memory layout equivalent to an array of short (no padding)");
-    static_assert(sizeof(t_override::from_type) == sizeof(short), "Expect all t_override data members to be shorts");
-    static_assert(sizeof(t_override::to_type) == sizeof(short), "Expect all t_override data members to be shorts");
-    static_assert(sizeof(t_override::from_class) == sizeof(short), "Expect all t_override data members to be shorts");
-    static_assert(sizeof(t_override::to_class) == sizeof(short), "Expect all t_override data members to be shorts");
-    static_assert(sizeof(t_override::delta_x) == sizeof(short), "Expect all t_override data members to be shorts");
-    static_assert(sizeof(t_override::delta_y) == sizeof(short), "Expect all t_override data members to be shorts");
-};
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/place_delay_model.cpp b/vpr/src/place/timing/delay_model/place_delay_model.cpp
deleted file mode 100644
index 04267e0e5f1..00000000000
--- a/vpr/src/place/timing/delay_model/place_delay_model.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/**
- * @file place_delay_model.cpp
- * @brief This file implements all the class methods and individual
- *        routines related to the placer delay model.
- */
-
-#include "place_delay_model.h"
-
-#include "globals.h"
-#include "router_lookahead_map.h"
-#include "placer_state.h"
-#include "vpr_error.h"
-
-/**
- * @brief Returns the delay of one point to point connection.
- *
- * Only estimate delay for signals routed through the inter-block routing network.
- * TODO: Do how should we compute the delay for globals. "Global signals are assumed to have zero delay."
- */
-float comp_td_single_connection_delay(const PlaceDelayModel* delay_model,
-                                      const vtr::vector_map<ClusterBlockId, t_block_loc>& block_locs,
-                                      ClusterNetId net_id,
-                                      int ipin) {
-    const auto& cluster_ctx = g_vpr_ctx.clustering();
-
-    float delay_source_to_sink = 0.;
-
-    if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {
-        ClusterPinId source_pin = cluster_ctx.clb_nlist.net_driver(net_id);
-        ClusterPinId sink_pin = cluster_ctx.clb_nlist.net_pin(net_id, ipin);
-
-        ClusterBlockId source_block = cluster_ctx.clb_nlist.pin_block(source_pin);
-        ClusterBlockId sink_block = cluster_ctx.clb_nlist.pin_block(sink_pin);
-
-        int source_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(source_pin);
-        int sink_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(sink_pin);
-
-        t_pl_loc source_block_loc = block_locs[source_block].loc;
-        t_pl_loc sink_block_loc = block_locs[sink_block].loc;
-
-        /**
-         * This heuristic only considers delta_x and delta_y, a much better
-         * heuristic would be to to create a more comprehensive lookup table.
-         *
-         * In particular this approach does not accurately capture the effect
-         * of fast carry-chain connections.
-         */
-        delay_source_to_sink = delay_model->delay({source_block_loc.x, source_block_loc.y, source_block_loc.layer}, source_block_ipin,
-                                                  {sink_block_loc.x, sink_block_loc.y, sink_block_loc.layer}, sink_block_ipin);
-        if (delay_source_to_sink < 0) {
-            VPR_ERROR(VPR_ERROR_PLACE,
-                      "in comp_td_single_connection_delay: Bad delay_source_to_sink value %g from %s (at %d,%d,%d) to %s (at %d,%d,%d)\n"
-                      "in comp_td_single_connection_delay: Delay is less than 0\n",
-                      block_type_pin_index_to_name(physical_tile_type(source_block_loc), source_block_ipin, false).c_str(),
-                      source_block_loc.x, source_block_loc.y, source_block_loc.layer,
-                      block_type_pin_index_to_name(physical_tile_type(sink_block_loc), sink_block_ipin, false).c_str(),
-                      sink_block_loc.x, sink_block_loc.y, sink_block_loc.layer,
-                      delay_source_to_sink);
-        }
-    }
-
-    return (delay_source_to_sink);
-}
-
-///@brief Recompute all point to point delays, updating `connection_delay` matrix.
-void comp_td_connection_delays(const PlaceDelayModel* delay_model,
-                               PlacerState& placer_state) {
-    const auto& cluster_ctx = g_vpr_ctx.clustering();
-    auto& p_timing_ctx = placer_state.mutable_timing();
-    auto& block_locs = placer_state.block_locs();
-    auto& connection_delay = p_timing_ctx.connection_delay;
-
-    for (ClusterNetId net_id : cluster_ctx.clb_nlist.nets()) {
-        for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ++ipin) {
-            connection_delay[net_id][ipin] = comp_td_single_connection_delay(delay_model, block_locs, net_id, ipin);
-        }
-    }
-}
diff --git a/vpr/src/place/timing/delay_model/place_delay_model.h b/vpr/src/place/timing/delay_model/place_delay_model.h
deleted file mode 100644
index 27c89591071..00000000000
--- a/vpr/src/place/timing/delay_model/place_delay_model.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * @file place_delay_model.h
- * @brief This file contains all the class and function declarations related to
- *        the placer delay model. For implementations, see place_delay_model.cpp.
- */
-
-#pragma once
-
-#include "vtr_ndmatrix.h"
-#include "vtr_flat_map.h"
-#include "vpr_types.h"
-#include "router_delay_profiling.h"
-
-#ifndef __has_attribute
-#    define __has_attribute(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#if defined(COMPILER_GCC) && defined(NDEBUG)
-#    define ALWAYS_INLINE inline __attribute__((__always_inline__))
-#elif defined(COMPILER_MSVC) && defined(NDEBUG)
-#    define ALWAYS_INLINE __forceinline
-#elif __has_attribute(always_inline)
-#    define ALWAYS_INLINE __attribute__((always_inline)) // clang
-#else
-#    define ALWAYS_INLINE inline
-#endif
-
-///@brief Forward declarations.
-class PlaceDelayModel;
-class PlacerState;
-
-///@brief Returns the delay of one point to point connection.
-float comp_td_single_connection_delay(const PlaceDelayModel* delay_model,
-                                      const vtr::vector_map<ClusterBlockId, t_block_loc>& block_locs,
-                                      ClusterNetId net_id,
-                                      int ipin);
-
-///@brief Recompute all point to point delays, updating `connection_delay` matrix.
-void comp_td_connection_delays(const PlaceDelayModel* delay_model,
-                               PlacerState& placer_state);
-
-///@brief Abstract interface to a placement delay model.
-class PlaceDelayModel {
-  public:
-    virtual ~PlaceDelayModel() = default;
-
-    ///@brief Computes place delay model.
-    virtual void compute(RouterDelayProfiler& route_profiler,
-                         const t_placer_opts& placer_opts,
-                         const t_router_opts& router_opts,
-                         int longest_length)
-        = 0;
-
-    /**
-     * @brief Returns the delay estimate between the specified block pins.
-     *
-     * Either compute or read methods must be invoked before invoking delay.
-     */
-    virtual float delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const = 0;
-
-    ///@brief Dumps the delay model to an echo file.
-    virtual void dump_echo(std::string filename) const = 0;
-
-    /**
-     * @brief Write place delay model to specified file.
-     *
-     * May be unimplemented, in which case method should throw an exception.
-     */
-    virtual void write(const std::string& file) const = 0;
-
-    /**
-     * @brief Read place delay model from specified file.
-     *
-     * May be unimplemented, in which case method should throw an exception.
-     */
-    virtual void read(const std::string& file) = 0;
-};
-
-
-
diff --git a/vpr/src/place/timing/delay_model/simple_delay_model.cpp b/vpr/src/place/timing/delay_model/simple_delay_model.cpp
deleted file mode 100644
index dac18890366..00000000000
--- a/vpr/src/place/timing/delay_model/simple_delay_model.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-
-#include "simple_delay_model.h"
-
-#ifdef VTR_ENABLE_CAPNPROTO
-#    include "capnp/serialize.h"
-#    include "place_delay_model.capnp.h"
-#    include "ndmatrix_serdes.h"
-#    include "mmap_file.h"
-#    include "serdes_utils.h"
-#endif  // VTR_ENABLE_CAPNPROTO
-
-void SimpleDelayModel::compute(RouterDelayProfiler& route_profiler,
-                               const t_placer_opts& /*placer_opts*/,
-                               const t_router_opts& /*router_opts*/,
-                               int /*longest_length*/) {
-    const auto& grid = g_vpr_ctx.device().grid;
-    const size_t num_physical_tile_types = g_vpr_ctx.device().physical_tile_types.size();
-    const size_t num_layers = grid.get_num_layers();
-
-    // Initializing the delay matrix to [num_physical_types][num_layers][num_layers][width][height]
-    // The second index related to the layer that the source location is on and the third index is for the sink layer
-    delays_ = vtr::NdMatrix<float, 5>({num_physical_tile_types,
-                                       num_layers,
-                                       num_layers,
-                                       grid.width(),
-                                       grid.height()});
-
-    for (size_t physical_tile_type_idx = 0; physical_tile_type_idx < num_physical_tile_types; ++physical_tile_type_idx) {
-        for (size_t from_layer = 0; from_layer < num_layers; ++from_layer) {
-            for (size_t to_layer = 0; to_layer < num_layers; ++to_layer) {
-                for (size_t dx = 0; dx < grid.width(); ++dx) {
-                    for (size_t dy = 0; dy < grid.height(); ++dy) {
-                        float min_delay = route_profiler.get_min_delay(physical_tile_type_idx,
-                                                                       from_layer,
-                                                                       to_layer,
-                                                                       dx,
-                                                                       dy);
-                        delays_[physical_tile_type_idx][from_layer][to_layer][dx][dy] = min_delay;
-                    }
-                }
-            }
-        }
-    }
-}
-
-float SimpleDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const {
-    int delta_x = std::abs(from_loc.x - to_loc.x);
-    int delta_y = std::abs(from_loc.y - to_loc.y);
-
-    int from_tile_idx = g_vpr_ctx.device().grid.get_physical_type(from_loc)->index;
-    return delays_[from_tile_idx][from_loc.layer_num][to_loc.layer_num][delta_x][delta_y];
-}
-
-void SimpleDelayModel::read(const std::string& file) {
-#ifndef VTR_ENABLE_CAPNPROTO
-    VPR_THROW(VPR_ERROR_PLACE,
-              "SimpleDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
-#else
-    // MmapFile object creates an mmap of the specified path, and will munmap
-    // when the object leaves scope.
-    MmapFile f(file);
-
-    /* Increase reader limit to 1G words to allow for large files. */
-    ::capnp::ReaderOptions opts = default_large_capnp_opts();
-
-    // FlatArrayMessageReader is used to read the message from the data array
-    // provided by MmapFile.
-    ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
-
-    // When reading capnproto files the Reader object to use is named
-    // <schema name>::Reader.
-    //
-    // Initially this object is an empty VprDeltaDelayModel.
-    VprDeltaDelayModel::Reader model;
-
-    // The reader.getRoot performs a cast from the generic capnproto to fit
-    // with the specified schema.
-    //
-    // Note that capnproto does not validate that the incoming data matches the
-    // schema.  If this property is required, some form of check would be
-    // required.
-    model = reader.getRoot<VprDeltaDelayModel>();
-
-    auto toFloat = [](float* out, const VprFloatEntry::Reader& in) -> void {
-        *out = in.getValue();
-    };
-
-    // ToNdMatrix is a generic function for converting a Matrix capnproto
-    // to a vtr::NdMatrix.
-    //
-    // The user must supply the matrix dimension (5 in this case), the source
-    // capnproto type (VprFloatEntry),
-    // target C++ type (flat), and a function to convert from the source capnproto
-    // type to the target C++ type (ToFloat).
-    //
-    // The second argument should be of type Matrix<X>::Reader where X is the
-    // capnproto element type.
-    ToNdMatrix<5, VprFloatEntry, float>(&delays_, model.getDelays(), toFloat);
-#endif
-}
-
-void SimpleDelayModel::write(const std::string& file) const {
-#ifndef VTR_ENABLE_CAPNPROTO
-    VPR_THROW(VPR_ERROR_PLACE,
-              "SimpleDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
-#else
-    // MallocMessageBuilder object generates capnproto message builder,
-    // using malloc for buffer allocation.
-    ::capnp::MallocMessageBuilder builder;
-
-    // initRoot<X> returns a X::Builder object that can be used to set the
-    // fields in the message.
-    auto model = builder.initRoot<VprDeltaDelayModel>();
-
-    auto fromFloat = [](VprFloatEntry::Builder* out, const float& in) -> void {
-        out->setValue(in);
-    };
-
-    // FromNdMatrix is a generic function for converting a vtr::NdMatrix to a
-    // Matrix message.  It is the mirror function of ToNdMatrix described in
-    // read above.
-    auto delay_values = model.getDelays();
-    FromNdMatrix<5, VprFloatEntry, float>(&delay_values, delays_, fromFloat);
-
-    // writeMessageToFile writes message to the specified file.
-    writeMessageToFile(file, &builder);
-#endif
-}
diff --git a/vpr/src/place/timing/delay_model/simple_delay_model.h b/vpr/src/place/timing/delay_model/simple_delay_model.h
deleted file mode 100644
index 25dce08c4fc..00000000000
--- a/vpr/src/place/timing/delay_model/simple_delay_model.h
+++ /dev/null
@@ -1,39 +0,0 @@
-
-#pragma once
-
-#include "place_delay_model.h"
-
-/**
- * @class SimpleDelayModel
- * @brief A simple delay model based on the information stored in router lookahead
- * This is in contrast to other placement delay models that get the cost of getting from one location to another by running the router
- */
-class SimpleDelayModel : public PlaceDelayModel {
-  public:
-    SimpleDelayModel() {}
-
-    /// @brief Use the information in the router lookahead to fill the delay matrix instead of running the router
-    void compute(RouterDelayProfiler& router,
-                 const t_placer_opts& placer_opts,
-                 const t_router_opts& router_opts,
-                 int longest_length) override;
-
-    float delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const override;
-
-    void dump_echo(std::string /*filepath*/) const override {}
-
-    void read(const std::string& /*file*/) override;
-    void write(const std::string& /*file*/) const override;
-
-  private:
-    /**
-     * @brief The matrix to store the minimum delay between different points on different layers.
-     *
-     *The matrix used to store delay information is a 5D matrix. This data structure stores the minimum delay for each tile type on each layer to other layers
-     *for each dx and dy. We decided to separate the delay for each physical type on each die to accommodate cases where the connectivity of a physical type differs
-     *on each layer. Additionally, instead of using d_layer, we distinguish between the destination layer to handle scenarios where connectivity between layers
-     *is not uniform. For example, if the number of inter-layer connections between layer 1 and 2 differs from the number of connections between layer 0 and 1.
-     *One might argue that this variability could also occur for dx and dy. However, we are operating under the assumption that the FPGA fabric architecture is regular.
-     */
-    vtr::NdMatrix<float, 5> delays_; // [0..num_physical_type-1][0..num_layers-1][0..num_layers-1][0..max_dx][0..max_dy]
-};
\ No newline at end of file

From 40274a1feaa2e6adb1450b5cc4aa90b4b7cba42e Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Mon, 20 Jan 2025 09:54:50 -0500
Subject: [PATCH 34/39] fix missing terminating " character error

---
 vpr/src/place/delay_model/override_delay_model.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vpr/src/place/delay_model/override_delay_model.cpp b/vpr/src/place/delay_model/override_delay_model.cpp
index 61acd2937b5..3d98fc56f3e 100644
--- a/vpr/src/place/delay_model/override_delay_model.cpp
+++ b/vpr/src/place/delay_model/override_delay_model.cpp
@@ -221,7 +221,7 @@ void OverrideDelayModel::read(const std::string& file) {
 
     base_delay_model_ = std::make_unique<DeltaDelayModel>(cross_layer_delay_, delays, is_flat_);
 
-    // Reading non-scalar capnproto fields is roughly equivilant to using
+    // Reading non-scalar capnproto fields is roughly equivalent to using
     // a std::vector of the field type.  Actual type is capnp::List<X>::Reader.
     auto overrides = model.getDelayOverrides();
     std::vector<std::pair<t_override, float> > overrides_arr(overrides.size());
@@ -245,7 +245,7 @@ void OverrideDelayModel::write(const std::string& file) const {
 #ifndef VTR_ENABLE_CAPNPROTO
     VPR_THROW(VPR_ERROR_PLACE,
               "OverrideDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.\");
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
 #else
     ::capnp::MallocMessageBuilder builder;
     auto model = builder.initRoot<VprOverrideDelayModel>();

From e383c988e1d5f7a5bb652a46ff76e397aa660268 Mon Sep 17 00:00:00 2001
From: soheil <soheilqs@gmail.com>
Date: Mon, 20 Jan 2025 11:44:13 -0500
Subject: [PATCH 35/39] fix two other missing terminating " character error
 when capnproto is disabled

---
 vpr/src/place/delay_model/override_delay_model.cpp | 4 ++--
 vpr/src/place/delay_model/simple_delay_model.cpp   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vpr/src/place/delay_model/override_delay_model.cpp b/vpr/src/place/delay_model/override_delay_model.cpp
index 3d98fc56f3e..c7f8ac10e81 100644
--- a/vpr/src/place/delay_model/override_delay_model.cpp
+++ b/vpr/src/place/delay_model/override_delay_model.cpp
@@ -202,8 +202,8 @@ void OverrideDelayModel::set_base_delay_model(std::unique_ptr<DeltaDelayModel> b
 void OverrideDelayModel::read(const std::string& file) {
 #ifndef VTR_ENABLE_CAPNPROTO
     VPR_THROW(VPR_ERROR_PLACE,
-          "OverrideDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-          "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
+              "OverrideDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
 #else
     MmapFile f(file);
 
diff --git a/vpr/src/place/delay_model/simple_delay_model.cpp b/vpr/src/place/delay_model/simple_delay_model.cpp
index 1fcd86eca64..dac18890366 100644
--- a/vpr/src/place/delay_model/simple_delay_model.cpp
+++ b/vpr/src/place/delay_model/simple_delay_model.cpp
@@ -55,7 +55,7 @@ void SimpleDelayModel::read(const std::string& file) {
 #ifndef VTR_ENABLE_CAPNPROTO
     VPR_THROW(VPR_ERROR_PLACE,
               "SimpleDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.\");
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
 #else
     // MmapFile object creates an mmap of the specified path, and will munmap
     // when the object leaves scope.
@@ -104,7 +104,7 @@ void SimpleDelayModel::write(const std::string& file) const {
 #ifndef VTR_ENABLE_CAPNPROTO
     VPR_THROW(VPR_ERROR_PLACE,
               "SimpleDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.\");
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
 #else
     // MallocMessageBuilder object generates capnproto message builder,
     // using malloc for buffer allocation.

From 433dc96bcd00e418d972705a9a458463c36addbc Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Mon, 20 Jan 2025 13:41:16 -0500
Subject: [PATCH 36/39] =?UTF-8?q?fix=20invalid=20use=20of=20incomplete=20t?=
 =?UTF-8?q?ype=20=E2=80=98const=20class=20PlacerCriticalities=E2=80=99?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 vpr/src/place/delay_model/delta_delay_model.cpp | 2 +-
 vpr/src/place/timing/PlacerCriticalities.h      | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/vpr/src/place/delay_model/delta_delay_model.cpp b/vpr/src/place/delay_model/delta_delay_model.cpp
index e8d56b09516..f88e4a45003 100644
--- a/vpr/src/place/delay_model/delta_delay_model.cpp
+++ b/vpr/src/place/delay_model/delta_delay_model.cpp
@@ -58,7 +58,7 @@ void DeltaDelayModel::read(const std::string& file) {
 #ifndef VTR_ENABLE_CAPNPROTO
     VPR_THROW(VPR_ERROR_PLACE,
               "OverrideDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.";
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
 #else
 
     // MmapFile object creates an mmap of the specified path, and will munmap
diff --git a/vpr/src/place/timing/PlacerCriticalities.h b/vpr/src/place/timing/PlacerCriticalities.h
index b03bda4eb87..b08499d6ac4 100644
--- a/vpr/src/place/timing/PlacerCriticalities.h
+++ b/vpr/src/place/timing/PlacerCriticalities.h
@@ -6,6 +6,7 @@
 #include "clustered_netlist_utils.h"
 #include "place_delay_model.h"
 #include "vpr_net_pins_matrix.h"
+#include "PlacerCriticalities.h"
 
 /**
  * @brief Saves the placement criticality parameters

From 928ac041dd1eca9df1dd1d69c1afc20a700b74ac Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Mon, 20 Jan 2025 15:26:42 -0500
Subject: [PATCH 37/39] void cast file argument to fix unused parameter warning

---
 vpr/src/place/delay_model/delta_delay_model.cpp    | 4 +++-
 vpr/src/place/delay_model/override_delay_model.cpp | 2 ++
 vpr/src/place/delay_model/simple_delay_model.cpp   | 2 ++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/vpr/src/place/delay_model/delta_delay_model.cpp b/vpr/src/place/delay_model/delta_delay_model.cpp
index f88e4a45003..b58dda8a453 100644
--- a/vpr/src/place/delay_model/delta_delay_model.cpp
+++ b/vpr/src/place/delay_model/delta_delay_model.cpp
@@ -56,6 +56,7 @@ void DeltaDelayModel::dump_echo(std::string filepath) const {
 
 void DeltaDelayModel::read(const std::string& file) {
 #ifndef VTR_ENABLE_CAPNPROTO
+    (void)file;
     VPR_THROW(VPR_ERROR_PLACE,
               "OverrideDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
               "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
@@ -106,9 +107,10 @@ void DeltaDelayModel::read(const std::string& file) {
 
 void DeltaDelayModel::write(const std::string& file) const {
 #ifndef VTR_ENABLE_CAPNPROTO
+    (void)file;
     VPR_THROW(VPR_ERROR_PLACE,
               "DeltaDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
-              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.";
+              "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
 #else
 
     // MallocMessageBuilder object is the generate capnproto message builder,
diff --git a/vpr/src/place/delay_model/override_delay_model.cpp b/vpr/src/place/delay_model/override_delay_model.cpp
index c7f8ac10e81..83141fb4bad 100644
--- a/vpr/src/place/delay_model/override_delay_model.cpp
+++ b/vpr/src/place/delay_model/override_delay_model.cpp
@@ -201,6 +201,7 @@ void OverrideDelayModel::set_base_delay_model(std::unique_ptr<DeltaDelayModel> b
 
 void OverrideDelayModel::read(const std::string& file) {
 #ifndef VTR_ENABLE_CAPNPROTO
+    (void)file;
     VPR_THROW(VPR_ERROR_PLACE,
               "OverrideDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
               "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
@@ -243,6 +244,7 @@ void OverrideDelayModel::read(const std::string& file) {
 
 void OverrideDelayModel::write(const std::string& file) const {
 #ifndef VTR_ENABLE_CAPNPROTO
+    (void)file;
     VPR_THROW(VPR_ERROR_PLACE,
               "OverrideDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
               "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
diff --git a/vpr/src/place/delay_model/simple_delay_model.cpp b/vpr/src/place/delay_model/simple_delay_model.cpp
index dac18890366..72a0d017d1e 100644
--- a/vpr/src/place/delay_model/simple_delay_model.cpp
+++ b/vpr/src/place/delay_model/simple_delay_model.cpp
@@ -53,6 +53,7 @@ float SimpleDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pi
 
 void SimpleDelayModel::read(const std::string& file) {
 #ifndef VTR_ENABLE_CAPNPROTO
+    (void)file;
     VPR_THROW(VPR_ERROR_PLACE,
               "SimpleDelayModel::read is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
               "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");
@@ -102,6 +103,7 @@ void SimpleDelayModel::read(const std::string& file) {
 
 void SimpleDelayModel::write(const std::string& file) const {
 #ifndef VTR_ENABLE_CAPNPROTO
+    (void)file;
     VPR_THROW(VPR_ERROR_PLACE,
               "SimpleDelayModel::write is disabled because VTR_ENABLE_CAPNPROTO=OFF. "
               "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable.");

From cd7474ff11869290aa27912ba44f1ecd0d7d13f5 Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Mon, 20 Jan 2025 16:41:02 -0500
Subject: [PATCH 38/39] =?UTF-8?q?fix=20error:=20invalid=20use=20of=20incom?=
 =?UTF-8?q?plete=20type=20=E2=80=98const=20class=20PlacerCriticalities?=
 =?UTF-8?q?=E2=80=99?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 vpr/src/place/move_utils.cpp               | 1 +
 vpr/src/place/timing/PlacerCriticalities.h | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/vpr/src/place/move_utils.cpp b/vpr/src/place/move_utils.cpp
index 6e79bdaac4d..893e19243d8 100644
--- a/vpr/src/place/move_utils.cpp
+++ b/vpr/src/place/move_utils.cpp
@@ -10,6 +10,7 @@
 
 #include "place_constraints.h"
 #include "placer_state.h"
+#include "PlacerCriticalities.h"
 
 //f_placer_breakpoint_reached is used to stop the placer when a breakpoint is reached.
 // When this flag is true, it stops the placer after the current perturbation. Thus, when a breakpoint is reached, this flag is set to true.
diff --git a/vpr/src/place/timing/PlacerCriticalities.h b/vpr/src/place/timing/PlacerCriticalities.h
index b08499d6ac4..b03bda4eb87 100644
--- a/vpr/src/place/timing/PlacerCriticalities.h
+++ b/vpr/src/place/timing/PlacerCriticalities.h
@@ -6,7 +6,6 @@
 #include "clustered_netlist_utils.h"
 #include "place_delay_model.h"
 #include "vpr_net_pins_matrix.h"
-#include "PlacerCriticalities.h"
 
 /**
  * @brief Saves the placement criticality parameters

From 56568302c145d3dbdf583390621e0e3d579078ee Mon Sep 17 00:00:00 2001
From: soheilshahrouz <soheilqs@gmail.com>
Date: Tue, 21 Jan 2025 14:24:35 -0500
Subject: [PATCH 39/39] =?UTF-8?q?fix=20error:=20=E2=80=98ConnectionRouter?=
 =?UTF-8?q?=E2=80=99=20was=20not=20declared=20in=20this=20scope?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 vpr/test/test_connection_router.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vpr/test/test_connection_router.cpp b/vpr/test/test_connection_router.cpp
index 2b584daedc3..5319cc05818 100644
--- a/vpr/test/test_connection_router.cpp
+++ b/vpr/test/test_connection_router.cpp
@@ -8,6 +8,8 @@
 #include "globals.h"
 #include "net_delay.h"
 #include "place_and_route.h"
+#include "connection_router.h"
+#include "router_delay_profiling.h"
 
 static constexpr const char kArchFile[] = "../../vtr_flow/arch/timing/k6_frac_N10_mem32K_40nm.xml";
 static constexpr int kMaxHops = 10;