Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
b528ca2
Move some tests to use skipped smaller data
hannahbaumann Jan 16, 2026
a477bc1
Test out zenodo dealings
hannahbaumann Jan 16, 2026
ad84082
Try to improbe speed
hannahbaumann Jan 16, 2026
8ba8087
Try removing locking
hannahbaumann Jan 16, 2026
ead7951
Run downloads before the testing to have a single download for all th…
hannahbaumann Jan 19, 2026
f898a35
add import pooch
hannahbaumann Jan 19, 2026
c675a5c
Test out more
hannahbaumann Jan 19, 2026
88e456d
Ensure datasets get closed
hannahbaumann Jan 19, 2026
73a8e4d
Move to per test download again
hannahbaumann Jan 19, 2026
43aaca2
Remove commented out lines
hannahbaumann Jan 21, 2026
c165525
Test out adding an extra slash
hannahbaumann Jan 21, 2026
5f17770
Switch to all version doi
hannahbaumann Jan 21, 2026
c28286e
Download url directly
hannahbaumann Jan 21, 2026
197b6ba
Small fix
hannahbaumann Jan 21, 2026
b45390a
Change url
hannahbaumann Jan 21, 2026
1d70936
Add missing s
hannahbaumann Jan 21, 2026
20084c3
Switch to api url
hannahbaumann Jan 21, 2026
a9a8780
Update tests for new results
hannahbaumann Jan 23, 2026
c34c97c
Update conftest
hannahbaumann Jan 26, 2026
0161673
Update to v2
hannahbaumann Jan 26, 2026
9b6ca69
Update tests
hannahbaumann Jan 26, 2026
1d5c849
Update rmsd test, currently large rmsd till rmsd fix comes in
hannahbaumann Jan 26, 2026
f4e88e2
Make last test pass
hannahbaumann Jan 26, 2026
bd0c8ee
Switch to zenodo fetch
hannahbaumann Jan 26, 2026
ba4c912
remove lines
hannahbaumann Jan 26, 2026
157c02f
Update tests with large errors multichain failure
hannahbaumann Jan 27, 2026
98ea023
Apply suggestion from @hannahbaumann
hannahbaumann Jan 28, 2026
c5b2d70
Reuse zenodo specification
hannahbaumann Jan 28, 2026
54576ab
reorder install
hannahbaumann Jan 28, 2026
3aa52a5
Small fix
hannahbaumann Jan 28, 2026
ff6991a
Remove flaky retries
hannahbaumann Jan 28, 2026
7a30f69
Small fix
hannahbaumann Jan 28, 2026
1b2c488
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 29, 2026
2a1d130
Apply review suggestions
hannahbaumann Jan 30, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,33 @@ jobs:
run: |
python -m pip install --no-deps .

- name: Cache Pooch data
uses: actions/cache@v4
with:
path: |
# Linux cache location
~/.cache/openfe_analysis
# macOS cache location
~/Library/Caches/openfe_analysis
key: pooch-${{ matrix.os }}-v2

- name: "Download Zenodo data"
run: |
python - <<'EOF'
import pooch
from openfe_analysis.tests.conftest import ZENODO_DOI, ZENODO_FILES

zenodo = pooch.create(
path=pooch.os_cache('openfe_analysis'),
base_url=ZENODO_DOI,
registry=ZENODO_FILES,
Comment on lines +73 to +74
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

glad you're doing this! it's what I'm moving us toward on the openfe side.

)

for fname in ZENODO_FILES:
zenodo.fetch(fname, processor=pooch.Untar())

EOF

- name: "Test imports"
run: |
python -Ic "import openfe_analysis; print(openfe_analysis.__version__)"
Expand Down
3 changes: 2 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@ dependencies:
- pyyaml
# for testing
- coverage
- pooch
- pytest
- pytest-cov
- pytest-xdist
- pytest-rerunfailures
- pip:
- git+https://github.com/fatiando/pooch@main # related to https://github.com/fatiando/pooch/issues/502
6 changes: 4 additions & 2 deletions src/openfe_analysis/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,5 +193,7 @@ def _reopen(self):
self._frame_index = -1

def close(self):
    """Close the underlying NetCDF dataset, if any.

    Idempotent: safe to call multiple times. The dataset is only closed
    when this reader owns it (``self._dataset_owner``); either way the
    reference is dropped so later calls are no-ops and the handle cannot
    be used again through this object.
    """
    if self._dataset is not None:
        if self._dataset_owner:
            # Only the owner may close; shared datasets are left open for
            # their other users.
            self._dataset.close()
        # Drop the reference so a second close() is a harmless no-op.
        self._dataset = None
157 changes: 79 additions & 78 deletions src/openfe_analysis/rmsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,87 +97,88 @@ def gather_rms_data(
"protein_2D_RMSD": [],
}

ds = nc.Dataset(dataset)
n_lambda = ds.dimensions["state"].size

# If you're using a new multistate nc file, you need to account for
# the position skip rate.
if hasattr(ds, "PositionInterval"):
n_frames = len(range(0, ds.dimensions["iteration"].size, ds.PositionInterval))
else:
n_frames = ds.dimensions["iteration"].size

if skip is None:
# find skip that would give ~500 frames of output
# max against 1 to avoid skip=0 case
skip = max(n_frames // 500, 1)

pb = tqdm.tqdm(total=int(n_frames / skip) * n_lambda)

u_top = mda.Universe(pdb_topology)

for i in range(n_lambda):
# cheeky, but we can read the PDB topology once and reuse per universe
# this then only hits the PDB file once for all replicas
u = make_Universe(u_top._topology, ds, state=i)

prot = u.select_atoms("protein and name CA")
ligand = u.select_atoms("resname UNK")

# save coordinates for 2D RMSD matrix
# TODO: Some smart guard to avoid allocating a silly amount of memory?
prot2d = np.empty((len(u.trajectory[::skip]), len(prot), 3), dtype=np.float32)

prot_start = prot.positions
# prot_weights = prot.masses / np.mean(prot.masses)
ligand_start = ligand.positions
ligand_initial_com = ligand.center_of_mass()
ligand_weights = ligand.masses / np.mean(ligand.masses)

this_protein_rmsd = []
this_ligand_rmsd = []
this_ligand_wander = []

for ts_i, ts in enumerate(u.trajectory[::skip]):
pb.update()
# Open the NetCDF file safely using a context manager
with nc.Dataset(dataset) as ds:
n_lambda = ds.dimensions["state"].size

# If you're using a new multistate nc file, you need to account for
# the position skip rate.
if hasattr(ds, "PositionInterval"):
n_frames = len(range(0, ds.dimensions["iteration"].size, ds.PositionInterval))
else:
n_frames = ds.dimensions["iteration"].size

if skip is None:
# find skip that would give ~500 frames of output
# max against 1 to avoid skip=0 case
skip = max(n_frames // 500, 1)

pb = tqdm.tqdm(total=int(n_frames / skip) * n_lambda)

u_top = mda.Universe(pdb_topology)

for i in range(n_lambda):
# cheeky, but we can read the PDB topology once and reuse per universe
# this then only hits the PDB file once for all replicas
u = make_Universe(u_top._topology, ds, state=i)

prot = u.select_atoms("protein and name CA")
ligand = u.select_atoms("resname UNK")

# save coordinates for 2D RMSD matrix
# TODO: Some smart guard to avoid allocating a silly amount of memory?
prot2d = np.empty((len(u.trajectory[::skip]), len(prot), 3), dtype=np.float32)

prot_start = prot.positions
# prot_weights = prot.masses / np.mean(prot.masses)
ligand_start = ligand.positions
ligand_initial_com = ligand.center_of_mass()
ligand_weights = ligand.masses / np.mean(ligand.masses)

this_protein_rmsd = []
this_ligand_rmsd = []
this_ligand_wander = []

for ts_i, ts in enumerate(u.trajectory[::skip]):
pb.update()

if prot:
prot2d[ts_i, :, :] = prot.positions
this_protein_rmsd.append(
rms.rmsd(
prot.positions,
prot_start,
None, # prot_weights,
center=False,
superposition=False,
)
)
if ligand:
this_ligand_rmsd.append(
rms.rmsd(
ligand.positions,
ligand_start,
ligand_weights,
center=False,
superposition=False,
)
)
this_ligand_wander.append(
# distance between start and current ligand position
# ignores PBC, but we've already centered the traj
mda.lib.distances.calc_bonds(ligand.center_of_mass(), ligand_initial_com)
)

if prot:
prot2d[ts_i, :, :] = prot.positions
this_protein_rmsd.append(
rms.rmsd(
prot.positions,
prot_start,
None, # prot_weights,
center=False,
superposition=False,
)
)
# can ignore weights here as it's all Ca
rmsd2d = twoD_RMSD(prot2d, w=None) # prot_weights)
output["protein_RMSD"].append(this_protein_rmsd)
output["protein_2D_RMSD"].append(rmsd2d)
if ligand:
this_ligand_rmsd.append(
rms.rmsd(
ligand.positions,
ligand_start,
ligand_weights,
center=False,
superposition=False,
)
)
this_ligand_wander.append(
# distance between start and current ligand position
# ignores PBC, but we've already centered the traj
mda.lib.distances.calc_bonds(ligand.center_of_mass(), ligand_initial_com)
)

if prot:
# can ignore weights here as it's all Ca
rmsd2d = twoD_RMSD(prot2d, w=None) # prot_weights)
output["protein_RMSD"].append(this_protein_rmsd)
output["protein_2D_RMSD"].append(rmsd2d)
if ligand:
output["ligand_RMSD"].append(this_ligand_rmsd)
output["ligand_wander"].append(this_ligand_wander)

output["time(ps)"] = list(np.arange(len(u.trajectory))[::skip] * u.trajectory.dt)
output["ligand_RMSD"].append(this_ligand_rmsd)
output["ligand_wander"].append(this_ligand_wander)

output["time(ps)"] = list(np.arange(len(u.trajectory))[::skip] * u.trajectory.dt)

return output

Expand Down
49 changes: 31 additions & 18 deletions src/openfe_analysis/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,62 @@
import pathlib
from importlib import resources

import pathlib
import pooch
import pytest

# Zenodo concept DOI ("all versions") for the test datasets. Also imported
# by the CI workflow to pre-warm the download cache.
ZENODO_DOI = "doi:10.5281/zenodo.18378051"

# Archive name -> md5 checksum; pooch uses this registry to verify downloads.
ZENODO_FILES = {
    "openfe_analysis_simulation_output.tar.gz": "md5:7f0babaac3dc8f7dd2db63cb79dff00f",
    "openfe_analysis_skipped.tar.gz": "md5:ac42219bde9da3641375adf3a9ddffbf",
}

# Per-user OS cache directory; created eagerly so pooch can write into it.
POOCH_CACHE = pathlib.Path(pooch.os_cache("openfe_analysis"))
POOCH_CACHE.mkdir(parents=True, exist_ok=True)

# Shared downloader for the RBFE test data.
ZENODO_RBFE_DATA = pooch.create(
    path=POOCH_CACHE,
    base_url=ZENODO_DOI,
    registry=ZENODO_FILES,
)


def _fetch_and_untar(dirname: str) -> pathlib.Path:
    """Fetch ``{dirname}.tar.gz`` from Zenodo (cached) and return the
    unpacked directory.

    ``pooch.Untar`` extracts into ``<archive>.tar.gz.untar/`` inside the
    cache; the archive is assumed to contain a single top-level directory
    named ``dirname`` — TODO confirm against the Zenodo record layout.
    """
    ZENODO_RBFE_DATA.fetch(f"{dirname}.tar.gz", processor=pooch.Untar())
    # Join with pathlib rather than an f-string so separators are handled
    # portably; POOCH_CACHE is already a pathlib.Path.
    return POOCH_CACHE / f"{dirname}.tar.gz.untar" / dirname


@pytest.fixture(scope="session")
def rbfe_output_data_dir() -> pathlib.Path:
    """Session-scoped path to the untarred full simulation output dataset.

    Downloads from Zenodo on first use; subsequent uses hit the pooch cache.
    """
    return _fetch_and_untar("openfe_analysis_simulation_output")


@pytest.fixture(scope="session")
def rbfe_skipped_data_dir() -> pathlib.Path:
    """Session-scoped path to the untarred "skipped" (smaller) dataset.

    Downloads from Zenodo on first use; subsequent uses hit the pooch cache.
    """
    return _fetch_and_untar("openfe_analysis_skipped")


@pytest.fixture(scope="session")
def simulation_nc(rbfe_output_data_dir) -> pathlib.Path:
    """Path to ``simulation.nc`` inside the full output dataset."""
    return rbfe_output_data_dir / "simulation.nc"


@pytest.fixture(scope="session")
def simulation_skipped_nc(rbfe_skipped_data_dir) -> pathlib.Path:
    """Path to ``simulation.nc`` inside the skipped dataset."""
    return rbfe_skipped_data_dir / "simulation.nc"


@pytest.fixture(scope="session")
def hybrid_system_pdb(rbfe_output_data_dir) -> pathlib.Path:
    """Path to ``hybrid_system.pdb`` inside the full output dataset."""
    return rbfe_output_data_dir / "hybrid_system.pdb"


@pytest.fixture(scope="session")
def hybrid_system_skipped_pdb(rbfe_skipped_data_dir) -> pathlib.Path:
    """Path to ``hybrid_system.pdb`` inside the skipped dataset."""
    return rbfe_skipped_data_dir / "hybrid_system.pdb"


@pytest.fixture(scope="session")
Expand Down
Loading