From f01e1edc1fc319375b61dcb38b8f713a8f34ee68 Mon Sep 17 00:00:00 2001 From: martinkilbinger Date: Tue, 6 May 2025 17:46:53 +0200 Subject: [PATCH 01/11] script to add multi-band & photo-z cats --- notebooks/demo_add_bands.py | 186 ++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 notebooks/demo_add_bands.py diff --git a/notebooks/demo_add_bands.py b/notebooks/demo_add_bands.py new file mode 100644 index 0000000..d20b04e --- /dev/null +++ b/notebooks/demo_add_bands.py @@ -0,0 +1,186 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.15.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# # Demo notebook to add (u,g,i,z,z2) bands to an r-band catalogue + +# %reload_ext autoreload +# %autoreload 2 + +# + +import os +import numpy as np +import numpy.lib.recfunctions as rfn + +from timeit import default_timer as timer +import tqdm +import healsparse as hsp +from astropy.io import fits + +from sp_validation import run_joint_cat as sp_joint +# - + +# Create instance of object +obj = sp_joint.BaseCat() + + +# + +# Set parameters +base = "unions_shapepipe_comprehensive" +year = 2024 +ver = "v1.5.c.P37" + +obj._params = {} + +obj._params["input_path"] = f"{base}_{year}_{ver}.hdf5" +obj._params["output_path"] = f"{base}_ugriz_{year}_{ver}.hdf5" +obj._params["verbose"] = True + +# + +#path_bands = "../UNIONS5000" +path_bands = "." +subdir_base = "UNIONS." + +path_base = subdir_base +path_suff = "_SP_ugriz_photoz_ext.cat" + +# NUMBER key in photo-z catalogue +key_num = "SeqNr" + +keys_mag = [f"MAG_GAAP_0p7_{band}" for band in ("u", "g", "r", "i", "z", "z2")] + +keys = ["Z_B", "Z_B_MIN", "Z_B_MAX", "T_B"] + keys_mag + +hdu_no = 1 +# - + +# ## Run + +# + +# Check parameter validity +#obj.check_params() + +# Update parameters (here: strings to list) +#obj.update_params() +# - + +# Read catalogue +dat = obj.read_cat(load_into_memory=False, mode="r") + +# + +# Get tile IDs +tile_IDs_raw = dat["TILE_ID"] +tile_IDs_raw_list = list(set(tile_IDs_raw)) + +# Transform (back) to 2x3 digits by zero-padding +tile_IDs = [f"{float(tile_ID):07.3f}" for tile_ID in tile_IDs_raw_list] +# - + +import copy + +# + +dist_sqr = {} +do_dist_check = False + +n_rows = len(dat) + +# Loop over tile IDs +for idx, tile_ID in tqdm.tqdm(enumerate(tile_IDs), total=len(tile_IDs), disable=True): + + print(idx/len(tile_ID), tile_ID) + + src = os.path.join(path_bands, f"{path_base}{tile_ID}", f"{path_base}{tile_ID}{path_suff}") + dst = os.path.join(path_bands, f".", f"{path_base}{tile_ID}{path_suff}") + + print(" Copy FITS file:", src, end=" ") + start = timer() + copy.copy(src, dst) + end = timer() + print(f" {end - start:.1f}s") + + path = dst + + print(" Read data from file:", path, end=" ") + start = timer() + hdu_list = fits.open(path) + dat_mb = hdu_list[hdu_no].data + end = timer() + print(f" {end - start:.1f}s") + + print(" Get numbers", end=" ") + start = timer() + numbers = dat_mb[key_num] + end = timer() + print(f" {end - start:.1f}s") + + print(" Identify matches", end= " ") + start = timer() + # Select indices in dat with current tile ID + w = dat["TILE_ID"] == tile_IDs_raw_list[idx] + indices = np.where(w)[0] + end = timer() + print(f" {end - start:.1f}s") + + print(" Compute distance check", end=" ") + start = timer() + # Compute coordinate distances as matching check + if do_dist_check: + dist_sqr[TILE_ID] = sum( + (dat[indices]["RA"] - 
dat_mb["ALPHA_J2000"]) ** 2 + + (dat[indices]["Dec"] - dat_mb["DELTA_J2000"]) ** 2 + ) / len(dat_mb) + end = timer() + print(f" {end - start:.1f}s") + + if idx == 0: + print(" Create new combined array", end=" ") + start = timer() + # Get dtype from multiband keys + dtype_keys = np.dtype([dt for dt in dat_mb.dtype.descr if dt[0] in keys]) + + # Create structured array from multi-band columns + + # Create new empty array with rows as original data and new multi-band columns + new_empty = np.zeros(n_rows, dtype=dtype_keys) + end = timer() + print(f" {end - start:.1f}s") + + print(" Merge empty to original", end=" ") + start = timer() + # Combine with original data (slow) + combined = rfn.merge_arrays([dat, new_empty], flatten=True) + end = timer() + print(f" {end - start:.1f}s") + + print( " Copy mb data to combined array", end=" ") + start = timer() + # Copy multi-band values to combined array + for key in keys: + combined[indices][key] = dat_mb[key] + end = timer() + print(f" {end - start:.1f}s") + + hdu_list.close() + + if idx == 5: + break + + +# - + +dist_sqr + +obj.write_hdf5_file(combined) + +# Close input HDF5 catalogue file +obj.close_hd5() From a0bbce35e58c9eb928159ee5ccaffa76112a42b6 Mon Sep 17 00:00:00 2001 From: martinkilbinger Date: Tue, 6 May 2025 19:49:55 +0200 Subject: [PATCH 02/11] Script to add multi-band empty data --- notebooks/create_shear_mb_empty.py | 142 +++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 notebooks/create_shear_mb_empty.py diff --git a/notebooks/create_shear_mb_empty.py b/notebooks/create_shear_mb_empty.py new file mode 100644 index 0000000..aaf6517 --- /dev/null +++ b/notebooks/create_shear_mb_empty.py @@ -0,0 +1,142 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.15.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# # Demo notebook to add (u,g,i,z,z2) bands to an r-band catalogue + +# %reload_ext autoreload +# %autoreload 2 + +# + +import os +import numpy as np +import numpy.lib.recfunctions as rfn + +from timeit import default_timer as timer +import tqdm +import healsparse as hsp +from astropy.io import fits + +from sp_validation import run_joint_cat as sp_joint +# - + +# Create instance of object +obj = sp_joint.BaseCat() + + +# + +# Set parameters +base = "unions_shapepipe_comprehensive" +year = 2024 +ver = "v1.5.c.P37" + +obj._params = {} + +obj._params["input_path"] = f"{base}_{year}_{ver}.hdf5" +obj._params["verbose"] = True + +# + +path_bands = "../UNIONS5000" +subdir_base = "UNIONS." 
+ +path_base = subdir_base +path_suff = "_SP_ugriz_photoz_ext.cat" + +# NUMBER key in photo-z catalogue +key_num = "SeqNr" + +keys_mag = [f"MAG_GAAP_0p7_{band}" for band in ("u", "g", "r", "i", "z", "z2")] + +keys = ["Z_B", "Z_B_MIN", "Z_B_MAX", "T_B"] + keys_mag + +hdu_no = 1 +# - + +# ## Run + +# + +# Check parameter validity +#obj.check_params() + +# Update parameters (here: strings to list) +#obj.update_params() +# - + +# Read catalogue +dat = obj.read_cat(load_into_memory=False, mode="r") + +# + +# Get tile IDs +tile_IDs_raw = dat["TILE_ID"] +tile_IDs_raw_list = list(set(tile_IDs_raw)) + +# Transform (back) to 2x3 digits by zero-padding +tile_IDs = [f"{float(tile_ID):07.3f}" for tile_ID in tile_IDs_raw_list] +# - + +from shutil import copyfile + + +def get_dtype_keys(keys,path=None, hdu_no=1): + + if path is None: + + dtype = np.dtype([(key, np.float32) for key in keys]) + + else: + + print(" Read data from file:", path, end=" ") + start = timer() + hdu_list = fits.open(path) + dat_mb = hdu_list[hdu_no].data + dtype = np.dtype([dt for dt in dat_mb.dtype.descr if dt[0] in keys]) + end = timer() + print(f" {end - start:.1f}s") + + return dtype + + +# + +# Get dtype of new keys + +path = None +#path = os.path.join(path_bands, f"{path_base}{tile_ID}", f"{path_base}{tile_IDs[0]}{path_suff}") + +dtype_keys = get_dtype_keys(keys, path=path, hdu_no=hdu_no) + +# + +# Create empty array with new keys +# Initialise with -199 to later be able to check for unfilled values + +print(" Create new combined array", end=" ") +start = timer() +new_empty = np.full(n_rows, -199, dtype=dtype_keys) +end = timer() +print(f" {end - start:.1f}s") + +# + +# Merge with original data + +print(" Merge empty to original", end=" ") +start = timer() +combined = rfn.merge_arrays([dat, new_empty], flatten=True) +end = timer() +print(f" {end - start:.1f}s") +# - + +obj._params["output_path"] = f"{base}_ugriz_empty_{year}_{ver}.hdf5" +obj.write_hdf5_file(combined) + + +# Close input HDF5 catalogue file +obj.close_hd5() From 6148bb260a4753cf33127670f1be30be0c33a3f4 Mon Sep 17 00:00:00 2001 From: martinkilbinger Date: Tue, 6 May 2025 20:10:41 +0200 Subject: [PATCH 03/11] script to add mb info to empty columns --- notebooks/create_shear_mb_empty.py | 2 +- notebooks/demo_add_bands.py | 101 ++++++++++------ notebooks/demo_add_bands_to_empty.py | 165 +++++++++++++++++++++++++++ 3 files changed, 234 insertions(+), 34 deletions(-) create mode 100644 notebooks/demo_add_bands_to_empty.py diff --git a/notebooks/create_shear_mb_empty.py b/notebooks/create_shear_mb_empty.py index aaf6517..ea6d995 100644 --- a/notebooks/create_shear_mb_empty.py +++ b/notebooks/create_shear_mb_empty.py @@ -134,7 +134,7 @@ def get_dtype_keys(keys,path=None, hdu_no=1): print(f" {end - start:.1f}s") # - -obj._params["output_path"] = f"{base}_ugriz_empty_{year}_{ver}.hdf5" +obj._params["output_path"] = f"{base}_empty_ugriz_{year}_{ver}.hdf5" obj.write_hdf5_file(combined) diff --git a/notebooks/demo_add_bands.py b/notebooks/demo_add_bands.py index d20b04e..2036604 100644 --- a/notebooks/demo_add_bands.py +++ b/notebooks/demo_add_bands.py @@ -43,12 +43,10 @@ obj._params = {} obj._params["input_path"] = f"{base}_{year}_{ver}.hdf5" -obj._params["output_path"] = f"{base}_ugriz_{year}_{ver}.hdf5" obj._params["verbose"] = True # + -#path_bands = "../UNIONS5000" -path_bands = "." +path_bands = "../UNIONS5000" subdir_base = "UNIONS." 
path_base = subdir_base @@ -86,11 +84,63 @@ tile_IDs = [f"{float(tile_ID):07.3f}" for tile_ID in tile_IDs_raw_list] # - -import copy +from shutil import copyfile + + +def get_dtype_keys(keys,path=None, hdu_no=1): + + if path is None: + + dtype = np.dtype([(key, np.float32) for key in keys]) + + else: + + print(" Read data from file:", path, end=" ") + start = timer() + hdu_list = fits.open(path) + dat_mb = hdu_list[hdu_no].data + dtype = np.dtype([dt for dt in dat_mb.dtype.descr if dt[0] in keys]) + end = timer() + print(f" {end - start:.1f}s") + + return dtype + + +# + +# Get dtype of new keys + +path = None +#path = os.path.join(path_bands, f"{path_base}{tile_ID}", f"{path_base}{tile_IDs[0]}{path_suff}") + +dtype_keys = get_dtype_keys(keys, path=path, hdu_no=hdu_no) + +# + +# Create empty array with new keys +# Initialise with -199 to later be able to check for unfilled values + +print(" Create new combined array", end=" ") +start = timer() +new_empty = np.full(n_rows, -199, dtype=dtype_keys) +end = timer() +print(f" {end - start:.1f}s") + +# + +# Merge with original data + +print(" Merge empty to original", end=" ") +start = timer() +combined = rfn.merge_arrays([dat, new_empty], flatten=True) +end = timer() +print(f" {end - start:.1f}s") +# - + +obj._params["output_path"] = f"{base}_ugriz_empty_{year}_{ver}.hdf5" +obj.write_hdf5_file(combined) # + dist_sqr = {} do_dist_check = False +do_copy = False n_rows = len(dat) @@ -100,16 +150,21 @@ print(idx/len(tile_ID), tile_ID) src = os.path.join(path_bands, f"{path_base}{tile_ID}", f"{path_base}{tile_ID}{path_suff}") - dst = os.path.join(path_bands, f".", f"{path_base}{tile_ID}{path_suff}") + dst = os.path.join(f".", f"{path_base}{tile_ID}{path_suff}") - print(" Copy FITS file:", src, end=" ") - start = timer() - copy.copy(src, dst) - end = timer() - print(f" {end - start:.1f}s") - - path = dst - + if do_copy: + if not os.path.exists(src): + print(" Copy FITS file:", src, end=" ") + start = timer() + copyfile(src, dst) + end = timer() + print(f" {end - start:.1f}s") + else: + print(" FITS file already exists:", src) + path = dst + else: + path = src + print(" Read data from file:", path, end=" ") start = timer() hdu_list = fits.open(path) @@ -141,26 +196,6 @@ ) / len(dat_mb) end = timer() print(f" {end - start:.1f}s") - - if idx == 0: - print(" Create new combined array", end=" ") - start = timer() - # Get dtype from multiband keys - dtype_keys = np.dtype([dt for dt in dat_mb.dtype.descr if dt[0] in keys]) - - # Create structured array from multi-band columns - - # Create new empty array with rows as original data and new multi-band columns - new_empty = np.zeros(n_rows, dtype=dtype_keys) - end = timer() - print(f" {end - start:.1f}s") - - print(" Merge empty to original", end=" ") - start = timer() - # Combine with original data (slow) - combined = rfn.merge_arrays([dat, new_empty], flatten=True) - end = timer() - print(f" {end - start:.1f}s") print( " Copy mb data to combined array", end=" ") start = timer() diff --git a/notebooks/demo_add_bands_to_empty.py b/notebooks/demo_add_bands_to_empty.py new file mode 100644 index 0000000..66f9901 --- /dev/null +++ b/notebooks/demo_add_bands_to_empty.py @@ -0,0 +1,165 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.15.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# # Demo notebook to add multi-band (u,g,i,z,z2) and photo-z data to r-band + empty multi-band 
catalogue. +# +# (Fill empty columns.) + +# %reload_ext autoreload +# %autoreload 2 + +# + +import os +import numpy as np +import numpy.lib.recfunctions as rfn + +from timeit import default_timer as timer +import tqdm +import healsparse as hsp +from astropy.io import fits + +from sp_validation import run_joint_cat as sp_joint +# - + +# Create instance of object +obj = sp_joint.BaseCat() + + +# + +# Set parameters +base = "unions_shapepipe_comprehensive_empty" +year = 2024 +ver = "v1.5.c.P37" + +obj._params = {} + +obj._params["input_path"] = f"{base}_{year}_{ver}.hdf5" +obj._params["verbose"] = True + +# + +path_bands = "../UNIONS5000" +subdir_base = "UNIONS." + +path_base = subdir_base +path_suff = "_SP_ugriz_photoz_ext.cat" + +# NUMBER key in photo-z catalogue +key_num = "SeqNr" + +keys_mag = [f"MAG_GAAP_0p7_{band}" for band in ("u", "g", "r", "i", "z", "z2")] + +keys = ["Z_B", "Z_B_MIN", "Z_B_MAX", "T_B"] + keys_mag + +hdu_no = 1 +# - + +# ## Run + +# + +# Check parameter validity +#obj.check_params() + +# Update parameters (here: strings to list) +#obj.update_params() +# - + +# Read catalogue +dat = obj.read_cat(load_into_memory=False, mode="r") + +# + +# Get tile IDs +tile_IDs_raw = dat["TILE_ID"] +tile_IDs_raw_list = list(set(tile_IDs_raw)) + +# Transform (back) to 2x3 digits by zero-padding +tile_IDs = [f"{float(tile_ID):07.3f}" for tile_ID in tile_IDs_raw_list] +# - + +from shutil import copyfile + +# + +dist_sqr = {} +do_dist_check = False +do_copy = False + +# Loop over tile IDs +for idx, tile_ID in tqdm.tqdm(enumerate(tile_IDs), total=len(tile_IDs), disable=False): + + #print(idx/len(tile_ID), tile_ID) + + src = os.path.join(path_bands, f"{path_base}{tile_ID}", f"{path_base}{tile_ID}{path_suff}") + dst = os.path.join(f".", f"{path_base}{tile_ID}{path_suff}") + + if do_copy: + if not os.path.exists(src): + print(" Copy FITS file:", src, end=" ") + start = timer() + copyfile(src, dst) + end = timer() + print(f" {end - start:.1f}s") + else: + print(" FITS file already exists:", src) + path = dst + else: + path = src + + #print(" Read data from file:", path, end=" ") + #start = timer() + hdu_list = fits.open(path) + dat_mb = hdu_list[hdu_no].data + #end = timer() + #print(f" {end - start:.1f}s") + + #print(" Get numbers", end=" ") + #start = timer() + numbers = dat_mb[key_num] + #end = timer() + #print(f" {end - start:.1f}s") + + #print(" Identify matches", end= " ") + #start = timer() + # Select indices in dat with current tile ID + w = dat["TILE_ID"] == tile_IDs_raw_list[idx] + indices = np.where(w)[0] + #end = timer() + #print(f" {end - start:.1f}s") + + # Compute coordinate distances as matching check + if do_dist_check: + #print(" Compute distance check", end=" ") + #start = timer() + dist_sqr[TILE_ID] = sum( + (dat[indices]["RA"] - dat_mb["ALPHA_J2000"]) ** 2 + + (dat[indices]["Dec"] - dat_mb["DELTA_J2000"]) ** 2 + ) / len(dat_mb) + #end = timer() + #print(f" {end - start:.1f}s") + + #print( " Copy mb data to combined array", end=" ") + #start = timer() + # Copy multi-band values to combined array + for key in keys: + combined[indices][key] = dat_mb[key] + #end = timer() + #print(f" {end - start:.1f}s") + + hdu_list.close() +# - + +dist_sqr + +obj.write_hdf5_file(combined) + +# Close input HDF5 catalogue file +obj.close_hd5() From a48d46907289c538337c75cdf108bf00a274f01e Mon Sep 17 00:00:00 2001 From: martinkilbinger Date: Tue, 6 May 2025 18:13:20 +0000 Subject: [PATCH 04/11] cleaned up mb scripts --- notebooks/create_shear_mb_empty.py | 8 ++++---- notebooks/demo_add_bands.py 
| 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/notebooks/create_shear_mb_empty.py b/notebooks/create_shear_mb_empty.py index aaf6517..370f599 100644 --- a/notebooks/create_shear_mb_empty.py +++ b/notebooks/create_shear_mb_empty.py @@ -74,17 +74,17 @@ # Read catalogue dat = obj.read_cat(load_into_memory=False, mode="r") +n_rows = len(dat) # + # Get tile IDs -tile_IDs_raw = dat["TILE_ID"] -tile_IDs_raw_list = list(set(tile_IDs_raw)) +#tile_IDs_raw = dat["TILE_ID"] +#tile_IDs_raw_list = list(set(tile_IDs_raw)) # Transform (back) to 2x3 digits by zero-padding -tile_IDs = [f"{float(tile_ID):07.3f}" for tile_ID in tile_IDs_raw_list] +#tile_IDs = [f"{float(tile_ID):07.3f}" for tile_ID in tile_IDs_raw_list] # - -from shutil import copyfile def get_dtype_keys(keys,path=None, hdu_no=1): diff --git a/notebooks/demo_add_bands.py b/notebooks/demo_add_bands.py index d20b04e..88cc9d1 100644 --- a/notebooks/demo_add_bands.py +++ b/notebooks/demo_add_bands.py @@ -47,8 +47,7 @@ obj._params["verbose"] = True # + -#path_bands = "../UNIONS5000" -path_bands = "." +path_bands = "../UNIONS5000" subdir_base = "UNIONS." path_base = subdir_base @@ -86,11 +85,10 @@ tile_IDs = [f"{float(tile_ID):07.3f}" for tile_ID in tile_IDs_raw_list] # - -import copy - # + dist_sqr = {} do_dist_check = False +do_copy = False n_rows = len(dat) @@ -102,13 +100,15 @@ src = os.path.join(path_bands, f"{path_base}{tile_ID}", f"{path_base}{tile_ID}{path_suff}") dst = os.path.join(path_bands, f".", f"{path_base}{tile_ID}{path_suff}") - print(" Copy FITS file:", src, end=" ") - start = timer() - copy.copy(src, dst) - end = timer() - print(f" {end - start:.1f}s") - - path = dst + if do_copy: + print(" Copy FITS file:", src, end=" ") + start = timer() + copy.copy(src, dst) + end = timer() + print(f" {end - start:.1f}s") + path = dst + else: + path = src print(" Read data from file:", path, end=" ") start = timer() @@ -131,10 +131,10 @@ end = timer() print(f" {end - start:.1f}s") - print(" Compute distance check", end=" ") - start = timer() # Compute coordinate distances as matching check if do_dist_check: + print(" Compute distance check", end=" ") + start = timer() dist_sqr[TILE_ID] = sum( (dat[indices]["RA"] - dat_mb["ALPHA_J2000"]) ** 2 + (dat[indices]["Dec"] - dat_mb["DELTA_J2000"]) ** 2 From 6796af9eeb2d78566446d62223c3e1d3723f5412 Mon Sep 17 00:00:00 2001 From: martinkilbinger Date: Wed, 7 May 2025 06:54:32 +0000 Subject: [PATCH 05/11] fixed output path bug --- notebooks/demo_add_bands_to_empty.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/notebooks/demo_add_bands_to_empty.py b/notebooks/demo_add_bands_to_empty.py index 66f9901..b07e993 100644 --- a/notebooks/demo_add_bands_to_empty.py +++ b/notebooks/demo_add_bands_to_empty.py @@ -38,13 +38,14 @@ # + # Set parameters -base = "unions_shapepipe_comprehensive_empty" +base = "unions_shapepipe_comprehensive_empty_ugriz" year = 2024 ver = "v1.5.c.P37" obj._params = {} obj._params["input_path"] = f"{base}_{year}_{ver}.hdf5" +obj._params["output_path"] = obj._params["input_path"].replace("_empty", "") obj._params["verbose"] = True # + @@ -150,16 +151,15 @@ #start = timer() # Copy multi-band values to combined array for key in keys: - combined[indices][key] = dat_mb[key] + dat[indices][key] = dat_mb[key] #end = timer() #print(f" {end - start:.1f}s") hdu_list.close() # - -dist_sqr -obj.write_hdf5_file(combined) +obj.write_hdf5_file(dat) # Close input HDF5 catalogue file obj.close_hd5() From 
9c417b368bea468bf01e7703662fad7a69aa9b16 Mon Sep 17 00:00:00 2001 From: martinkilbinger Date: Wed, 7 May 2025 14:05:47 +0200 Subject: [PATCH 06/11] apply mask on v1.5 --- notebooks/demo_apply_hsp_masks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/notebooks/demo_apply_hsp_masks.py b/notebooks/demo_apply_hsp_masks.py index f1f1c69..a9190cf 100644 --- a/notebooks/demo_apply_hsp_masks.py +++ b/notebooks/demo_apply_hsp_masks.py @@ -52,9 +52,9 @@ # + # Set parameters -obj._params["input_path"] = "unions_shapepipe_comprehensive_2024_v1.4.c.hdf5" -obj._params["output_path"] = "unions_shapepipe_comprehensive_struc_2024_v1.4.c.hdf5" -obj._params["mask_dir"] = f"{os.environ['HOME']}/v1.4.x/masks" +obj._params["input_path"] = "unions_shapepipe_comprehensive_2024_v1.5.c.hdf5" +obj._params["output_path"] = "unions_shapepipe_comprehensive_struc_2024_v1.5.c.hdf5" +obj._params["mask_dir"] = f"{os.environ['HOME']}/v1.5.x/masks" obj._params["nside"] = 131072 obj._params["file_base"] = "mask_r_" obj._params["bits"] = bits From 0c6e0dc53544cbd001c7b373fab35afa0bf12fbe Mon Sep 17 00:00:00 2001 From: martinkilbinger Date: Fri, 9 May 2025 16:17:12 +0000 Subject: [PATCH 07/11] np -> hdf5 for multiband empty dat --- .gitignore | 6 +++ notebooks/create_shear_mb_empty.py | 78 +++++++++++++++++++++--------- 2 files changed, 60 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 259fbb5..ac15da6 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,9 @@ notebooks/calibrate_comprehensive_cat.ipynb notebooks/demo_apply_hsp_masks.py notebooks/demo_apply_hsp_masks.ipynb notebooks/plot_footprints.ipynb +notebooks/demo_calibrate_minimal_cat.ipynb +notebooks/demo_calibrate_minimal_cat.ipynb +notebooks/leakage_minimal.ipynb +notebooks/mask_fits2hsparse.ipynb +notebooks/mask_fits2hsparse_test.ipynb +notebooks/create_shear_mb_empty.ipynb diff --git a/notebooks/create_shear_mb_empty.py b/notebooks/create_shear_mb_empty.py index b742697..dea4ba5 100644 --- a/notebooks/create_shear_mb_empty.py +++ b/notebooks/create_shear_mb_empty.py @@ -7,9 +7,9 @@ # format_version: '1.5' # jupytext_version: 1.15.1 # kernelspec: -# display_name: Python 3 +# display_name: sp_validation # language: python -# name: python3 +# name: sp_validation # --- # # Demo notebook to add (u,g,i,z,z2) bands to an r-band catalogue @@ -21,6 +21,7 @@ import os import numpy as np import numpy.lib.recfunctions as rfn +import h5py from timeit import default_timer as timer import tqdm @@ -38,7 +39,7 @@ # Set parameters base = "unions_shapepipe_comprehensive" year = 2024 -ver = "v1.5.c.P37" +ver = "v1.5.c" obj._params = {} @@ -76,16 +77,6 @@ dat = obj.read_cat(load_into_memory=False, mode="r") n_rows = len(dat) -# + -# Get tile IDs -#tile_IDs_raw = dat["TILE_ID"] -#tile_IDs_raw_list = list(set(tile_IDs_raw)) - -# Transform (back) to 2x3 digits by zero-padding -#tile_IDs = [f"{float(tile_ID):07.3f}" for tile_ID in tile_IDs_raw_list] -# - - - def get_dtype_keys(keys,path=None, hdu_no=1): @@ -114,29 +105,68 @@ def get_dtype_keys(keys,path=None, hdu_no=1): dtype_keys = get_dtype_keys(keys, path=path, hdu_no=hdu_no) + +# - + +def strip_h5py_metadata_dtype(dat_dtype, dat_ext_dtype): + cleaned_fields = [] + for name, dt in dat_dtype.descr + dat_ext_dtype.descr: + # If dt is a tuple (e.g., ('S7', {'h5py_encoding': 'ascii'})) + if isinstance(dt, tuple): + cleaned_fields.append((name, dt[0])) # keep only the base dtype string + else: + cleaned_fields.append((name, dt)) # use as-is + return cleaned_fields + + # + # Create empty 
array with new keys # Initialise with -199 to later be able to check for unfilled values -print(" Create new combined array", end=" ") -start = timer() -new_empty = np.full(n_rows, -199, dtype=dtype_keys) +total_bytes = n_rows * np.dtype(dtype_keys).itemsize +print(" Create new combined array.", end=" ") +print(f"Expected size = {total_bytes / 1_048_576:.2f} MB", end=" ") +start = timer() + +obj._params["output_path"] = f"{base}_empty_ugriz_{year}_{ver}.hdf5" +dtype_sp = dat.dtype +dtype_comb = strip_h5py_metadata_dtype(dtype_sp, dtype_keys) +with h5py.File(obj._params["output_path"], "w") as f: + + # Create new dataset + dset_comb = f.create_dataset( + "dat_comb", + shape=(n_rows,), + dtype=dtype_comb, + ) + + # Copy old data field-by-field + for name in dtype_sp.names: + dset_comb[name] = dat[name] + + # Fill new fields with default value (-199) + for name, _ in dtype_keys: + dset_comb[name] = -199 + +#new_empty = np.full(n_rows, -199, dtype=dtype_keys) + end = timer() print(f" {end - start:.1f}s") + + # + # Merge with original data -print(" Merge empty to original", end=" ") -start = timer() -combined = rfn.merge_arrays([dat, new_empty], flatten=True) -end = timer() -print(f" {end - start:.1f}s") +#print(" Merge empty to original", end=" ") +#start = timer() +#combined = rfn.merge_arrays([dat, new_empty], flatten=True) +#end = timer() +#print(f" {end - start:.1f}s") # - -obj._params["output_path"] = f"{base}_empty_ugriz_{year}_{ver}.hdf5" -obj.write_hdf5_file(combined) +# obj.write_hdf5_file(combined) # Close input HDF5 catalogue file -obj.close_hd5() +# obj.close_hd5() From f00af5101e97cdee98ccdf33479df7314857648d Mon Sep 17 00:00:00 2001 From: martinkilbinger Date: Thu, 26 Jun 2025 14:13:37 +0200 Subject: [PATCH 08/11] updated path and file names for multi-band matching --- notebooks/create_shear_mb_empty.py | 4 ++-- notebooks/demo_add_bands_to_empty.py | 26 +++++--------------------- requirements.txt | 2 +- sp_validation/run_joint_cat.py | 9 ++++++--- 4 files changed, 14 insertions(+), 27 deletions(-) diff --git a/notebooks/create_shear_mb_empty.py b/notebooks/create_shear_mb_empty.py index dea4ba5..17ce54a 100644 --- a/notebooks/create_shear_mb_empty.py +++ b/notebooks/create_shear_mb_empty.py @@ -47,7 +47,7 @@ obj._params["verbose"] = True # + -path_bands = "../UNIONS5000" +path_bands = "./UNIONS5000" subdir_base = "UNIONS." path_base = subdir_base @@ -145,7 +145,7 @@ def strip_h5py_metadata_dtype(dat_dtype, dat_ext_dtype): dset_comb[name] = dat[name] # Fill new fields with default value (-199) - for name, _ in dtype_keys: + for name in dtype_keys.names: dset_comb[name] = -199 #new_empty = np.full(n_rows, -199, dtype=dtype_keys) diff --git a/notebooks/demo_add_bands_to_empty.py b/notebooks/demo_add_bands_to_empty.py index b07e993..d687df4 100644 --- a/notebooks/demo_add_bands_to_empty.py +++ b/notebooks/demo_add_bands_to_empty.py @@ -40,7 +40,7 @@ # Set parameters base = "unions_shapepipe_comprehensive_empty_ugriz" year = 2024 -ver = "v1.5.c.P37" +ver = "v1.5.c" obj._params = {} @@ -49,10 +49,9 @@ obj._params["verbose"] = True # + -path_bands = "../UNIONS5000" -subdir_base = "UNIONS." +path_bands = "./UNIONS5000" -path_base = subdir_base +path_base = "UNIONS." 
path_suff = "_SP_ugriz_photoz_ext.cat" # NUMBER key in photo-z catalogue @@ -76,7 +75,7 @@ # - # Read catalogue -dat = obj.read_cat(load_into_memory=False, mode="r") +dat = obj.read_cat(load_into_memory=False, mode="r", name="dat_comb") # + # Get tile IDs @@ -92,28 +91,13 @@ # + dist_sqr = {} do_dist_check = False -do_copy = False # Loop over tile IDs for idx, tile_ID in tqdm.tqdm(enumerate(tile_IDs), total=len(tile_IDs), disable=False): #print(idx/len(tile_ID), tile_ID) - src = os.path.join(path_bands, f"{path_base}{tile_ID}", f"{path_base}{tile_ID}{path_suff}") - dst = os.path.join(f".", f"{path_base}{tile_ID}{path_suff}") - - if do_copy: - if not os.path.exists(src): - print(" Copy FITS file:", src, end=" ") - start = timer() - copyfile(src, dst) - end = timer() - print(f" {end - start:.1f}s") - else: - print(" FITS file already exists:", src) - path = dst - else: - path = src + path = os.path.join(path_bands, f"{path_base}{tile_ID}{path_suff}") #print(" Read data from file:", path, end=" ") #start = timer() diff --git a/requirements.txt b/requirements.txt index 17d7dad..5d49b5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ importlib_metadata adjustText colorama emcee -cs_util==0.1.5 +cs_util==0.1.3 astropy>=5.0 healpy healsparse diff --git a/sp_validation/run_joint_cat.py b/sp_validation/run_joint_cat.py index e8b565f..47ca8a9 100644 --- a/sp_validation/run_joint_cat.py +++ b/sp_validation/run_joint_cat.py @@ -41,7 +41,8 @@ class BaseCat(object): """ def __init__(self): - self.params_default() + #self.params_default() + pass def set_params_from_command_line(self, args): """Set Params From Command Line. @@ -77,7 +78,7 @@ def read_config_set_params(self, fpath): return config - def read_cat(self, load_into_memory=False, mode="r", hdu=1): + def read_cat(self, load_into_memory=False, mode="r", hdu=1, name="data"): """Read Cat. Read input catalogue, either FITS or HDF5. 
@@ -91,6 +92,8 @@ def read_cat(self, load_into_memory=False, mode="r", hdu=1): HDF5 read mode, default is "r" hdu: int, optional HDU number (for FITS file); default is 1 + name: str, optional + dataset name, default is 'data' Returns ------- @@ -120,7 +123,7 @@ def read_cat(self, load_into_memory=False, mode="r", hdu=1): self._hd5file = h5py.File(fpath, mode) try: - dat = self._hd5file["data"] + dat = self._hd5file[name] except: print(f"Error while reading file {fpath}") raise From 0894fd510f0299507755d508784b9e481c0251af Mon Sep 17 00:00:00 2001 From: martinkilbinger Date: Tue, 1 Jul 2025 13:53:08 +0200 Subject: [PATCH 09/11] multi-band addition: report missing mb files --- .gitignore | 2 ++ notebooks/demo_add_bands_to_empty.py | 40 ++++++++++++---------------- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index ac15da6..2427dd0 100644 --- a/.gitignore +++ b/.gitignore @@ -170,3 +170,5 @@ notebooks/leakage_minimal.ipynb notebooks/mask_fits2hsparse.ipynb notebooks/mask_fits2hsparse_test.ipynb notebooks/create_shear_mb_empty.ipynb +notebooks/demo_add_bands.ipynb +notebooks/demo_add_bands_to_empty.ipynb diff --git a/notebooks/demo_add_bands_to_empty.py b/notebooks/demo_add_bands_to_empty.py index d687df4..aa4e0ab 100644 --- a/notebooks/demo_add_bands_to_empty.py +++ b/notebooks/demo_add_bands_to_empty.py @@ -86,7 +86,20 @@ tile_IDs = [f"{float(tile_ID):07.3f}" for tile_ID in tile_IDs_raw_list] # - -from shutil import copyfile +missing_IDs = [] +print("Check for missing mb catalogue files...") +for tile_ID in tqdm.tqdm(tile_IDs, total=len(tile_IDs)): + path = os.path.join(path_bands, f"{path_base}{tile_ID}{path_suff}") + if not os.path.exists(path): + missing_IDs.append(tile_ID) + +n_missing = len(missing_IDs) +print(f"{n_missing} missing multi-band catalogue files, saving to file") +if n_missing > 0: + with open("missing_IDs.txt", "w") as f: + for tile_ID in missing_IDs: + path = os.path.join(path_bands, f"{path_base}{tile_ID}{path_suff}") + print(f"{path_base}{tile_ID}{path_suff}", file=f) # + dist_sqr = {} @@ -95,49 +108,30 @@ # Loop over tile IDs for idx, tile_ID in tqdm.tqdm(enumerate(tile_IDs), total=len(tile_IDs), disable=False): - #print(idx/len(tile_ID), tile_ID) - path = os.path.join(path_bands, f"{path_base}{tile_ID}{path_suff}") + + if not os.path.exists(path): + continue - #print(" Read data from file:", path, end=" ") - #start = timer() hdu_list = fits.open(path) dat_mb = hdu_list[hdu_no].data - #end = timer() - #print(f" {end - start:.1f}s") - #print(" Get numbers", end=" ") - #start = timer() numbers = dat_mb[key_num] - #end = timer() - #print(f" {end - start:.1f}s") - #print(" Identify matches", end= " ") - #start = timer() # Select indices in dat with current tile ID w = dat["TILE_ID"] == tile_IDs_raw_list[idx] indices = np.where(w)[0] - #end = timer() - #print(f" {end - start:.1f}s") # Compute coordinate distances as matching check if do_dist_check: - #print(" Compute distance check", end=" ") - #start = timer() dist_sqr[TILE_ID] = sum( (dat[indices]["RA"] - dat_mb["ALPHA_J2000"]) ** 2 + (dat[indices]["Dec"] - dat_mb["DELTA_J2000"]) ** 2 ) / len(dat_mb) - #end = timer() - #print(f" {end - start:.1f}s") - #print( " Copy mb data to combined array", end=" ") - #start = timer() # Copy multi-band values to combined array for key in keys: dat[indices][key] = dat_mb[key] - #end = timer() - #print(f" {end - start:.1f}s") hdu_list.close() # - From b846ff99028c1749176efb79b2e7c0bae02d71d9 Mon Sep 17 00:00:00 2001 From: martinkilbinger Date: 
Wed, 9 Jul 2025 07:49:42 +0200 Subject: [PATCH 10/11] python -> 3.11; add mb bands improvements --- environment.yml | 2 +- notebooks/demo_add_bands_to_empty.py | 162 ++++++++++++++++++--------- 2 files changed, 110 insertions(+), 54 deletions(-) diff --git a/environment.yml b/environment.yml index f4c6ed3..7cf9f42 100644 --- a/environment.yml +++ b/environment.yml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: - lmfit - - python==3.9 + - python==3.11.11 - pip>=21 - pyccl==2.5.1 - treecorr diff --git a/notebooks/demo_add_bands_to_empty.py b/notebooks/demo_add_bands_to_empty.py index aa4e0ab..0a65ce7 100644 --- a/notebooks/demo_add_bands_to_empty.py +++ b/notebooks/demo_add_bands_to_empty.py @@ -23,6 +23,8 @@ import os import numpy as np import numpy.lib.recfunctions as rfn +from collections import defaultdict +from concurrent.futures import ProcessPoolExecutor, as_completed from timeit import default_timer as timer import tqdm @@ -30,6 +32,7 @@ from astropy.io import fits from sp_validation import run_joint_cat as sp_joint + # - # Create instance of object @@ -54,24 +57,32 @@ path_base = "UNIONS." path_suff = "_SP_ugriz_photoz_ext.cat" -# NUMBER key in photo-z catalogue -key_num = "SeqNr" keys_mag = [f"MAG_GAAP_0p7_{band}" for band in ("u", "g", "r", "i", "z", "z2")] -keys = ["Z_B", "Z_B_MIN", "Z_B_MAX", "T_B"] + keys_mag +keys = [ + "Z_B", + "Z_B_MIN", + "Z_B_MAX", + "T_B", +] + keys_mag hdu_no = 1 + +do_missing_check = False + +parallel = False +n_cpu = 48 # - # ## Run # + # Check parameter validity -#obj.check_params() +# obj.check_params() # Update parameters (here: strings to list) -#obj.update_params() +# obj.update_params() # - # Read catalogue @@ -86,57 +97,102 @@ tile_IDs = [f"{float(tile_ID):07.3f}" for tile_ID in tile_IDs_raw_list] # - -missing_IDs = [] -print("Check for missing mb catalogue files...") -for tile_ID in tqdm.tqdm(tile_IDs, total=len(tile_IDs)): - path = os.path.join(path_bands, f"{path_base}{tile_ID}{path_suff}") - if not os.path.exists(path): - missing_IDs.append(tile_ID) - -n_missing = len(missing_IDs) -print(f"{n_missing} missing multi-band catalogue files, saving to file") -if n_missing > 0: - with open("missing_IDs.txt", "w") as f: - for tile_ID in missing_IDs: - path = os.path.join(path_bands, f"{path_base}{tile_ID}{path_suff}") - print(f"{path_base}{tile_ID}{path_suff}", file=f) +if do_missing_check: + missing_IDs = [] + print("Check for missing mb catalogue files...") + for tile_ID in tqdm.tqdm(tile_IDs, total=len(tile_IDs)): + path = os.path.join(path_bands, f"{path_base}{tile_ID}{path_suff}") + if not os.path.exists(path): + missing_IDs.append(tile_ID) -# + -dist_sqr = {} -do_dist_check = False - -# Loop over tile IDs -for idx, tile_ID in tqdm.tqdm(enumerate(tile_IDs), total=len(tile_IDs), disable=False): - - path = os.path.join(path_bands, f"{path_base}{tile_ID}{path_suff}") - - if not os.path.exists(path): - continue - - hdu_list = fits.open(path) - dat_mb = hdu_list[hdu_no].data - - numbers = dat_mb[key_num] - - # Select indices in dat with current tile ID - w = dat["TILE_ID"] == tile_IDs_raw_list[idx] - indices = np.where(w)[0] - - # Compute coordinate distances as matching check - if do_dist_check: - dist_sqr[TILE_ID] = sum( - (dat[indices]["RA"] - dat_mb["ALPHA_J2000"]) ** 2 - + (dat[indices]["Dec"] - dat_mb["DELTA_J2000"]) ** 2 - ) / len(dat_mb) - - # Copy multi-band values to combined array - for key in keys: - dat[indices][key] = dat_mb[key] - - hdu_list.close() -# - + n_missing = len(missing_IDs) + print(f"{n_missing} missing multi-band 
catalogue files, saving to file") + if n_missing > 0: + with open("missing_IDs.txt", "w") as f: + for tile_ID in missing_IDs: + path = os.path.join( + path_bands, f"{path_base}{tile_ID}{path_suff}" + ) + print(f"{path_base}{tile_ID}{path_suff}", file=f) + + +# Build TILE_ID -> list of indices mapping once +tile_to_indices = defaultdict(list) +for i, tile_id in enumerate(dat["TILE_ID"]): + tile_to_indices[tile_id].append(i) + +# Optionally convert lists to arrays +for k in tile_to_indices: + tile_to_indices[k] = np.array(tile_to_indices[k]) +def fill_one_tile(path, keys): + + try: + with fits.open(path, memmap=True) as hdu_list: + dat_mb = hdu_list[hdu_no].data + result = {key: dat_mb[key] for key in keys} + + return result + + except Exception as e: + print(f"Error processing {path}: {e}") + return None + + +if parallel: + + # Parallel processing + with ProcessPoolExecutor(max_workers=n_cpu) as executor: + futures = {} + for idx, tile_ID in enumerate(tile_IDs): + + path = os.path.join(path_bands, f"{path_base}{tile_ID}{path_suff}") + if not os.path.exists(path): + continue + + tile_key = tile_IDs_raw_list[idx] + indices = tile_to_indices.get(tile_key, None) + if indices is None: + continue + + futures[ + executor.submit( + fill_one_tile, + path, + keys, + ) + ] = (tile_key, indices) + + # As results complete, update dat + for future in tqdm.tqdm(as_completed(futures), total=len(futures)): + tile_key, indices = futures[future] + result = future.result() + if result is None: + continue + + for key in keys: + dat[indices][key] = result[key] + +else: + + # Loop over tile IDs + for idx, tile_ID in tqdm.tqdm( + enumerate(tile_IDs), total=len(tile_IDs), disable=False + ): + + path = os.path.join(path_bands, f"{path_base}{tile_ID}{path_suff}") + if not os.path.exists(path): + continue + + indices = tile_to_indices.get(tile_IDs_raw_list[idx], None) + if indices is None: + continue + + result = fill_one_tile(path, keys) + for key in keys: + dat[indices][key] = result[key] + obj.write_hdf5_file(dat) # Close input HDF5 catalogue file From 29bd8b76661dd2023c87037a0096e1ec9bb489c3 Mon Sep 17 00:00:00 2001 From: martinkilbinger Date: Tue, 9 Sep 2025 14:29:27 +0000 Subject: [PATCH 11/11] science_portal_changes --- notebooks/create_shear_mb_empty.py | 2 +- notebooks/demo_add_bands_to_empty.py | 28 +++++++++++++++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/notebooks/create_shear_mb_empty.py b/notebooks/create_shear_mb_empty.py index dea4ba5..2fdbe3c 100644 --- a/notebooks/create_shear_mb_empty.py +++ b/notebooks/create_shear_mb_empty.py @@ -37,7 +37,7 @@ # + # Set parameters -base = "unions_shapepipe_comprehensive" +base = "unions_shapepipe_comprehensive_struc" year = 2024 ver = "v1.5.c" diff --git a/notebooks/demo_add_bands_to_empty.py b/notebooks/demo_add_bands_to_empty.py index b07e993..28a081f 100644 --- a/notebooks/demo_add_bands_to_empty.py +++ b/notebooks/demo_add_bands_to_empty.py @@ -7,7 +7,7 @@ # format_version: '1.5' # jupytext_version: 1.15.1 # kernelspec: -# display_name: Python 3 +# display_name: Python 3 (ipykernel) # language: python # name: python3 # --- @@ -38,9 +38,9 @@ # + # Set parameters -base = "unions_shapepipe_comprehensive_empty_ugriz" +base = "unions_shapepipe_comprehensive_struc_empty_ugriz" year = 2024 -ver = "v1.5.c.P37" +ver = "v1.5.c" obj._params = {} @@ -49,7 +49,7 @@ obj._params["verbose"] = True # + -path_bands = "../UNIONS5000" +path_bands = "./UNIONS5000" subdir_base = "UNIONS." 
path_base = subdir_base @@ -58,9 +58,23 @@ # NUMBER key in photo-z catalogue key_num = "SeqNr" -keys_mag = [f"MAG_GAAP_0p7_{band}" for band in ("u", "g", "r", "i", "z", "z2")] - -keys = ["Z_B", "Z_B_MIN", "Z_B_MAX", "T_B"] + keys_mag +bands = ("u", "g", "r", "i", "z", "z2") +base_keys = ["MAGERR_GAAP", "FLUX_GAAP", "FLUXERR_GAAP", "FLAG_GAAP", "MAG_LIM"] +keys_mag = [f"MAG_GAAP_0p7_{band}" for band in bands] +for base_key in base_keys: + keys_mag.extend([f"_{base_key}_{band}" for band in bands]) + +keys = [ + "EXTINCTION", + "MP_NAME", + "Z_B", + "Z_B_MIN", + "Z_B_MAX", + "T_B", + "ODDS", +] + keys_mag + +print(f"Using {len(keys)} columns from PhotoPipe") hdu_no = 1 # -