diff --git a/.dockerignore b/.dockerignore
index 7797741..96a401f 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,3 +1,5 @@
+**/tmp*
+test_data
.ruff_cache
.tox
*.egg-info
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3907571
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+**/benchmark_results
diff --git a/superpixel_classification/SuperpixelClassification/SuperpixelClassification.xml b/superpixel_classification/SuperpixelClassification/SuperpixelClassification.xml
index 38c7b77..4767aa9 100644
--- a/superpixel_classification/SuperpixelClassification/SuperpixelClassification.xml
+++ b/superpixel_classification/SuperpixelClassification/SuperpixelClassification.xml
@@ -61,7 +61,7 @@
Superpixel parameters
gensuperpixels
- generate-superpxiels
+ generate-superpixels
If an image does not have an annotation with superpixels, generate one
true
@@ -100,6 +100,13 @@
true
+
+ useCuda
+ usecuda
+ Whether to use the GPU/CUDA (true) or the CPU (false).
+
+ false
+
batchSize
batchsize
@@ -198,5 +205,12 @@
4
The number of worker threads for superpixel and feature generation
+
+ cutoff
+ cutoff
+
+ 500
+ The number of unannotated superpixels to use per slide for feature extraction, training, and prediction
+
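The new cutoff parameter bounds how many unannotated superpixels per slide are carried through feature extraction, training, and prediction. A minimal sketch of the intended subsampling, written as a hypothetical standalone helper (the real logic lives in createFeaturesForItem below):

    import random

    def select_indices(values, cutoff):
        # Keep every labeled superpixel; keep at most `cutoff` randomly chosen unlabeled ones.
        labeled = [i for i, v in enumerate(values) if v > 0]
        unlabeled = [i for i, v in enumerate(values) if v == 0]
        if len(unlabeled) > cutoff:
            random.shuffle(unlabeled)
            unlabeled = unlabeled[:cutoff]
        return sorted(labeled + unlabeled)

    # select_indices([3, 0, 0, 1, 0], cutoff=1) keeps indices 0 and 3 plus one random unlabeled index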
diff --git a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationBase.py b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationBase.py
index cd82ded..a9d1353 100644
--- a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationBase.py
+++ b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationBase.py
@@ -204,7 +204,7 @@ def progCallback(step, count, total):
print('Create superpixels for %s' % item['name'])
imagePath = os.path.join(tempdir, item['name'])
gc.downloadFile(item['largeImage']['fileId'], imagePath)
- outImagePath = os.path.join(tempdir, 'superpixel.tiff')
+ outImagePath = os.path.join(tempdir, '%s.pixelmap.tiff' % item['name'])
outAnnotationPath = os.path.join(tempdir, '%s.anot' % annotationName)
if True:
@@ -332,7 +332,7 @@ def createFeatureListFromPatchAndMaskList(self, patch_list, mask_list, maskvals_
)
return feature_list
- def createFeaturesForItem(self, gc, item, elem, featureFolderId, fileName, patchSize, prog):
+ def createFeaturesForItem(self, gc, item, elem, featureFolderId, fileName, patchSize, prog, cutoff):
import large_image
print('Create feature', fileName)
@@ -349,17 +349,35 @@ def createFeaturesForItem(self, gc, item, elem, featureFolderId, fileName, patch
gc.downloadFile(maskItem['largeImage']['fileId'], maskPath)
tsMask = large_image.open(maskPath)
+ num_values = len(elem['values'])
+ labeled_samples = {i for i, x in enumerate(elem['values']) if x > 0}
+ # An unlabeled 1x1 bounding box in the top-left corner marks the background entry;
+ # skip it so we do not extract features for it.
+ has_background = elem['user']['bbox'][:4] == [0, 0, 1, 1]
+ start_index = 1 if has_background else 0
+ unlabeled_samples = [i for i, x in enumerate(elem['values'][start_index:], start=start_index) if x == 0]
+
+ if num_values - len(labeled_samples) > cutoff:
+ # Keep only a random subset of the unlabeled samples, i.e. prune the feature list.
+ random.shuffle(unlabeled_samples)
+ unlabeled_samples = unlabeled_samples[:cutoff]
+ indices = sorted(list(labeled_samples) + unlabeled_samples)
+
with h5py.File(filePath, 'w') as fptr:
batch_size = 1024 # TODO: Is this the best value?
- for batch_start in range(0, len(elem['values']), batch_size):
- batch_list = elem['values'][batch_start: batch_start + batch_size]
+ total_size = len(indices)
+ for batch_start in range(0, total_size, batch_size):
+ batch_list = indices[batch_start: batch_start + batch_size]
patch_list = []
mask_list = []
maskvals_list = []
- for idx, _ in enumerate(batch_list, start=batch_start):
- prog.item_progress(item, 0.9 * idx / len(elem['values']))
- bbox = elem['user']['bbox'][idx * 4: idx * 4 + 4]
+
+ for idx, i in enumerate(batch_list, start=batch_start):
+ prog.item_progress(item, 0.9 * idx / total_size)
+ bbox = elem['user']['bbox'][i * 4: i * 4 + 4]
# use masked superpixel
+ if len(bbox) < 4:
+ # Fail fast with a clear message instead of an IndexError further down.
+ raise ValueError('Incomplete bounding box for superpixel %d' % i)
patch = ts.getRegion(
region=dict(
left=int(bbox[0]), top=int(bbox[1]),
@@ -384,7 +402,7 @@ def createFeaturesForItem(self, gc, item, elem, featureFolderId, fileName, patch
if mask.shape[2] == 4:
mask = mask[:, :, :-1]
maskvals = [[val % 256, val // 256 % 256, val // 65536 % 256]
- for val in [idx * 2, idx * 2 + 1]]
+ for val in [(i + 1) * 2, (i + 1) * 2 + 1]]
patch_list.append(patch)
mask_list.append(mask)
maskvals_list.append(maskvals)
@@ -409,6 +427,8 @@ def createFeaturesForItem(self, gc, item, elem, featureFolderId, fileName, patch
(time.time() - starttime)),
item['name'])
del batch_list, patch_list, mask_list, maskvals_list, feature_list
+ fptr.create_dataset('used_indices', data=np.array(indices), dtype='i')
print(ds.shape, len(elem['values']), '%5.3f' % (time.time() - starttime),
item['name'])
prog.item_progress(item, 0.9)
@@ -418,30 +438,38 @@ def createFeaturesForItem(self, gc, item, elem, featureFolderId, fileName, patch
prog.item_progress(item, 1)
return file
- def createFeatures(self, gc, folderId, annotationName, featureFolderId, patchSize, numWorkers,
- prog):
- itemsAndAnnot = self.getItemsAndAnnotations(gc, folderId, annotationName)
+ def createFeatures(self, gc, folderId, annotationName, itemsAndAnnot, featureFolderId, patchSize, numWorkers,
+ prog, cutoff):
prog.message('Creating features')
prog.progress(0)
prog.items([item for item, _, _ in itemsAndAnnot])
results = {}
futures = []
+ featureFiles = [
+ f for item in gc.listItem(featureFolderId) for f in gc.listFile(item['_id'])
+ ]
with concurrent.futures.ThreadPoolExecutor(max_workers=numWorkers) as executor:
for item, _, elem in itemsAndAnnot:
- bbox = elem['user']['bbox']
- hashval = repr(dict(
- itemId=item['_id'], bbox=[int(v) for v in bbox], patchSize=patchSize))
- hashval = hashlib.new('sha256', hashval.encode()).hexdigest()
- fileName = 'feature-%s.h5' % (hashval)
- found = False
- for existing in gc.listItem(featureFolderId, name=fileName):
- results[item['_id']] = next(gc.listFile(existing['_id'], limit=1))
- found = True
- break
- if not found:
- futures.append((item, executor.submit(
- self.createFeaturesForItem, gc, item, elem, featureFolderId, fileName,
- patchSize, prog)))
+ match = [
+ f for f in featureFiles if
+ re.match(r'^%s.*[.]feature[.]h5$' % re.escape(item['name']), f['name'])
+ ]
+ if len(match):
+ results[item['_id']] = match[0]
+ else: # fallback to hash-based naming - generate features if necessary
+ bbox = elem['user']['bbox']
+ hashval = repr(dict(
+ itemId=item['_id'], bbox=[int(v) for v in bbox], patchSize=patchSize))
+ hashval = hashlib.new('sha256', hashval.encode()).hexdigest()
+ fileName = 'feature-%s.h5' % (hashval)
+ match = [f for f in featureFiles if f['name'] == fileName]
+ if len(match):
+ results[item['_id']] = match[0]
+ else:
+ futures.append((item, executor.submit(
+ self.createFeaturesForItem, gc, item, elem, featureFolderId,
+ '%s.feature.h5' % (item['name']), patchSize, prog, cutoff)))
for item, future in futures:
file = future.result()
try:
@@ -461,12 +489,20 @@ def trainModelAddItem(self, gc, record, item, annotrec, elem, feature,
item['name'], annotrec['annotation']['name'], annotrec['_id'], annotrec['_version']))
featurePath = os.path.join(record['tempdir'], feature['name'])
gc.downloadFile(feature['_id'], featurePath)
+ print(f"Downloaded '{feature['_id']}' to '{featurePath}'")
with h5py.File(featurePath, 'r') as ffptr:
fds = ffptr['images']
- for idx, labelnum in enumerate(elem['values']):
- if labelnum and labelnum < len(elem['categories']):
+ if 'used_indices' in ffptr:
+ indices = ffptr['used_indices']
+ else:
+ indices = range(len(elem['values']))
+ skipped_excluded = 0
+ for i, idx in enumerate(indices):
+ labelnum = elem['values'][idx]
+ if 0 < labelnum < len(elem['categories']):
labelname = elem['categories'][labelnum]['label']
if labelname in excludeLabelList:
+ skipped_excluded += 1
continue
if labelname not in record['groups']:
record['groups'][labelname] = elem['categories'][labelnum]
@@ -475,7 +511,7 @@ def trainModelAddItem(self, gc, record, item, annotrec, elem, feature,
labelname = labelList[labelnum - 1]
else:
continue
- patch = fds[idx]
+ patch = fds[i]
if not record['ds']:
record['ds'] = record['fptr'].create_dataset(
'images', (1,) + patch.shape, maxshape=(None,) + patch.shape,
@@ -494,11 +530,11 @@ def trainModelAddItem(self, gc, record, item, annotrec, elem, feature,
record['lastlog'] = time.time()
print(record['ds'].shape, record['counts'],
'%5.3f' % (time.time() - record['starttime']))
+ print(f"Skipped {skipped_excluded} samples with labels that were excluded")
- def trainModel(self, gc, folderId, annotationName, features, modelFolderId,
+ def trainModel(self, gc, annotationName, itemsAndAnnot, features, modelFolderId,
batchSize, epochs, trainingSplit, randomInput, labelList,
- excludeLabelList, prog):
- itemsAndAnnot = self.getItemsAndAnnotations(gc, folderId, annotationName)
+ excludeLabelList, use_cuda, prog):
with tempfile.TemporaryDirectory(dir=os.getcwd()) as tempdir:
trainingPath = os.path.join(tempdir, 'training.h5')
with h5py.File(trainingPath, 'w') as fptr:
@@ -526,7 +562,7 @@ def trainModel(self, gc, folderId, annotationName, features, modelFolderId,
prog.progress(1)
if not record['ds']:
print('No labeled data')
- return
+ return None, None
record['labelds'] = fptr.create_dataset(
'labels', (len(record['labelvals']),), dtype=int)
record['labelds'] = np.array(record['labelvals'], dtype=int)
@@ -536,7 +572,7 @@ def trainModel(self, gc, folderId, annotationName, features, modelFolderId,
prog.progress(0)
history, modelPath = self.trainModelDetails(
record, annotationName, batchSize, epochs, itemsAndAnnot, prog, tempdir,
- trainingSplit)
+ trainingSplit, use_cuda)
modTrainingPath = os.path.join(tempdir, '%s ModTraining Epoch %d.h5' % (
annotationName, self.getCurrentEpoch(itemsAndAnnot)))
@@ -551,16 +587,16 @@ def trainModel(self, gc, folderId, annotationName, features, modelFolderId,
for attempt in tenacity.Retrying(stop=tenacity.stop_after_attempt(self.uploadRetries)):
with attempt:
modelFile = gc.uploadFileToFolder(modelFolderId, modelPath)
- print('Saved model')
+ print(f'Saved model to {modelFolderId}')
for attempt in tenacity.Retrying(stop=tenacity.stop_after_attempt(self.uploadRetries)):
with attempt:
modTrainingFile = gc.uploadFileToFolder(modelFolderId, modTrainingPath)
- print('Saved modTraining')
+ print(f'Saved modTraining to {modelFolderId}')
return modelFile, modTrainingFile
- def predictLabelsForItem(self, gc, annotationName, annotationFolderId, tempdir, model, item,
+ def predictLabelsForItem(self, gc, annotationName, tempdir, model, item,
annotrec, elem, feature, curEpoch, userId, labels, groups,
- makeHeatmaps, radius, magnification, certainty, batchSize, prog):
+ makeHeatmaps, radius, magnification, certainty, batchSize, use_cuda, prog):
import al_bench.factory
print('Predicting %s' % (item['name']))
@@ -571,6 +607,8 @@ def predictLabelsForItem(self, gc, annotationName, annotationFolderId, tempdir,
# Figure out which samples are already labeled
labeled_samples: NDArray[np.int_] = np.nonzero(np.array(elem['values']))
+ number_annotations = len(elem['values'])
+ tiny = np.finfo(np.float32).tiny
print(f'{labeled_samples = }')
print(f'certainty_type = {certainty!r}')
@@ -581,9 +619,17 @@ def predictLabelsForItem(self, gc, annotationName, annotationFolderId, tempdir,
# In case we are computing batchbald
compCertainty.set_batchbald_num_samples(16)
compCertainty.set_batchbald_batch_size(100)
- compCertainty.set_batchbald_excluded_samples(labeled_samples)
with h5py.File(featurePath, 'r') as ffptr:
+ if 'used_indices' in ffptr:
+ used_indices = set(list(ffptr['used_indices']))
+ else:
+ used_indices = set(range(number_annotations))
+ all_indices = set(range(number_annotations))
+ unused_indices = list(sorted(all_indices.difference(used_indices)))
+ compCertainty.set_batchbald_excluded_samples(np.array(unused_indices))
+
prog.item_progress(item, 0)
# Create predicted annotation
annot = copy.deepcopy(annotrec)
@@ -592,21 +638,29 @@ def predictLabelsForItem(self, gc, annotationName, annotationFolderId, tempdir,
annot['elements'][0]['categories'] = [groups[key] for key in labels]
ds = ffptr['images']
prog.item_progress(item, 0.05)
- catWeights, predictions = self.predictLabelsForItemDetails(
- batchSize, ds, item, model, prog)
- catWeights = np.array(catWeights)
- predictions = np.array(predictions)
+ _catWeights, _predictions, indices = self.predictLabelsForItemDetails(
+ batchSize, ds, np.array(list(used_indices), dtype=np.int64), item, model, use_cuda, prog)
+ # Expand catWeights and predictions from the `cutoff`-sized subset back to the full
+ # length of elem['values'], then copy the predicted rows in at their original indices.
+ catWeights = np.zeros((number_annotations,) + _catWeights.shape[1:], dtype=_catWeights.dtype)
+ predictions = np.zeros((number_annotations,) + _predictions.shape[1:], dtype=_predictions.dtype)
+ for cw, p, idx in zip(_catWeights, _predictions, indices):
+ catWeights[idx] = cw
+ predictions[idx] = p
+
print_fully('predictions', predictions)
prog.item_progress(item, 0.7)
# compCertainty needs catWeights to have shape (num_superpixels,
# bayesian_samples, num_classes) if 'batchbald' is selected, otherwise the
# shape should be (num_superpixels, num_classes).
- print_fully('catWeights', catWeights)
# Ask compCertainty to compute certainties
- cert = compCertainty.from_numpy_array(catWeights)
+ cert = compCertainty.from_numpy_array(catWeights + tiny)
+ print_fully('catWeights', catWeights)
+
# After the call to compCertainty, those numbers that end up as values for
# annot's keys 'values', 'confidence', 'categoryConfidence', and 'certainty'
# should have shape (num_superpixels, num_classes).
+
print_fully('cert', cert)
scores = cert[certainty]['scores']
print_fully('scores', scores)
@@ -617,14 +671,28 @@ def predictLabelsForItem(self, gc, annotationName, annotationFolderId, tempdir,
epsilon = 1e-50
predictions = np.log(catWeights + epsilon)
cats = np.argmax(catWeights, axis=-1)
- indices = np.arange(cats.shape[0])
- conf = catWeights[indices, cats[indices]]
+ # Rows that were not predicted (unused indices) keep category 0 here.
+ conf = catWeights[np.arange(cats.shape[0]), cats]
print_fully('cats', cats)
print_fully('conf', conf)
+ # Give unused indices the highest possible score so that they are recommended last
+ # in the active learning UI.
+ scores[unused_indices] = np.finfo(scores.dtype).max
+ # Likewise push already-labeled superpixels to the end of the recommendations so the
+ # DSA UI does not show them to the user again.
+ scores[labeled_samples] = np.finfo(scores.dtype).max
+
cats = cats.tolist()
conf = conf.tolist()
- # Should this be from predictions for from catWeights?!!!
+
+ # Should this be from predictions or from catWeights?!!!
+ predictions[np.isneginf(predictions)] = np.finfo(predictions.dtype).min
catConf = predictions.tolist()
scores = scores.tolist()
annot['elements'][0]['values'] = cats
@@ -761,10 +829,10 @@ def makeHeatmapsForItem(self, gc, annotationName, userId, tempdir, radius, item,
'fileId': item['largeImage']['fileId'],
'userId': userId}))
- def predictLabels(self, gc, folderId, annotationName, features, modelFolderId,
+ def predictLabels(self, gc, folderId, annotationName, itemsAndAnnot, features, modelFolderId,
annotationFolderId, saliencyMaps, radius, magnification,
- certainty, batchSize, prog):
- itemsAndAnnot = self.getItemsAndAnnotations(gc, folderId, annotationName)
+ certainty, batchSize, use_cuda, prog):
curEpoch = self.getCurrentEpoch(itemsAndAnnot)
folder = gc.getFolder(folderId)
userId = folder['creatorId']
@@ -779,7 +847,7 @@ def predictLabels(self, gc, folderId, annotationName, features, modelFolderId,
modelFile = next(gc.listFile(item['_id'], limit=1))
break
if not modelFile:
- print('No model file found')
+ print(f'No model file found in {modelFolderId}')
return
print(modelFile['name'], item)
modelPath = os.path.join(tempdir, modelFile['name'])
@@ -792,7 +860,7 @@ def predictLabels(self, gc, folderId, annotationName, features, modelFolderId,
modTrainingFile = next(gc.listFile(item['_id'], limit=1))
break
if not modTrainingFile:
- print('No modTraining file found')
+ print(f'No modTraining file found in {modelFolderId}')
return
print(modTrainingFile['name'], item)
modTrainingPath = os.path.join(tempdir, modTrainingFile['name'])
@@ -823,20 +891,26 @@ def predictLabels(self, gc, folderId, annotationName, features, modelFolderId,
if item['_id'] not in features:
continue
self.predictLabelsForItem(
- gc, annotationName, annotationFolderId, tempdir, model, item, annotrec, elem,
+ gc, annotationName, tempdir, model, item, annotrec, elem,
features.get(item['_id']), curEpoch, userId, labels, groups, saliencyMaps,
- radius, magnification, certainty, batchSize, prog)
+ radius, magnification, certainty, batchSize, use_cuda, prog)
prog.progress(1)
- def main(self, args):
+ def main(self, args, gc=None):
self.feature_is_image = args.feature != 'vector'
self.certainty = args.certainty
print('\n>> CLI Parameters ...\n')
pprint.pprint(vars(args))
- gc = girder_client.GirderClient(apiUrl=args.girderApiUrl)
- gc.token = args.girderToken
+ if gc is None:
+ gc = girder_client.GirderClient(apiUrl=args.girderApiUrl)
+ if args.girderToken:
+ gc.token = args.girderToken
+ else:
+ # Fall back to default development credentials when no token is supplied.
+ gc.authenticate('admin', 'password')
+
+ # Check that we can reach the server and see the expected collection.
+ if not any(c['name'] == 'Active Learning' for c in gc.listCollection()):
+ raise Exception("Unable to find the 'Active Learning' collection; check the Girder URL and credentials")
with ProgressHelper(
'Superpixel Classification', 'Superpixel classification', args.progress) as prog:
@@ -845,16 +919,24 @@ def main(self, args):
gc, args.images, args.annotationName, args.radius, args.magnification,
args.annotationDir, args.numWorkers, prog)
+ itemsAndAnnot = self.getItemsAndAnnotations(gc, args.images, args.annotationName)
+ print("Creating features...")
features = self.createFeatures(
- gc, args.images, args.annotationName, args.features, args.patchSize,
- args.numWorkers, prog)
+ gc, args.images, args.annotationName, itemsAndAnnot, args.features, args.patchSize,
+ args.numWorkers, prog, args.cutoff)
+ print("Done creating features...")
if args.train:
+ print("Training...")
self.trainModel(
- gc, args.images, args.annotationName, features, args.modeldir, args.batchSize,
- args.epochs, args.split, args.randominput, args.labels, args.exclude, prog)
+ gc, args.annotationName, itemsAndAnnot, features, args.modeldir, args.batchSize,
+ args.epochs, args.split, args.randominput, args.labels, args.exclude, args.useCuda, prog)
+ print("Done training...")
+ print("Predicting labels...")
self.predictLabels(
- gc, args.images, args.annotationName, features, args.modeldir, args.annotationDir,
- args.heatmaps, args.radius, args.magnification, args.certainty, args.batchSize,
+ gc, args.images, args.annotationName, itemsAndAnnot, features, args.modeldir, args.annotationDir,
+ args.heatmaps, args.radius, args.magnification, args.certainty, args.batchSize, args.useCuda,
prog)
+ print("Done predicting labels...")
+ print("Done, exiting")
diff --git a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTensorflow.py b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTensorflow.py
index 0af02d8..e50cd8a 100644
--- a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTensorflow.py
+++ b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTensorflow.py
@@ -3,6 +3,7 @@
from typing import Optional
import h5py
+import numpy as np
import tensorflow as tf
from SuperpixelClassificationBase import SuperpixelClassificationBase
@@ -35,33 +36,56 @@ class SuperpixelClassificationTensorflow(SuperpixelClassificationBase):
def __init__(self):
self.training_optimal_batchsize: Optional[int] = None
self.prediction_optimal_batchsize: Optional[int] = None
+ self.use_cuda = False
def trainModelDetails(self, record, annotationName, batchSize, epochs, itemsAndAnnot, prog,
- tempdir, trainingSplit):
- # print(f'Tensorflow trainModelDetails(batchSize={batchSize}, ...)')
- # make model
- num_classes = len(record['labels'])
- model = tf.keras.Sequential([
- tf.keras.layers.Rescaling(1.0 / 255),
- tf.keras.layers.Conv2D(16, 3, padding='same', activation='relu'),
- tf.keras.layers.MaxPooling2D(),
- tf.keras.layers.Conv2D(32, 3, padding='same', activation='relu'),
- tf.keras.layers.MaxPooling2D(),
- tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu'),
- tf.keras.layers.MaxPooling2D(),
- tf.keras.layers.Flatten(),
- # tf.keras.layers.Dropout(0.2),
- tf.keras.layers.Dense(128, activation='relu'),
- tf.keras.layers.Dense(num_classes)])
- prog.progress(0.2)
- model.compile(optimizer='adam',
- loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
- metrics=['accuracy'])
+ tempdir, trainingSplit, use_cuda):
+ self.use_cuda = use_cuda
+
+ # Enable GPU memory growth globally to avoid precondition errors
+ gpus = tf.config.list_physical_devices('GPU')
+ if gpus and self.use_cuda:
+ try:
+ for gpu in gpus:
+ tf.config.experimental.set_memory_growth(gpu, True)
+ except RuntimeError as e:
+ print(f"Could not set memory growth: {e}")
+ if not self.use_cuda:
+ tf.config.set_visible_devices([], 'GPU')
+ device = "gpu" if use_cuda else "cpu"
+ print(f"Using device: {device}")
+
+ # Dataset preparation (outside strategy scope)
+ ds_h5 = record['ds']
+ labelds_h5 = record['labelds']
+ # Fully load to memory and break h5py reference
+ ds_numpy = np.array(ds_h5[:])
+ labelds_numpy = np.array(labelds_h5[:])
+
+ strategy = tf.distribute.MirroredStrategy()
+ with strategy.scope():
+ num_classes = len(record['labels'])
+ model = tf.keras.Sequential([
+ tf.keras.layers.Rescaling(1.0 / 255),
+ tf.keras.layers.Conv2D(16, 3, padding='same', activation='relu'),
+ tf.keras.layers.MaxPooling2D(),
+ tf.keras.layers.Conv2D(32, 3, padding='same', activation='relu'),
+ tf.keras.layers.MaxPooling2D(),
+ tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu'),
+ tf.keras.layers.MaxPooling2D(),
+ tf.keras.layers.Flatten(),
+ tf.keras.layers.Dense(128, activation='relu'),
+ tf.keras.layers.Dense(num_classes)])
+ prog.progress(0.2)
+ model.compile(optimizer='adam',
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+ metrics=['accuracy'])
+
prog.progress(0.7)
- # generate split
- full_ds = tf.data.Dataset.from_tensor_slices((record['ds'], record['labelds']))
- full_ds = full_ds.shuffle(1000) # add seed=123 ?
- count = len(full_ds)
+ # generate split using numpy arrays
+ full_ds = tf.data.Dataset.from_tensor_slices((ds_numpy, labelds_numpy))
+ full_ds = full_ds.shuffle(1000)
+ count = len(ds_numpy)
train_size = int(count * trainingSplit)
if batchSize < 1:
batchSize = self.findOptimalBatchSize(model, full_ds, training=True)
@@ -85,24 +109,53 @@ def trainModelDetails(self, record, annotationName, batchSize, epochs, itemsAndA
self.saveModel(model, modelPath)
return history, modelPath
+ def _get_device(self, use_cuda):
+ if tf.config.list_physical_devices('GPU') and use_cuda:
+ return '/GPU:0'
+ return '/CPU:0'
+
def predictLabelsForItemDetails(
- self, batchSize, ds: h5py._hl.dataset.Dataset, item, model, prog,
+ self, batchSize, ds: h5py._hl.dataset.Dataset, indices, item, model, use_cuda, prog,
):
- # print(f'Tensorflow predictLabelsForItemDetails(batchSize={batchSize}, ...)')
if batchSize < 1:
batchSize = self.findOptimalBatchSize(
model, tf.data.Dataset.from_tensor_slices(ds), training=False,
)
print(f'Optimal batch size for prediction = {batchSize}')
- predictions = model.predict(
- ds,
- batch_size=batchSize,
- callbacks=[_LogTensorflowProgress(
- prog, (ds.shape[0] + batchSize - 1) // batchSize, 0.05, 0.35, item)])
- prog.item_progress(item, 0.4)
- # softmax to scale to 0 to 1
- catWeights = tf.nn.softmax(predictions)
- return catWeights, predictions
+
+ device = self._get_device(use_cuda)
+ with tf.device(device):
+ # Create a dataset that pairs the data with their indices
+ dataset = tf.data.Dataset.from_tensor_slices((ds, indices))
+ dataset = dataset.batch(batchSize)
+
+ # Initialize arrays to store results
+ all_predictions = []
+ all_cat_weights = []
+ all_indices = []
+
+ # Iterate through batches manually to keep track of indices
+ for data, batch_indices in dataset:
+ batch_predictions = model.predict(
+ data,
+ batch_size=batchSize,
+ verbose=0) # Set verbose=0 to avoid multiple progress bars
+
+ # Apply softmax to scale to 0 to 1
+ batch_cat_weights = tf.nn.softmax(batch_predictions)
+
+ all_predictions.append(batch_predictions)
+ all_cat_weights.append(batch_cat_weights)
+ all_indices.append(batch_indices)
+
+ prog.item_progress(item, 0.4)
+
+ # Concatenate all results
+ predictions = tf.concat(all_predictions, axis=0)
+ catWeights = tf.concat(all_cat_weights, axis=0)
+ final_indices = tf.concat(all_indices, axis=0)
+
+ return catWeights.numpy(), predictions.numpy(), final_indices.numpy().astype(np.int64)
def findOptimalBatchSize(self, model, ds, training) -> int:
if training and self.training_optimal_batchsize is not None:
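The TensorFlow prediction path above pairs each feature row with its index so that results stay aligned with superpixels even after batching. A condensed sketch of that pattern under the same assumptions (hypothetical names, plain Keras model):

    import tensorflow as tf

    def predict_with_indices(model, data, indices, batch_size):
        # Batch (sample, index) pairs together so each prediction keeps its original index.
        paired = tf.data.Dataset.from_tensor_slices((data, indices)).batch(batch_size)
        preds, idxs = [], []
        for batch, batch_idx in paired:
            preds.append(model(batch, training=False))
            idxs.append(batch_idx)
        return tf.concat(preds, axis=0).numpy(), tf.concat(idxs, axis=0).numpy()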
diff --git a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTorch.py b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTorch.py
index e06d247..e8acb68 100644
--- a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTorch.py
+++ b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTorch.py
@@ -66,12 +66,10 @@ class _BayesianPatchTorchModel(bbald.consistent_mc_dropout.BayesianModule):
# A Bayesian model that takes patches (2-dimensional shape) rather than vectors
# (1-dimensional shape) as input. It is useful when feature != 'vector' and
# SuperpixelClassificationBase.certainty == 'batchbald'.
- def __init__(self, num_classes: int) -> None:
+ def __init__(self, num_classes: int, device: torch.device) -> None:
# Set `self.device` as early as possible so that other code does not lock out
# what we want.
- self.device: str = torch.device(
- ('cuda' if torch.cuda.is_available() and torch.cuda.device_count() > 0 else 'cpu'),
- )
+ self.device: torch.device = device
# print(f'Initial model.device = {self.device}')
super(_BayesianPatchTorchModel, self).__init__()
@@ -134,18 +132,16 @@ class _VectorTorchModel(torch.nn.Module):
# (2-dimensional shape) as input. It is useful when feature == 'vector' and
# SuperpixelClassificationBase.certainty != 'batchbald'.
- def __init__(self, input_dim: int, num_classes: int) -> None:
+ def __init__(self, input_dim: int, num_classes: int, device: torch.device) -> None:
# Set `self.device` as early as possible so that other code does not lock out
# what we want.
- self.device: str = torch.device(
- ('cuda' if torch.cuda.is_available() and torch.cuda.device_count() > 0 else 'cpu'),
- )
+ self.device: torch.device = device
# print(f'Initial model.device = {self.device}')
super(_VectorTorchModel, self).__init__()
self.input_dim: int = input_dim
self.num_classes: int = num_classes
- self.fc: torch.Module = torch.nn.Linear(input_dim, num_classes)
+ self.fc: torch.nn.Linear = torch.nn.Linear(input_dim, num_classes)
def forward(self, input: torch.Tensor) -> torch.Tensor:
# TODO: Is torch.mul appropriate here?
@@ -161,20 +157,18 @@ class _BayesianVectorTorchModel(bbald.consistent_mc_dropout.BayesianModule):
# (2-dimensional shape) as input. It is useful when feature == 'vector' and
# SuperpixelClassificationBase.certainty == 'batchbald'.
- def __init__(self, input_dim: int, num_classes: int) -> None:
+ def __init__(self, input_dim: int, num_classes: int, device: torch.device) -> None:
# Set `self.device` as early as possible so that other code does not lock out
# what we want.
- self.device: str = torch.device(
- ('cuda' if torch.cuda.is_available() and torch.cuda.device_count() > 0 else 'cpu'),
- )
+ self.device = device
# print(f'Initial model.device = {self.device}')
super(_BayesianVectorTorchModel, self).__init__()
self.input_dim: int = input_dim
self.num_classes: int = num_classes
self.bayesian_samples: int = 12
- self.fc: torch.Module = torch.nn.Linear(input_dim, num_classes)
- self.fc_drop: torch.Module = bbald.consistent_mc_dropout.ConsistentMCDropout()
+ self.fc: torch.nn.Linear = torch.nn.Linear(input_dim, num_classes)
+ self.fc_drop: bbald.consistent_mc_dropout.ConsistentMCDropout = bbald.consistent_mc_dropout.ConsistentMCDropout()
def mc_forward_impl(self, input: torch.Tensor) -> torch.Tensor:
# TODO: Is torch.mul appropriate here?
@@ -311,14 +305,17 @@ def trainModelDetails(
prog: ProgressHelper,
tempdir: str,
trainingSplit: float,
+ cuda: bool,
):
+ device = torch.device("cuda" if cuda else "cpu")
+ print(f"Using device: {device}")
# make model
num_classes: int = len(record['labels'])
model: torch.nn.Module
if self.feature_is_image:
# Feature is patch
if self.certainty == 'batchbald':
- model = _BayesianPatchTorchModel(num_classes)
+ model = _BayesianPatchTorchModel(num_classes, device)
else:
mesg = 'Expected torch model for input of type image to be Bayesian'
raise ValueError(mesg)
@@ -326,9 +323,9 @@ def trainModelDetails(
# Feature is vector
input_dim: int = record['ds'].shape[1]
if self.certainty == 'batchbald':
- model = _BayesianVectorTorchModel(input_dim, num_classes)
+ model = _BayesianVectorTorchModel(input_dim, num_classes, device)
else:
- model = _VectorTorchModel(input_dim, num_classes)
+ model = _VectorTorchModel(input_dim, num_classes, device)
model.to(model.device)
# print(f'Torch trainModelDetails(batchSize={batchSize}, ...)')
@@ -348,6 +345,7 @@ def trainModelDetails(
val_ds: torch.utils.data.TensorDataset
train_dl: torch.utils.data.DataLoader
val_dl: torch.utils.data.DataLoader
+ prog.message('Loading features for model training')
train_arg1 = (
torch.from_numpy(record['ds'][train_indices].transpose((0, 3, 2, 1)))
if self.feature_is_image
@@ -507,7 +505,7 @@ def fitModel(
return history
def predictLabelsForItemDetails(
- self, batchSize: int, ds_h5, item, model: torch.nn.Module, prog: ProgressHelper,
+ self, batchSize: int, ds_h5, indices, item, model: torch.nn.Module, use_cuda: bool, prog: ProgressHelper,
):
# print(f'Torch predictLabelsForItemDetails(batchSize={batchSize}, ...)')
num_superpixels: int = ds_h5.shape[0]
@@ -517,6 +515,9 @@ def predictLabelsForItemDetails(
num_classes: int = model.num_classes
# print(f'{num_classes = }')
+ # TODO: derive a torch.device from use_cuda here and set it on model.device as well.
+
callbacks = [
_LogTorchProgress(prog, 1 + (num_superpixels - 1) // batchSize, 0.05, 0.35, item),
]
@@ -532,12 +533,13 @@ def predictLabelsForItemDetails(
for cb in callbacks:
cb.on_predict_begin(logs=logs)
+ # Pair each sample with its index so results can be mapped back to superpixels after batching.
ds: torch.utils.data.TensorDataset = torch.utils.data.TensorDataset(
(
torch.from_numpy(np.array(ds_h5).transpose((0, 3, 2, 1)))
if self.feature_is_image
else torch.from_numpy(np.array(ds_h5))
- ),
+ ), torch.from_numpy(indices),
)
if batchSize < 1:
batchSize = self.findOptimalBatchSize(model, ds, training=False)
@@ -545,6 +547,7 @@ def predictLabelsForItemDetails(
dl: torch.utils.data.DataLoader = torch.utils.data.DataLoader(ds, batch_size=batchSize)
predictions: NDArray[np.float_] = np.zeros((num_superpixels, bayesian_samples, num_classes))
catWeights: NDArray[np.float_] = np.zeros((num_superpixels, bayesian_samples, num_classes))
+ outIndices: NDArray[np.int64] = np.zeros(num_superpixels, dtype=np.int64)
with torch.no_grad():
model.eval() # Tell torch that we will be doing predictions
row: int = 0
@@ -567,6 +570,8 @@ def predictLabelsForItemDetails(
catWeights_raw = torch.nn.functional.softmax(predictions_raw, dim=-1)
predictions[row:new_row, :, :] = predictions_raw.detach().cpu().numpy()
catWeights[row:new_row, :, :] = catWeights_raw.detach().cpu().numpy()
+ outIndices[row:new_row] = data[1].detach().cpu().numpy().astype(np.int64)[:]
+
row = new_row
for cb in callbacks:
cb.on_predict_batch_end(i)
@@ -574,7 +579,7 @@ def predictLabelsForItemDetails(
cb.on_predict_end({'outputs': predictions})
prog.item_progress(item, 0.4)
# scale to units
- return catWeights, predictions
+ return catWeights, predictions, outIndices
def findOptimalBatchSize(
self, model: torch.nn.Module, ds: torch.utils.data.TensorDataset, training: bool,
@@ -651,9 +656,14 @@ def add_safe_globals(self):
def loadModel(self, modelPath):
self.add_safe_globals()
- model = torch.load(modelPath)
- model.eval()
- return model
+ try:
+ model = torch.load(modelPath, weights_only=False)
+ model.eval()
+ return model
+ except Exception as e:
+ print(f'Unable to load {modelPath}: {e}')
+ raise
+
def saveModel(self, model, modelPath):
self.add_safe_globals()
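The Torch models now receive their device from the caller instead of probing CUDA themselves. A minimal sketch of how a caller might resolve and apply it (illustrative only; resolve_device is not part of the patch):

    import torch

    def resolve_device(use_cuda: bool) -> torch.device:
        # Only select CUDA when it was requested and is actually available.
        return torch.device('cuda' if use_cuda and torch.cuda.is_available() else 'cpu')

    # model = _VectorTorchModel(input_dim, num_classes, resolve_device(use_cuda))
    # model.to(model.device)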
diff --git a/superpixel_classification/SuperpixelClassification/benchmarks/benchmark_torch.py b/superpixel_classification/SuperpixelClassification/benchmarks/benchmark_torch.py
new file mode 100644
index 0000000..617ae86
--- /dev/null
+++ b/superpixel_classification/SuperpixelClassification/benchmarks/benchmark_torch.py
@@ -0,0 +1,193 @@
+''' Benchmark script for the SuperpixelClassificationTorch class
+Originally written by feeding "tests/test_torch.py" to ChatGPT and asking for a timeit-based benchmark.
+'''
+import shutil
+import numpy as np
+import h5py
+import os
+import tempfile
+import timeit
+from unittest.mock import MagicMock
+import csv
+import matplotlib.pyplot as plt
+from datetime import datetime
+
+from superpixel_classification.SuperpixelClassification.SuperpixelClassificationBase import SuperpixelClassificationBase
+from superpixel_classification.SuperpixelClassification.SuperpixelClassificationTorch import SuperpixelClassificationTorch
+from superpixel_classification.SuperpixelClassification.progress_helper import ProgressHelper
+
+import argparse
+
+def parse_args():
+ parser = argparse.ArgumentParser(description="Benchmark SuperpixelClassificationTorch.")
+ parser.add_argument('--mnist-image-size', type=int, default=100, help='patchsize of individual images')
+ parser.add_argument('--color-dim', type=int, default=3, help='Number of color channels')
+ parser.add_argument('--image-sizes', nargs='+', type=int, default=[1000, 10000], help='Dataset sizes (number of images) to benchmark')
+ parser.add_argument('--epochs', default=3, type=int, help='Number of epochs to train')
+ parser.add_argument('--out-dir', default='benchmark_results', type=str, help='default output directory for benchmark results')
+
+ return parser.parse_args()
+
+
+def create_sample_data(num_images, tmpdir, image_size, color_dim):
+ h5_path = os.path.join(tmpdir, "test_data.h5")
+ images = np.random.randint(0, 255, size=(num_images, image_size, image_size, color_dim), dtype=np.uint8)
+
+ with h5py.File(h5_path, 'w') as f:
+ f.create_dataset('images', data=images)
+ f.create_dataset('used_indices', data=np.arange(num_images - 2))
+
+ return h5_path
+
+def train_model(num_images, num_epochs, h5_path):
+ base: SuperpixelClassificationBase = SuperpixelClassificationTorch()
+ base.feature_is_image = True
+ base.certainty = 'batchbald'
+
+ # Mock girder client
+ gc = MagicMock()
+ def mv_to_dst(_, dst):
+ return shutil.copy(h5_path, dst)
+ gc.downloadFile = MagicMock(side_effect=mv_to_dst)
+ def mv_to_src(_, src):
+ dst = os.path.dirname(os.path.dirname(h5_path))
+ return shutil.copy(src, dst)
+ gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value=True)
+
+ labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+ elem = {
+ 'girderId': 'test_girder_id',
+ 'categories': [
+ {"label": c} for c in labels
+ ],
+ 'values':
+ [] \
+ + np.random.randint(1, len(labels) - 1, size=(num_images - 2), dtype=np.uint8).tolist()
+ + [0, 0], # last two images unlabeled
+ 'transform': {'matrix': [[1.0]]}
+ }
+
+ item = {'_id': 'test_h5_file', 'name': 'test'}
+ annotrec = {'_id': '1', '_version': 0, 'annotation': {'name': 'TorchTest'}}
+ items = [(item, annotrec, elem)]
+
+ with ProgressHelper('Superpixel Classification', 'Test training', True) as prog:
+ prog.progress(0)
+ prog.items(items)
+ modelFile, modelTrainingFile = base.trainModel(
+ gc=gc,
+ annotationName="TorchTest",
+ itemsAndAnnot=items,
+ features={'test_h5_file': {'_id': 'feature_id', 'name': 'test_h5_file'}},
+ modelFolderId="test_folder_id",
+ batchSize=4,
+ epochs=num_epochs,
+ trainingSplit=0.5,
+ randomInput=False,
+ labelList='',
+ excludeLabelList=[],
+ prog=prog,
+ use_cuda=True,
+ )
+
+ return modelFile, modelTrainingFile
+
+def create_benchmark_plot(results, out_dir):
+ plt.figure(figsize=(12, 6))
+
+ # Number of image sizes and runs
+ n_sizes = len(results)
+ n_runs = len(results[0]['times'])
+
+ # Create positions for bars
+ ind = np.arange(n_sizes)
+ width = 0.25 # Width of bars
+
+ # Plot bars for each run
+ for i in range(n_runs):
+ times = [result['times'][i] for result in results]
+ plt.bar(ind + i*width, times, width, label=f'Run {i+1}')
+
+ plt.xlabel('Number of Images')
+ plt.ylabel('Time (seconds)')
+ plt.title('Model Training Benchmark Times')
+
+ # Set x-axis labels
+ plt.xticks(ind + width, [str(result['num_images']) for result in results])
+
+ plt.legend()
+ plt.tight_layout()
+
+ # Save plot
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+ dst_pth = os.path.join(out_dir, f'benchmark_results_{timestamp}.png')
+ plt.savefig(dst_pth)
+ plt.close()
+
+ return dst_pth
+
+def main():
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+ args = parse_args()
+ os.makedirs(args.out_dir, exist_ok=True)
+ csv_filename = os.path.join(args.out_dir, f'benchmark_results_{timestamp}.csv')
+ results = []
+
+ # Write CSV header
+ with open(csv_filename, 'w', newline='') as csvfile:
+ writer = csv.writer(csvfile)
+ writer.writerow(['Num Images', 'Run 1', 'Run 2', 'Run 3', 'Average', 'Best'])
+
+ for num_images in args.image_sizes:
+ print(f"\nBenchmarking with NUM_IMAGES = {num_images}")
+ with tempfile.TemporaryDirectory() as tmpdir:
+ h5_path = create_sample_data(num_images, tmpdir, args.mnist_image_size, args.color_dim)
+ timer = timeit.Timer(lambda: train_model(num_images, args.epochs, h5_path))
+
+ try:
+ times = timer.repeat(repeat=3, number=1)
+ avg_time = sum(times) / len(times)
+ best_time = min(times)
+
+ # Store results for plotting
+ results.append({
+ 'num_images': num_images,
+ 'times': times,
+ 'average': avg_time,
+ 'best': best_time
+ })
+
+ # Write results to CSV
+ with open(csv_filename, 'a', newline='') as csvfile:
+ writer = csv.writer(csvfile)
+ writer.writerow([
+ num_images,
+ round(times[0], 3),
+ round(times[1], 3),
+ round(times[2], 3),
+ round(avg_time, 3),
+ round(best_time, 3)
+ ])
+
+ print(f"Times for each run (seconds): {[round(t, 3) for t in times]}")
+ print(f"Average time (seconds): {round(avg_time, 3)}")
+ print(f"Best time (seconds): {round(best_time, 3)}")
+
+ except Exception as e:
+ print(f"Error during benchmark: {str(e)}")
+ # Write error to CSV
+ with open(csv_filename, 'a', newline='') as csvfile:
+ writer = csv.writer(csvfile)
+ writer.writerow([num_images, f"Error: {str(e)}", "", "", "", ""])
+
+ # Create and save the plot
+ out_file = create_benchmark_plot(results, args.out_dir)
+ print(f"\nResults saved to {csv_filename}")
+ print(f"Plot saved as {out_file}")
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
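Assuming the CLI defined above (with --image-sizes taking one or more dataset sizes), a typical benchmark run might look like:

    python benchmark_torch.py --image-sizes 1000 10000 --epochs 3 --out-dir benchmark_results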
diff --git a/superpixel_classification/SuperpixelClassification/tests/generate_MNIST_image.py b/superpixel_classification/SuperpixelClassification/tests/generate_MNIST_image.py
new file mode 100644
index 0000000..9d7e121
--- /dev/null
+++ b/superpixel_classification/SuperpixelClassification/tests/generate_MNIST_image.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python
+'''
+Generate a .tiff with numbers from MNIST
+'''
+
+import os
+import argparse
+import random
+
+import numpy as np
+import pandas as pd
+import tifffile
+from PIL import Image
+from torchvision.datasets import MNIST
+
+def parse_args():
+ # Parse arguments
+ parser = argparse.ArgumentParser(description="Generate a pyramidal MNIST image.")
+ parser.add_argument('--root_dataset_path', type=str, default="/data/aza4423_anders/mnist", help='Path to download and store MNIST dataset')
+ #parser.add_argument('--num_images', type=int, default=244 * 244, help='Number of random MNIST images to use')
+ parser.add_argument('--num_images', type=int, default=4, help='Number of random MNIST images to use')
+ parser.add_argument('--output_path', type=str, default="/data/aza4423_anders/aml-dsa/mnist_pyramid.tif", help='Output path for the pyramidal TIF file')
+ parser.add_argument('--test', default=False, action=argparse.BooleanOptionalAction,
+ help='use the MNIST test split instead of the training split')
+
+ args = parser.parse_args()
+
+ return args
+
+def d_to_rgb(d):
+ r = d & 0xFF
+ g = (d >> 8) & 0xFF
+ b = (d >> 16) & 0xFF
+ return [r, g, b]
+
+
+def create_mnist_image(root_dataset_path=".", num_images=100, output_path="./out", test=False, start_value=0):
+ # verify that num_images has an integer square root; otherwise we'd have to insert blank tiles for the uneven grid
+ assert int(np.sqrt(num_images)) ** 2 == num_images
+
+ # Download MNIST (if not already downloaded)
+ dataset = MNIST(root=root_dataset_path, train=not test, download=True)
+
+ # Select N random MNIST images (each image is PIL.Image in mode "L")
+ # oversample if we want more images than the length of MNIST
+ if num_images > len(dataset):
+ indices = random.choices(range(len(dataset)), k=num_images)
+ else:
+ indices = list(range(num_images))
+ random.shuffle(indices)
+
+ mnist_images = [np.array(dataset[i][0]) for i in indices] # each is 28x28, uint8
+ mnist_labels = [np.array(dataset[i][1]) for i in indices]
+
+ # Arrange the images in a grid (so num_images should be a number with an integer root)
+ tile_rows, tile_cols = int(np.sqrt(num_images)), int(np.sqrt(num_images))
+ tile_h, tile_w = mnist_images[0].shape # typically 28x28
+ grid_h, grid_w = tile_rows * tile_h, tile_cols * tile_w
+ base_image = np.zeros((grid_h, grid_w, 3), dtype=np.uint8)
+ pm_image = np.zeros((grid_h, grid_w, 3), dtype=np.uint8)
+
+ for idx, img in enumerate(mnist_images):
+ r = idx // tile_cols
+ c = idx % tile_cols
+ # convert img to RGB
+ rgb_img = np.stack([img, img, img], axis=-1)
+ base_image[r*tile_h:(r+1)*tile_h, c*tile_w:(c+1)*tile_w, :] = rgb_img
+
+ value_img = np.zeros((tile_h, tile_w, 3), dtype=np.uint8)
+ i = (idx + 1) * 2
+ rgb = d_to_rgb(i + start_value)
+ value_img[1:-1, 1:-1] = rgb
+ rgb = d_to_rgb(i + start_value + 1)
+ value_img[0, :] = rgb
+ value_img[-1, :] = rgb
+ value_img[:, 0] = rgb
+ value_img[:, -1] = rgb
+
+ pm_image[r*tile_h:(r+1)*tile_h, c*tile_w:(c+1)*tile_w, :] = value_img
+
+
+ # Note: We assume that the base level corresponds to 40x magnification.
+ # Now, build a pyramid (list of downsampled images).
+ pyramid_pm = [pm_image]
+ pm_current = pm_image.copy()
+
+ pyramid = [base_image]
+ current = base_image.copy()
+ # Continue downsampling by a factor of 2 until the smaller spatial dimension drops below 64 pixels.
+ while min(current.shape[:2]) >= 64:
+ # Use Pillow to resize the base image
+ im = Image.fromarray(current)
+ new_w, new_h = current.shape[1] // 2, current.shape[0] // 2
+ if new_w < 1 or new_h < 1:
+ break
+ im_resized = im.resize((new_w, new_h))
+ current = np.array(im_resized)
+ pyramid.append(current)
+
+ # Downsample the pixelmap with nearest-neighbour resampling so label values are not blended
+ im = Image.fromarray(pm_current)
+ new_w, new_h = pm_current.shape[1] // 2, pm_current.shape[0] // 2
+ if new_w < 1 or new_h < 1:
+ break
+ im_resized = im.resize((new_w, new_h), resample=Image.NEAREST)
+ pm_current = np.array(im_resized)
+ pyramid_pm.append(pm_current)
+
+ # Save the image as a pyramidal TIFF.
+ # The base image is the main image and the pyramid list (excluding the base) is saved as subIFDs.
+ output_filename = output_path # Use the output path from argument
+ if os.path.dirname(output_filename):
+ os.makedirs(os.path.dirname(output_filename), exist_ok=True)
+ if os.path.exists(output_filename):
+ os.remove(output_filename)
+
+ with tifffile.TiffWriter(output_filename, bigtiff=False) as tif:
+ tif.write(pyramid[0],
+ tile=(tile_w * 4, tile_h * 4),
+ photometric='RGB',
+ description='Whole-slide MNIST image at 40x magnification',
+ subifds=pyramid[1:])
+ print(f"Pyramidal TIFF saved as {output_filename}")
+
+ output_filename_pm = output_filename + ".pixelmap.tiff" # Use the output path from argument
+ if os.path.dirname(output_filename_pm):
+ os.makedirs(os.path.dirname(output_filename_pm), exist_ok=True)
+ if os.path.exists(output_filename_pm):
+ os.remove(output_filename_pm)
+ with tifffile.TiffWriter(output_filename_pm, bigtiff=False) as tif:
+ tif.write(pyramid_pm[0],
+ tile=(tile_w * 4, tile_h * 4),
+ photometric='RGB',
+ description='Pixelmap for Whole-slide MNIST image at 40x magnification',
+ subifds=pyramid_pm[1:])
+ print(f"Pyramidal TIFF saved as {output_filename_pm}")
+
+ # generate a corresponding CSV "cells" file
+ # with headers "x,y,w,h" for each image
+ csv_filename = output_filename + "_cells.csv"
+ with open(csv_filename, 'w') as f:
+ f.write("x,y,w,h,value\n")
+ i = 0
+ for r in range(tile_rows):
+ for c in range(tile_cols):
+ x, y = c * tile_w, r * tile_h
+ f.write(f"{x},{y},{tile_w},{tile_h},{mnist_labels[i]}\n")
+ i += 1
+ df = pd.read_csv(csv_filename, header=0)
+ print(f"Annotation file saved as {csv_filename}")
+ return output_filename, output_filename_pm, df
+
+if __name__ == "__main__":
+ _args = parse_args()
+ create_mnist_image(_args.root_dataset_path, _args.num_images, _args.output_path, _args.test)
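The pixelmap written here encodes each superpixel's value as a 24-bit little-endian integer split across the R, G, and B channels (see d_to_rgb above); the classification code decodes the same layout when it builds maskvals. A small sketch of the inverse, assuming d_to_rgb from this script is in scope:

    def rgb_to_d(rgb):
        # Inverse of d_to_rgb: recombine the three 8-bit channels into the original integer.
        r, g, b = rgb
        return r + (g << 8) + (b << 16)

    assert rgb_to_d(d_to_rgb(1234)) == 1234  # round trip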
diff --git a/superpixel_classification/SuperpixelClassification/tests/test_feature_extract.py b/superpixel_classification/SuperpixelClassification/tests/test_feature_extract.py
new file mode 100644
index 0000000..2c17864
--- /dev/null
+++ b/superpixel_classification/SuperpixelClassification/tests/test_feature_extract.py
@@ -0,0 +1,218 @@
+import os
+import shutil
+import sys
+import tempfile
+from unittest.mock import MagicMock
+
+import h5py
+import large_image
+import numpy as np
+import pytest
+
+# make pythonpath work out of the box - although your editor may complain
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(os.path.dirname(SCRIPT_DIR))
+
+from SuperpixelClassificationBase import SuperpixelClassificationBase
+from progress_helper import ProgressHelper
+from tests.generate_MNIST_image import create_mnist_image
+
+from xdg_base_dirs import xdg_cache_home
+
+NUM_IMAGES = 64
+
+@pytest.fixture(scope="session")
+def create_sample_data():
+ global NUM_IMAGES
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ tiff_path = os.path.join(tmpdirname, "test_mnist.tiff")
+ #tiff_path_pm = os.path.join(tmpdirname, "test_mnist.tiff.pixelmap.tiff")
+
+ tiff_path, tiff_path_pm, labels = create_mnist_image(
+ root_dataset_path=xdg_cache_home(),
+ num_images=NUM_IMAGES,
+ output_path=tiff_path,
+ test=False,
+ )
+ # 0 is background
+ labels['value'] = labels['value'] + 1
+
+ # we use yield so that the temporarydirectory is still open in the tests
+ yield tiff_path, tiff_path_pm, NUM_IMAGES, labels
+
+MNIST_IMAGE_SIZE = 28
+COLOR_DIM = 3
+
+def test_cutoff(create_sample_data):
+ global MNIST_IMAGE_SIZE, COLOR_DIM
+ test_image_pth, test_image_pth_pm, num_images, labels = create_sample_data
+ base = SuperpixelClassificationBase()
+
+ # Create test data
+ item = {
+ 'name': test_image_pth,
+ 'largeImage': {'fileId': 'test_image_id'}
+ }
+
+ # Mock girder client
+ gc = MagicMock()
+ def mv_to_dst(_, dst):
+ if "pixelmap" in dst:
+ if not os.path.exists(dst):
+ return shutil.copy(test_image_pth_pm, dst)
+ else:
+ if not os.path.exists(dst):
+ return shutil.copy(test_image_pth, dst)
+ return None
+ gc.downloadFile = MagicMock(side_effect=mv_to_dst)
+ gc.getItem = MagicMock(return_value={'name': test_image_pth_pm, 'largeImage': {'fileId': 'foobar'}})
+ def mv_to_src(_, src):
+ dst = os.path.dirname(test_image_pth)
+ return shutil.copy(src, dst)
+ gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value={'_id': 'test_file_id'})
+ #gc.uploadFileToFolder = MagicMock(return_value={'_id': 'test_file_id'})
+
+ bboxes = [[x, y, w + x, y + h] for _, (x, y, w, h) in labels[['x', 'y', 'w', 'h']].iterrows()]
+
+ elem = {
+ 'girderId': 'test_girder_id',
+ # last two images are left unlabeled (value 0)
+ 'values': list(labels['value'])[:-2] + [0, 0],
+ 'user': {
+ 'bbox': [item for sublist in bboxes for item in sublist]
+ },
+ 'transform': {'matrix': [[1.0]]}
+ }
+
+ filename = 'test_features.h5'
+ h5_file = os.path.join(os.path.dirname(test_image_pth), filename)
+ if os.path.exists(h5_file):
+ os.remove(h5_file)
+
+ assert not os.path.exists(h5_file)
+
+ cutoff = 1
+ with ProgressHelper('Superpixel Classification',
+ 'Test feature', False) as prog:
+ prog.progress(0)
+ prog.items([item])
+ result = base.createFeaturesForItem(
+ gc=gc,
+ item=item,
+ elem=elem,
+ featureFolderId='test_folder_id',
+ fileName=filename,
+ patchSize=MNIST_IMAGE_SIZE,
+ prog=prog,
+ cutoff=cutoff,
+ )
+
+ assert os.path.exists(h5_file), f"Output file {h5_file} does not exist"
+ with h5py.File(h5_file, 'r') as ffptr:
+ assert 'images' in ffptr
+ assert ffptr['images'].shape == (NUM_IMAGES - cutoff, MNIST_IMAGE_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM)
+ assert len(ffptr['used_indices']) == NUM_IMAGES - cutoff # all labeled samples plus `cutoff` unlabeled ones
+
+def test_create_features_for_item(create_sample_data):
+ global MNIST_IMAGE_SIZE, COLOR_DIM
+ test_image_pth, test_image_pth_pm, num_images, labels = create_sample_data
+ base = SuperpixelClassificationBase()
+
+ # Create test data
+ item = {
+ 'name': test_image_pth,
+ 'largeImage': {'fileId': 'test_image_id'}
+ }
+
+ # Mock girder client
+ gc = MagicMock()
+ def mv_to_dst(_, dst):
+ if "pixelmap" in dst:
+ if not os.path.exists(dst):
+ return shutil.copy(test_image_pth_pm, dst)
+ else:
+ if not os.path.exists(dst):
+ return shutil.copy(test_image_pth, dst)
+ return None
+ gc.downloadFile = MagicMock(side_effect=mv_to_dst)
+ gc.getItem = MagicMock(return_value={'name': test_image_pth_pm, 'largeImage': {'fileId': 'foobar'}})
+ def mv_to_src(_, src):
+ dst = os.path.dirname(test_image_pth)
+ return shutil.copy(src, dst)
+ gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value={'_id': 'test_file_id'})
+ #gc.uploadFileToFolder = MagicMock(return_value={'_id': 'test_file_id'})
+
+ bboxes = [[x, y, w + x, y + h] for _, (x, y, w, h) in labels[['x', 'y', 'w', 'h']].iterrows()]
+
+ elem = {
+ 'girderId': 'test_girder_id',
+ # last two images are left unlabeled (value 0)
+ 'values': list(labels['value'])[:-2] + [0, 0],
+ 'user': {
+ 'bbox': [item for sublist in bboxes for item in sublist]
+ },
+ 'transform': {'matrix': [[1.0]]}
+ }
+
+ filename = 'test_features.h5'
+ h5_file = os.path.join(os.path.dirname(test_image_pth), filename)
+ if os.path.exists(h5_file):
+ os.remove(h5_file)
+
+ assert not os.path.exists(h5_file)
+
+ with ProgressHelper('Superpixel Classification',
+ 'Test feature', False) as prog:
+ prog.progress(0)
+ prog.items([item])
+ result = base.createFeaturesForItem(
+ gc=gc,
+ item=item,
+ elem=elem,
+ featureFolderId='test_folder_id',
+ fileName=filename,
+ patchSize=MNIST_IMAGE_SIZE,
+ prog=prog,
+ cutoff=9999
+ )
+
+ assert os.path.exists(h5_file), f"Output file {h5_file} does not exist"
+ with h5py.File(h5_file, 'r') as ffptr:
+ assert 'images' in ffptr
+ assert ffptr['images'].shape == (num_images, MNIST_IMAGE_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM)
+ feature_img = ffptr['images'][0]
+ # open test_image_pth using coordinates [x,y,w,h] from elem['user']['bbox'][:4] and make sure it's pixel-equal with first_img
+ x, y, x2, y2 = elem['user']['bbox'][:4]
+ ts = large_image.getTileSource(test_image_pth)
+ orig_image = ts.getRegion(
+ region=dict(left=x, top=y, right=x2, bottom=y2),
+ format=large_image.tilesource.TILE_FORMAT_NUMPY
+ )[0]
+ orig_image = orig_image.astype(feature_img.dtype)
+ print(orig_image.dtype)
+ np.testing.assert_array_equal(orig_image, feature_img)
+
+ # also check that the last image matches
+ feature_img = ffptr['images'][-1]
+ x, y, x2, y2 = elem['user']['bbox'][-4:]
+ ts = large_image.getTileSource(test_image_pth)
+ orig_image = ts.getRegion(
+ region=dict(left=x, top=y, right=x2, bottom=y2),
+ format=large_image.tilesource.TILE_FORMAT_NUMPY
+ )[0]
+ orig_image = orig_image.astype(feature_img.dtype)
+ print(orig_image.dtype)
+ np.testing.assert_array_equal(orig_image, feature_img)
+
+ assert 'used_indices' in ffptr
+ assert len(ffptr['used_indices']) == num_images
+
+ # Assertions
+ assert result == h5_file
+ assert gc.downloadFile.call_count == 2 # Called for both image and mask
+ assert gc.getItem.call_count == 1
+ assert gc.uploadFileToFolder.call_count == 1
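Both tests build elem['user']['bbox'] as one flat list with four entries (left, top, right, bottom) per superpixel, which matches how createFeaturesForItem slices it. A tiny sketch of that convention (hypothetical helper):

    def bbox_for(elem, i):
        # Superpixel i occupies four consecutive entries: left, top, right, bottom.
        return elem['user']['bbox'][4 * i: 4 * i + 4]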
diff --git a/superpixel_classification/SuperpixelClassification/tests/test_full_training_cycle.py b/superpixel_classification/SuperpixelClassification/tests/test_full_training_cycle.py
new file mode 100644
index 0000000..03c6b8a
--- /dev/null
+++ b/superpixel_classification/SuperpixelClassification/tests/test_full_training_cycle.py
@@ -0,0 +1,524 @@
+'''
+This file contains tests for a full training cycle: extracting superpixels, training and evaluation.
+The "cycle" is:
+ 1. generate NUM_WSIS different whole slide images using numbers from MNIST.
+ 2. extract features from said images.
+ 3. train a model on the features.
+ 4. evaluate the model on the features.
+We expect an accuracy of at least 90%.
+
+This test is to verify that the training cycle works as expected.
+Since batching is involved, we use a larger number of samples here rather than the quick mini-tests found in the other files.
+'''
+import argparse
+import glob
+import json
+import os
+import re
+import shutil
+import sys
+import tempfile
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+from xdg_base_dirs import (xdg_cache_home, )
+
+# make pythonpath work out of the box - although your editor may complain
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(os.path.dirname(SCRIPT_DIR))
+
+from SuperpixelClassificationBase import SuperpixelClassificationBase
+from SuperpixelClassificationTensorflow import SuperpixelClassificationTensorflow
+from SuperpixelClassificationTorch import SuperpixelClassificationTorch
+from tests.generate_MNIST_image import create_mnist_image
+
+NUM_WSIS = 2
+MNIST_IMAGE_SIZE = 28
+NUM_IMAGES_PER_WSI = 10 ** 2
+COLOR_DIM = 3
+PATCH_SIZE = 100 # currently the only size compatible with the PyTorch model (its definition hard-codes layer sizes)
+NUM_EPOCHS = 5
+
+@pytest.fixture(scope="function")
+def create_sample_data(request):
+ global NUM_WSIS, NUM_IMAGES_PER_WSI
+ wsi_paths, pm_paths, list_labels = [], [], []
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ for i in range(NUM_WSIS):
+ tiff_path = os.path.join(tmpdirname, f"test_mnist_{i}.tiff")
+
+ tiff_path, tiff_path_pm, labels = create_mnist_image(
+ root_dataset_path=xdg_cache_home(),
+ num_images=NUM_IMAGES_PER_WSI,
+ output_path=tiff_path,
+ test=False,
+ start_value=request.param
+ )
+ # where labels['value'] == 0, put 10 instead, since 0 will be reserved for unlabeled
+ labels.loc[labels['value'] == 0, 'value'] = 10
+
+ wsi_paths.append(tiff_path)
+ pm_paths.append(tiff_path_pm)
+ list_labels.append(labels)
+
+ # we use yield so that the temporarydirectory is still open in the tests
+ yield wsi_paths, pm_paths, NUM_WSIS, list_labels
+
+@pytest.mark.skipif("RUNALL" not in os.environ, reason="this is a slow test (~5-10 min), run only if you want to")
+@pytest.mark.parametrize('create_sample_data', [0], indirect=True)
+def test_main_pytorch(create_sample_data):
+ global NUM_WSIS, PATCH_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM, NUM_EPOCHS
+ tiff_paths, tiff_path_pms, num_images, labels = create_sample_data
+ base: SuperpixelClassificationBase = SuperpixelClassificationTorch()
+
+ annotation_name = 'torchMNISTtest'
+ config = dict(
+ annotationDir = 'annotationdir',
+ annotationName = annotation_name,
+ batchSize = int(np.sqrt(NUM_IMAGES_PER_WSI)), # one row of the wsi at a time
+ certainty = 'batchbald',
+ cutoff = 600000, # plenty of space to allow all training samples
+ epochs = NUM_EPOCHS,
+ exclude = [],
+ feature = 'patch',
+ features = 'featuredir',
+ gensuperpixels = False,
+ girderApiUrl = 'http://localhost:8080/api/v1',
+ girderToken = '',
+ heatmaps = False,
+ images = 'imagedir',
+ labels = '',
+ magnification = 40.0,
+ modeldir = '',
+ numWorkers = 1,
+ patchSize = PATCH_SIZE,
+ radius = MNIST_IMAGE_SIZE,
+ randominput = False,
+ split = 0.7,
+ train = True,
+ useCuda = True,
+ progress = True,
+ )
+ args = argparse.Namespace(**config)
+
+ mnist_labels = ['default', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
+
+ items = []
+ for i in range(NUM_WSIS):
+ bboxes = [[x, y, w + x, y + h] for _, (x, y, w, h) in labels[i][['x', 'y', 'w', 'h']].iterrows()]
+ elem = {
+ 'girderId': f'test_girder_id{i}',
+ 'categories': [
+ {"label": c} for c in mnist_labels
+ ],
+ 'values': labels[i]['value'].tolist(),
+ 'user': {
+ 'bbox': [item for sublist in bboxes for item in sublist]
+ },
+ 'transform': {'matrix': [[1.0]]}
+ }
+ item = {
+ '_id': f'test_file{i}',
+ 'name': os.path.basename(tiff_paths[i]),
+ 'largeImage': {'fileId': f'test_image_id{i}'},
+ }
+ mask_item = {
+ '_id': f'test_file{i}',
+ 'name': '.tiff'.join(os.path.basename(tiff_path_pms[i]).split('.tiff')[:-1]),
+ 'largeImage': {'fileId': f'test_mask_id{i}'},
+ }
+ annotrec = {
+ '_id': f'test_file{i}',
+ '_version': 0,
+ 'annotation': {'name': 'TorchTest'},
+ }
+ items.append((item, annotrec, elem))
+
+
+ gc = MagicMock()
+ base.getItemsAndAnnotations = MagicMock(return_value=items)
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ def mv_to_dst(req_pth : str, dst : str):
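+            # mock for gc.downloadFile: copy the matching local test image, feature, or model file into dst instead of downloading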
+ if req_pth.startswith("test_"):
+ for f in tiff_paths + tiff_path_pms:
+ dpath = os.path.join(dst, os.path.basename(f))
+ if not os.path.exists(dpath) and os.path.basename(f) == os.path.basename(dst):
+ shutil.copy(f, dst)
+ print(f"Copied {f} to {dst}")
+ elif req_pth.startswith("feature"):
+ feature_files = glob.glob(os.path.join(tmpdirname, "*feature.h5"))
+ for f in feature_files:
+ dpath = os.path.join(dst, os.path.basename(f))
+ if not os.path.exists(dpath) and os.path.basename(f) == os.path.basename(dst):
+ shutil.copy(f, dst)
+ print(f"Copied {f} to {dst}")
+ elif req_pth.endswith("model"):
+ model_file = glob.glob(os.path.join(tmpdirname, f"*Model *{0}.pth"))[0]
+ shutil.copy(model_file, dst)
+ elif "modtraining" in req_pth:
+ model_file = glob.glob(os.path.join(tmpdirname, f"*ModTraining *{0}.h5"))[0]
+ shutil.copy(model_file, dst)
+ else:
+ print(f"Received unknown request path '{req_pth}'")
+ return {}
+
+ gc.downloadFile = MagicMock(side_effect=mv_to_dst)
+ def mv_to_src(req, src, reference=None):
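+            # mock for gc.uploadFileToFolder/uploadFileToItem: "upload" by copying src into tmpdirname so later steps can read it back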
+ shutil.copy(src, tmpdirname)
+ print(f"Copied {src} to {tmpdirname}")
+            # each WSI gets two separate .anot files. The if statement below gives them unique filenames so we can reference them later
+ if src.endswith(".anot"):
+ # extract the number at the end of req, which can look like "testfile1" or "testfile1000"
+ m = re.search(r'(\d+)$', req)
+ num = int(m.group(1))
+ s = os.path.basename(src).replace(".anot", f"_{num}.myanot")
+ shutil.copy(src, os.path.join(tmpdirname, s))
+ print(f"Also copied {s} to {tmpdirname}")
+ return {'_id': 'feature', 'name': os.path.basename(src)}
+ gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value=True)
+
+ gc.getItem = MagicMock(return_value=mask_item)
+
+ gc.listResource = MagicMock(return_value=[dict(name=f"{annotation_name}model", _id = 'model'), dict(name=f"{annotation_name}modtraining", _id = 'modtraining')])
+ gc.uploadFileToItem = MagicMock(side_effect=mv_to_src, return_value=True)
+ gc.getFolder = MagicMock(return_value=dict(name='test_folder', creatorId='creatorId', _id='test_folder_id'))
+
+ def list_file(req: str, limit: int = 0) -> iter:
+ if "modtraining" in req:
+ return iter([dict(name=req, _id = 'modtraining')])
+ else:
+ return iter([dict(name=req, _id='model')])
+ gc.listFile = MagicMock(side_effect=list_file)
+
+ base.main(args, gc)
+
+ for file in sorted(glob.glob(os.path.join(tmpdirname, f"*Predictions*.myanot"))):
+ assert os.path.exists(file)
+ with open(file, 'r') as f:
+ pred_json = json.load(f)
+ e = pred_json['elements'][0]
+ assert len(e['values']) == NUM_IMAGES_PER_WSI
+
+ assert len(e['user']['bbox']) == NUM_IMAGES_PER_WSI * 4 # 4 is for x,y,w,h
+
+ assert len(e['categories']) == len(mnist_labels) - 1 # -1 because we don't have a default category
+ assert len(e['user']['confidence']) == NUM_IMAGES_PER_WSI
+
+ # compare e['values'] to labels['values'], to make sure we've trained a valid model
+            # e['values'] are indices into e['categories'] (whose order may differ from mnist_labels), so map them back to label strings before comparing
+ file_num = int(file.split('Predictions_')[-1].split('.myanot')[0])
+ predicted_labels = np.array([e['categories'][c]['label'] for c in e['values']])
+ matches = (predicted_labels == np.array(list(map(str, labels[file_num]['value']))))
+ similarity = matches.sum() / len(matches)
+ expected_min_accuracy = 0.75
+ assert similarity > expected_min_accuracy, f"File {file}: Similarity between predicted values and GT is {similarity}, expected > {expected_min_accuracy}"
+ print(f"Similarity between predicted values and GT is {similarity}")
+
+@pytest.mark.skipif("RUNALL" not in os.environ, reason="this is a slow test (~1-10 min), run only if you want to")
+@pytest.mark.parametrize('create_sample_data', [0], indirect=True)
+def test_main_tf(create_sample_data):
+ global NUM_WSIS, PATCH_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM, NUM_EPOCHS
+ tiff_paths, tiff_path_pms, num_images, labels = create_sample_data
+ base: SuperpixelClassificationBase = SuperpixelClassificationTensorflow()
+
+ annotation_name = 'tensorflowMNISTtest'
+ config = dict(
+ annotationDir = 'annotationdir',
+ annotationName = annotation_name,
+ batchSize = int(np.sqrt(NUM_IMAGES_PER_WSI)), # one row of the wsi at a time
+ certainty = 'confidence',
+ cutoff = 600000, # plenty of space to allow all training samples
+ epochs = NUM_EPOCHS,
+ exclude = [],
+ feature = 'patch',
+ features = 'featuredir',
+ gensuperpixels = False,
+ girderApiUrl = 'http://localhost:8080/api/v1',
+ girderToken = '',
+ heatmaps = False,
+ images = 'imagedir',
+ labels = '',
+ magnification = 40.0,
+ modeldir = 'modeldir',
+ numWorkers = 1,
+ patchSize = PATCH_SIZE,
+ radius = MNIST_IMAGE_SIZE,
+ randominput = False,
+ split = 0.7,
+ train = True,
+ useCuda = False,
+ progress = True,
+ )
+ args = argparse.Namespace(**config)
+
+ mnist_labels = ['default', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
+
+ items = []
+ for i in range(NUM_WSIS):
+ bboxes = [[x, y, w + x, y + h] for _, (x, y, w, h) in labels[i][['x', 'y', 'w', 'h']].iterrows()]
+ elem = {
+ 'girderId': f'test_girder_id{i}',
+ 'categories': [
+ {"label": c} for c in mnist_labels
+ ],
+ 'values': labels[i]['value'].tolist(),
+ 'user': {
+ 'bbox': [item for sublist in bboxes for item in sublist]
+ },
+ 'transform': {'matrix': [[1.0]]}
+ }
+ item = {
+ '_id': f'test_file{i}',
+ 'name': os.path.basename(tiff_paths[i]),
+ 'largeImage': {'fileId': f'test_image_id{i}'},
+ }
+ mask_item = {
+ '_id': f'test_file{i}',
+ 'name': '.tiff'.join(os.path.basename(tiff_path_pms[i]).split('.tiff')[:-1]),
+ 'largeImage': {'fileId': f'test_mask_id{i}'},
+ }
+ annotrec = {
+ '_id': f'test_file{i}',
+ '_version': 0,
+ 'annotation': {'name': 'TorchTest'},
+ }
+ items.append((item, annotrec, elem))
+
+
+ gc = MagicMock()
+ base.getItemsAndAnnotations = MagicMock(return_value=items)
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ def mv_to_dst(req_pth : str, dst : str):
+ if req_pth.startswith("test_"):
+ for f in tiff_paths + tiff_path_pms:
+ dpath = os.path.join(dst, os.path.basename(f))
+ if not os.path.exists(dpath) and os.path.basename(f) == os.path.basename(dst):
+ shutil.copy(f, dst)
+ print(f"MockDownload: Copied {f} to {dst}")
+ elif req_pth.startswith("feature"):
+ feature_files = glob.glob(os.path.join(tmpdirname, "*feature.h5"))
+ for f in feature_files:
+ dpath = os.path.join(dst, os.path.basename(f))
+ if not os.path.exists(dpath) and os.path.basename(f) == os.path.basename(dst):
+ shutil.copy(f, dst)
+ print(f"MockDownload: Copied {f} to {dst}")
+ elif req_pth.endswith("model"):
+ model_file = glob.glob(os.path.join(tmpdirname, f"*Model *{0}.h5"))[0]
+ shutil.copy(model_file, dst)
+ elif "modtraining" in req_pth:
+ model_file = glob.glob(os.path.join(tmpdirname, f"*ModTraining *{0}.h5"))[0]
+ shutil.copy(model_file, dst)
+ else:
+ raise RuntimeError(f"Received unknown request path '{req_pth}'")
+ return {}
+
+ gc.downloadFile = MagicMock(side_effect=mv_to_dst)
+ def mv_to_src(req, src, reference=None):
+ shutil.copy(src, tmpdirname)
+ print(f"MockUpload: Copied {src} to {tmpdirname}")
+            # each WSI gets two separate .anot files. The if statement below gives them unique filenames so we can reference them later
+ if src.endswith(".anot"):
+ # extract the number at the end of req, which can look like "testfile1" or "testfile1000"
+ m = re.search(r'(\d+)$', req)
+ num = int(m.group(1))
+ s = os.path.basename(src).replace(".anot", f"_{num}.myanot")
+ shutil.copy(src, os.path.join(tmpdirname, s))
+ print(f"Also copied {s} to {tmpdirname}")
+ return {'_id': 'feature', 'name': os.path.basename(src)}
+ gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value=True)
+
+ gc.getItem = MagicMock(return_value=mask_item)
+
+ modelName = f"{annotation_name} Model Epoch 0.h5"
+ modTrainingName = f"{annotation_name} ModTraining Epoch 0.h5"
+ gc.listResource = MagicMock(return_value=[dict(name=modelName, _id = 'model'), dict(name=modTrainingName, _id = 'modtraining')])
+ gc.uploadFileToItem = MagicMock(side_effect=mv_to_src, return_value=True)
+ gc.getFolder = MagicMock(return_value=dict(name='test_folder', creatorId='creatorId', _id='test_folder_id'))
+
+ def list_file(req: str, limit: int = 0) -> iter:
+ if "modtraining" in req:
+ return iter([dict(name=modTrainingName, _id = 'modtraining')])
+ else:
+ return iter([dict(name=modelName, _id='model')])
+ gc.listFile = MagicMock(side_effect=list_file)
+
+ base.main(args, gc)
+
+ for file in sorted(glob.glob(os.path.join(tmpdirname, f"*Predictions*.myanot"))):
+ assert os.path.exists(file)
+ with open(file, 'r') as f:
+ pred_json = json.load(f)
+ e = pred_json['elements'][0]
+ assert len(e['values']) == NUM_IMAGES_PER_WSI
+
+ assert len(e['user']['bbox']) == NUM_IMAGES_PER_WSI * 4 # 4 is for x,y,w,h
+
+ assert len(e['categories']) == len(mnist_labels) - 1 # exclude the default category
+ assert len(e['user']['confidence']) == NUM_IMAGES_PER_WSI
+
+ # compare e['values'] to labels['values'], to make sure we've trained a valid model
+            # e['values'] are indices into e['categories'] (whose order may differ from mnist_labels), so map them back to label strings before comparing
+ file_num = int(file.split('Predictions_')[-1].split('.myanot')[0])
+ predicted_labels = np.array([e['categories'][c]['label'] for c in e['values']])
+ matches = (predicted_labels == np.array(list(map(str, labels[file_num]['value']))))
+ similarity = matches.sum() / len(matches)
+ expected_min_accuracy = 0.75
+ assert similarity > expected_min_accuracy, f"File {file}: Similarity between predicted values and GT is {similarity}, expected > {expected_min_accuracy}"
+ print(f"Similarity between predicted values and GT is {similarity}")
+
+@pytest.mark.skipif("RUNALL" not in os.environ, reason="this is a slow test (~1-10 min), run only if you want to")
+@pytest.mark.parametrize('create_sample_data', [2], indirect=True)
+def test_main_tf_with_background(create_sample_data):
+ global NUM_WSIS, PATCH_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM, NUM_EPOCHS
+ tiff_paths, tiff_path_pms, num_images, labels = create_sample_data
+ base: SuperpixelClassificationBase = SuperpixelClassificationTensorflow()
+
+ annotation_name = 'tensorflowMNISTtest'
+ config = dict(
+ annotationDir = 'annotationdir',
+ annotationName = annotation_name,
+ batchSize = int(np.sqrt(NUM_IMAGES_PER_WSI)), # one row of the wsi at a time
+ certainty = 'confidence',
+ cutoff = 600000, # plenty of space to allow all training samples
+ epochs = NUM_EPOCHS,
+ exclude = [],
+ feature = 'patch',
+ features = 'featuredir',
+ gensuperpixels = False,
+ girderApiUrl = 'http://localhost:8080/api/v1',
+ girderToken = '',
+ heatmaps = False,
+ images = 'imagedir',
+ labels = '',
+ magnification = 40.0,
+ modeldir = 'modeldir',
+ numWorkers = 1,
+ patchSize = PATCH_SIZE,
+ radius = MNIST_IMAGE_SIZE,
+ randominput = False,
+ split = 0.7,
+ train = True,
+ useCuda = False,
+ progress = True,
+ )
+ args = argparse.Namespace(**config)
+
+ mnist_labels = ['default', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
+
+ items = []
+ for i in range(NUM_WSIS):
+ bboxes = [[x, y, w + x, y + h] for _, (x, y, w, h) in labels[i][['x', 'y', 'w', 'h']].iterrows()]
+ elem = {
+ 'girderId': f'test_girder_id{i}',
+ 'categories': [
+ {"label": c} for c in mnist_labels
+ ],
+ 'values': [0] + labels[i]['value'].tolist(),
+ 'user': {
+ 'bbox': [0,0,1,1] + [item for sublist in bboxes for item in sublist]
+ },
+ 'transform': {'matrix': [[1.0]]}
+ }
+ item = {
+ '_id': f'test_file{i}',
+ 'name': os.path.basename(tiff_paths[i]),
+ 'largeImage': {'fileId': f'test_image_id{i}'},
+ }
+ mask_item = {
+ '_id': f'test_file{i}',
+ 'name': '.tiff'.join(os.path.basename(tiff_path_pms[i]).split('.tiff')[:-1]),
+ 'largeImage': {'fileId': f'test_mask_id{i}'},
+ }
+ annotrec = {
+ '_id': f'test_file{i}',
+ '_version': 0,
+ 'annotation': {'name': 'TorchTest'},
+ }
+ items.append((item, annotrec, elem))
+
+
+ gc = MagicMock()
+ base.getItemsAndAnnotations = MagicMock(return_value=items)
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ def mv_to_dst(req_pth : str, dst : str):
+ if req_pth.startswith("test_"):
+ for f in tiff_paths + tiff_path_pms:
+ dpath = os.path.join(dst, os.path.basename(f))
+ if not os.path.exists(dpath) and os.path.basename(f) == os.path.basename(dst):
+ shutil.copy(f, dst)
+ print(f"MockDownload: Copied {f} to {dst}")
+ elif req_pth.startswith("feature"):
+ feature_files = glob.glob(os.path.join(tmpdirname, "*feature.h5"))
+ for f in feature_files:
+ dpath = os.path.join(dst, os.path.basename(f))
+ if not os.path.exists(dpath) and os.path.basename(f) == os.path.basename(dst):
+ shutil.copy(f, dst)
+ print(f"MockDownload: Copied {f} to {dst}")
+ elif req_pth.endswith("model"):
+ model_file = glob.glob(os.path.join(tmpdirname, f"*Model *{0}.h5"))[0]
+ shutil.copy(model_file, dst)
+ elif "modtraining" in req_pth:
+ model_file = glob.glob(os.path.join(tmpdirname, f"*ModTraining *{0}.h5"))[0]
+ shutil.copy(model_file, dst)
+ else:
+ raise RuntimeError(f"Received unknown request path '{req_pth}'")
+ return {}
+
+ gc.downloadFile = MagicMock(side_effect=mv_to_dst)
+ def mv_to_src(req, src, reference=None):
+ shutil.copy(src, tmpdirname)
+ print(f"MockUpload: Copied {src} to {tmpdirname}")
+            # each WSI gets two separate .anot files. The if statement below gives them unique filenames so we can reference them later
+ if src.endswith(".anot"):
+ # extract the number at the end of req, which can look like "testfile1" or "testfile1000"
+ m = re.search(r'(\d+)$', req)
+ num = int(m.group(1))
+ s = os.path.basename(src).replace(".anot", f"_{num}.myanot")
+ shutil.copy(src, os.path.join(tmpdirname, s))
+ print(f"Also copied {s} to {tmpdirname}")
+ return {'_id': 'feature', 'name': os.path.basename(src)}
+ gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value=True)
+
+ gc.getItem = MagicMock(return_value=mask_item)
+
+ modelName = f"{annotation_name} Model Epoch 0.h5"
+ modTrainingName = f"{annotation_name} ModTraining Epoch 0.h5"
+ gc.listResource = MagicMock(return_value=[dict(name=modelName, _id = 'model'), dict(name=modTrainingName, _id = 'modtraining')])
+ gc.uploadFileToItem = MagicMock(side_effect=mv_to_src, return_value=True)
+ gc.getFolder = MagicMock(return_value=dict(name='test_folder', creatorId='creatorId', _id='test_folder_id'))
+
+ def list_file(req: str, limit: int = 0) -> iter:
+ if "modtraining" in req:
+ return iter([dict(name=modTrainingName, _id = 'modtraining')])
+ else:
+ return iter([dict(name=modelName, _id='model')])
+ gc.listFile = MagicMock(side_effect=list_file)
+
+ base.main(args, gc)
+
+ for file in sorted(glob.glob(os.path.join(tmpdirname, f"*Predictions*.myanot"))):
+ assert os.path.exists(file)
+ with open(file, 'r') as f:
+ pred_json = json.load(f)
+ e = pred_json['elements'][0]
+ assert len(e['values']) == NUM_IMAGES_PER_WSI + 1
+
+ assert len(e['user']['bbox']) == (NUM_IMAGES_PER_WSI + 1) * 4 # 4 is for x,y,w,h
+
+ assert len(e['categories']) == len(mnist_labels) - 1 # exclude the default category
+ assert len(e['user']['confidence']) == (NUM_IMAGES_PER_WSI + 1)
+
+ # compare e['values'] to labels['values'], to make sure we've trained a valid model
+            # e['values'] are indices into e['categories'] (whose order may differ from mnist_labels), so map them back to label strings before comparing
+ file_num = int(file.split('Predictions_')[-1].split('.myanot')[0])
+ predicted_labels = np.array([e['categories'][c]['label'] for c in e['values']])
+ assert e['values'][0] == 0, "Background should have prediction 0"
+ matches = (predicted_labels == np.array([e['values'][0]] + list(map(str, labels[file_num]['value']))))
+ similarity = matches.sum() / len(matches)
+ expected_min_accuracy = 0.75
+ assert similarity > expected_min_accuracy, f"File {file}: Similarity between predicted values and GT is {similarity}, expected > {expected_min_accuracy}"
+ print(f"Similarity between predicted values and GT is {similarity}")
diff --git a/superpixel_classification/SuperpixelClassification/tests/test_gen_superpixels.py b/superpixel_classification/SuperpixelClassification/tests/test_gen_superpixels.py
new file mode 100644
index 0000000..5fc814f
--- /dev/null
+++ b/superpixel_classification/SuperpixelClassification/tests/test_gen_superpixels.py
@@ -0,0 +1,164 @@
+import os
+import shutil
+import sys
+import tempfile
+from unittest.mock import MagicMock
+
+import h5py
+import large_image
+import numpy as np
+import pytest
+from PIL import Image
+from tifffile import tifffile
+
+# make pythonpath work out of the box - although your editor may complain
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(os.path.dirname(SCRIPT_DIR))
+
+from SuperpixelClassificationBase import SuperpixelClassificationBase
+from progress_helper import ProgressHelper
+from tests.generate_MNIST_image import create_mnist_image
+
+from xdg_base_dirs import ( xdg_cache_home, )
+
+NUM_IMAGES : int = 9
+IMAGE_SIZE : int = 16 # 16 is the smallest tile size for .TIFFs, although we could operate within a single tile, too.
+COLOR_DIM = 3
+
+
+def d_to_rgb(d):
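+    # unpack a packed integer into its [R, G, B] byte components (low byte is red)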
+ r = d & 0xFF
+ g = (d >> 8) & 0xFF
+ b = (d >> 16) & 0xFF
+ return [r, g, b]
+
+@pytest.fixture(scope="session")
+def create_sample_data():
+ '''
+ Create a sample WSI for testing.
+ '''
+ global NUM_IMAGES, IMAGE_SIZE
+ num_images = NUM_IMAGES
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ output_filename = os.path.join(tmpdirname, "test.tiff")
+
+ if os.path.dirname(output_filename):
+ os.makedirs(os.path.dirname(output_filename), exist_ok=True)
+ if os.path.exists(output_filename):
+ os.remove(output_filename)
+
+        # Arrange the images in a grid (so num_images should be a perfect square)
+ tile_rows, tile_cols = int(np.sqrt(num_images)), int(np.sqrt(num_images))
+ tile_h, tile_w = 16, 16
+ grid_h, grid_w = tile_rows * tile_h, tile_cols * tile_w
+ base_image = np.zeros((grid_h, grid_w, 3), dtype=np.uint8)
+
+ vals = np.array([0, 127, 255], dtype=np.uint8)
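+        # build all combinations of {0, 127, 255} over R, G, B (27 colors) and keep the first NUM_IMAGES as distinct tile colors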
+ colors = np.stack(np.meshgrid(vals, vals, vals), axis=-1).reshape(-1, 3)[:NUM_IMAGES]
+ images = np.tile(colors[:, None, None, :], (1, IMAGE_SIZE, IMAGE_SIZE, 1))
+
+ for idx, img in enumerate(images):
+ r = idx // tile_cols
+ c = idx % tile_cols
+ base_image[r*tile_h:(r+1)*tile_h, c*tile_w:(c+1)*tile_w, :] = img
+
+ pyramid = [base_image]
+ current = base_image.copy()
+        while min(current.shape[:2]) >= 64:
+            # Use Pillow to resize for downsampling
+ im = Image.fromarray(current)
+ new_w, new_h = current.shape[1] // 2, current.shape[0] // 2
+ if new_w < 1 or new_h < 1:
+ break
+ im_resized = im.resize((new_w, new_h))
+ current = np.array(im_resized)
+ pyramid.append(current)
+
+ # Save the image as a pyramidal TIFF.
+ # The base image is the main image and the pyramid list (excluding the base) is saved as subIFDs.
+ if os.path.dirname(output_filename):
+ os.makedirs(os.path.dirname(output_filename), exist_ok=True)
+ if os.path.exists(output_filename):
+ os.remove(output_filename)
+
+ with tifffile.TiffWriter(output_filename, bigtiff=False) as tif:
+ tif.write(pyramid[0],
+ tile=(tile_w * 4, tile_h * 4),
+ photometric='RGB',
+ description='Whole-slide MNIST image at 40x magnification',
+ subifds=pyramid[1:])
+ print(f"Pyramidal TIFF saved as {output_filename}")
+
+        # we use yield so that the temporary directory is still open in the tests
+        yield output_filename, images
+
+def test_gen_superpixel(create_sample_data):
+ global IMAGE_SIZE, COLOR_DIM
+ test_image_pth, test_images = create_sample_data
+ base = SuperpixelClassificationBase()
+
+ # Create test data
+ item = {
+ "_id": "test_item_id",
+ 'largeImage': {'fileId': 'test_image_id'},
+ 'name': test_image_pth,
+ }
+
+ # Mock girder client
+ gc = MagicMock()
+ def mv_to_dst(_, dst):
+ if not os.path.exists(os.path.join(dst, test_image_pth)):
+ shutil.copy(test_image_pth, dst)
+ print(">>> Copied file from", test_image_pth, "to", dst)
+ return None
+ gc.downloadFile = MagicMock(side_effect=mv_to_dst)
+ gc.getItem = MagicMock(return_value={'name': test_image_pth, 'largeImage': {'fileId': 'foobar'}})
+ def mv_to_src(_, src):
+ dst = os.path.dirname(test_image_pth)
+ if not os.path.exists(os.path.join(dst, src)):
+ shutil.copy(src, dst)
+ print(">>> Copied file from", src, "to", dst)
+ return {'itemId': 'uploaded_item_id'}
+ gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value={'_id': 'test_file_id'})
+ #gc.uploadFileToFolder = MagicMock(return_value={'_id': 'test_file_id'})
+
+ #bboxes = [[x, y, w + x, y + h] for _, (x, y, w, h) in labels[['x', 'y', 'w', 'h']].iterrows()]
+ bboxes = [[x, x, x + IMAGE_SIZE, x + IMAGE_SIZE] for x in range(0, NUM_IMAGES, IMAGE_SIZE)]
+
+ with ProgressHelper( 'Superpixel Classification',
+ 'Test feature', False) as prog:
+ prog.progress(0)
+ prog.items([item])
+ result = base.createSuperpixelsForItem(
+ gc=gc,
+ annotationName="TorchTest",
+ item=item,
+ radius=IMAGE_SIZE,
+ magnification=40,
+ annotationFolderId='annotation_folder_id',
+ userId="user_id",
+ prog=prog,
+ )
+
+ out_pixelmap_file = os.path.join(os.path.dirname(test_image_pth), '%s.pixelmap.tiff' % item['name'])
+ assert os.path.exists(out_pixelmap_file), f"Output file {out_pixelmap_file} does not exist"
+ x, y, x2, y2 = 0, 0, IMAGE_SIZE, IMAGE_SIZE
+ ts = large_image.getTileSource(test_image_pth)
+ orig_image = ts.getRegion(
+ region=dict(left=x, top=y, right=x2, bottom=y2),
+ format=large_image.tilesource.TILE_FORMAT_NUMPY
+ )[0]
+    # test that all values in orig_image are equal to 0
+ # TODO: waiting for another PR: want this to be 1
+ assert np.all(orig_image == 0)
+
+ feature_img = test_images[-1]
+ x, y, x2, y2 = IMAGE_SIZE * (IMAGE_SIZE - 1), IMAGE_SIZE * (IMAGE_SIZE - 1), IMAGE_SIZE * IMAGE_SIZE, IMAGE_SIZE * IMAGE_SIZE
+ ts = large_image.getTileSource(test_image_pth)
+ orig_image = ts.getRegion(
+ region=dict(left=x, top=y, right=x2, bottom=y2),
+ format=large_image.tilesource.TILE_FORMAT_NUMPY
+ )[0]
+ orig_image = orig_image.astype(feature_img.dtype)
+ # TODO: same as TODO above
+ assert np.all(orig_image == NUM_IMAGES - 1)
\ No newline at end of file
diff --git a/superpixel_classification/SuperpixelClassification/tests/test_predict.py b/superpixel_classification/SuperpixelClassification/tests/test_predict.py
new file mode 100644
index 0000000..9341a90
--- /dev/null
+++ b/superpixel_classification/SuperpixelClassification/tests/test_predict.py
@@ -0,0 +1,254 @@
+import json
+import os
+import shutil
+import tempfile
+from unittest.mock import MagicMock
+
+import h5py
+import numpy as np
+import pytest
+import torch
+
+# make pythonpath work out of the box - although your editor may complain
+import sys
+import os
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(os.path.dirname(SCRIPT_DIR))
+
+from SuperpixelClassificationBase import SuperpixelClassificationBase
+from SuperpixelClassificationTorch import SuperpixelClassificationTorch, _BayesianPatchTorchModel
+from progress_helper import ProgressHelper
+from tests.validate_json_annotation import validate_json_file
+
+# currently, torch model only supports 100x100
+MNIST_IMAGE_SIZE=100
+COLOR_DIM = 3
+NUM_IMAGES = 64
+CUTOFF_IMAGES = 2
+
+@pytest.fixture(scope="session")
+def create_sample_data():
+ global NUM_IMAGES, CUTOFF_IMAGES
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ h5_path = os.path.join(tmpdirname, "test_data.h5")
+
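+        # the feature file holds CUTOFF_IMAGES fewer images than the annotation has values; 'used_indices' records which superpixels actually have features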
+ images = np.random.randint(0, 255, size=(NUM_IMAGES - CUTOFF_IMAGES, MNIST_IMAGE_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM), dtype=np.uint8)
+ indices = np.arange(NUM_IMAGES - CUTOFF_IMAGES)
+ assert images.shape[0] == indices.shape[0]
+
+ with h5py.File(h5_path, 'w') as f:
+ f.create_dataset('images', data=images)
+ f.create_dataset('used_indices', data=indices, dtype='i')
+
+        # we use yield so that the temporary directory is still open in the tests
+        yield h5_path
+
+'''
+This test checks predictions on a dataset that is labeled with only two of the ten categories.
+'''
+def test_subset_labels(create_sample_data):
+ global NUM_IMAGES, CUTOFF_IMAGES
+ h5_path = create_sample_data
+ base: SuperpixelClassificationBase = SuperpixelClassificationTorch()
+ base.certainty = 'batchbald'
+ base.feature_is_image = True
+ # Mock girder client
+ gc = MagicMock()
+ def mv_to_dst(_, dst):
+ return shutil.copy(h5_path, dst)
+ gc.downloadFile = MagicMock(side_effect=mv_to_dst)
+ gc.uploadFileToItem = MagicMock()
+
+ feature = {
+ '_id': '0',
+ 'name': 'my_test_feature'
+ }
+ labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+ annotrec = {
+ 'annotation': {
+ 'attributes': {},
+ 'name': 'TorchTest',
+ },
+ }
+
+    # make a list alternating between values 1 and 3, with NUM_IMAGES entries
+ value_list = [1, 3] * (NUM_IMAGES // 2)
+
+ elem = {
+ "type": "pixelmap",
+ "girderId": "6838aab654f0ca783ff03871",
+ "transform": {"matrix": [[1.0, 0], [0, 1.0]]},
+ 'values': value_list,
+ 'categories' : [{"label": k, "fillColor": "rgba(0,0,0,0)"} for k in labels],
+ "boundaries": True,
+ "id": "myid",
+ 'user': { },
+ }
+
+ groups = { k: {"label": k, "fillColor": "rgba(0,0,0,0)", "strokeColor": "rgba(0,0,0,0)" } for k in labels }
+
+ device = torch.device("cpu")
+ model = _BayesianPatchTorchModel(len(labels), device)
+ model.device = device
+
+ items = [(feature, annotrec, elem)]
+ item = {'_id': 0, 'name': 'my_item', 'largeImage': {'fileId': 'test_image_id'}}
+ with ProgressHelper( 'Superpixel Classification',
+ 'Test feature', False) as prog:
+ prog.progress(0)
+ prog.items(items)
+
+ annotation_name = 'testannotation'
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ base.predictLabelsForItem(
+ gc=gc,
+ annotationName=annotation_name,
+ tempdir=tmpdirname,
+ model=model,
+ item=item,
+ annotrec=annotrec,
+ elem=elem,
+ feature=feature,
+ curEpoch=0,
+ userId='user_id',
+ labels=labels,
+ groups=groups,
+ makeHeatmaps=False,
+ radius=-1,
+ magnification=40.0,
+ certainty='batchbald',
+ batchSize=NUM_IMAGES,
+ use_cuda = False,
+ prog=prog,
+ )
+ out_pth = os.path.join(tmpdirname, '%s Epoch 0 Predictions.anot' % annotation_name)
+ assert os.path.exists(out_pth), "Output file %s does not exist" % out_pth
+ with open(out_pth, 'r') as f:
+ pred_json = json.load(f)
+ e = pred_json['elements'][0]
+ assert len(e['values']) == NUM_IMAGES
+ for i in range(1, CUTOFF_IMAGES):
+ assert e['values'][-i] == 0, "Expected unknown/none label for cutoff images"
+ assert len(e['categories']) == len(labels)
+ assert len(e['user']['confidence']) == NUM_IMAGES
+ assert len(e['user']['categoryConfidence']) == NUM_IMAGES
+ assert len(e['user']['categoryConfidence'][0]) == len(labels)
+ assert len(e['user']['certainty']) == NUM_IMAGES
+ for i in range(1, CUTOFF_IMAGES):
+ assert e['user']['certainty'][-i] > 10000, "Expected certainty to be very high for unlabeled samples to ensure they occur last in the AL filmstrip (DSA)"
+ assert 'percentiles' in e['user']['certainty_info']
+ assert 'cdf' in e['user']['certainty_info']
+
+ validate_json_file(out_pth)
+
+ out_pth = os.path.join(tmpdirname, '%s Epoch 1.anot' % annotation_name)
+ assert os.path.exists(out_pth), "Output file %s does not exist" % out_pth
+ with open(out_pth, 'r') as f:
+ annotation_file = json.load(f)
+ e = annotation_file['elements'][0]
+ assert len(e['values']) == NUM_IMAGES
+ assert len(e['categories']) == len(labels)
+
+ validate_json_file(out_pth)
+
+def test_predict_unlabeled_with_cutoff(create_sample_data):
+ global NUM_IMAGES, CUTOFF_IMAGES
+ h5_path = create_sample_data
+ base: SuperpixelClassificationBase = SuperpixelClassificationTorch()
+ base.certainty = 'batchbald'
+ base.feature_is_image = True
+ # Mock girder client
+ gc = MagicMock()
+ def mv_to_dst(_, dst):
+ return shutil.copy(h5_path, dst)
+ gc.downloadFile = MagicMock(side_effect=mv_to_dst)
+ gc.uploadFileToItem = MagicMock()
+
+ feature = {
+ '_id': '0',
+ 'name': 'my_test_feature'
+ }
+ labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+ annotrec = {
+ 'annotation': {
+ 'attributes': {},
+ 'name': 'TorchTest',
+ },
+ }
+
+ elem = {
+ "type": "pixelmap",
+ "girderId": "6838aab654f0ca783ff03871",
+ "transform": {"matrix": [[1.0, 0], [0, 1.0]]},
+ 'values': [0] * NUM_IMAGES,
+ 'categories' : [{"label": k, "fillColor": "rgba(0,0,0,0)"} for k in labels],
+ "boundaries": True,
+ "id": "myid",
+ 'user': { },
+ }
+
+ groups = { k: {"label": k, "fillColor": "rgba(0,0,0,0)", "strokeColor": "rgba(0,0,0,0)" } for k in labels }
+
+ device = torch.device("cpu")
+ model = _BayesianPatchTorchModel(len(labels), device)
+ model.device = device
+
+ items = [(feature, annotrec, elem)]
+ item = {'_id': 0, 'name': 'my_item', 'largeImage': {'fileId': 'test_image_id'}}
+ with ProgressHelper( 'Superpixel Classification',
+ 'Test feature', False) as prog:
+ prog.progress(0)
+ prog.items(items)
+
+ annotation_name = 'testannotation'
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ base.predictLabelsForItem(
+ gc=gc,
+ annotationName=annotation_name,
+ tempdir=tmpdirname,
+ model=model,
+ item=item,
+ annotrec=annotrec,
+ elem=elem,
+ feature=feature,
+ curEpoch=0,
+ userId='user_id',
+ labels=labels,
+ groups=groups,
+ makeHeatmaps=False,
+ radius=-1,
+ magnification=40.0,
+ certainty='batchbald',
+ batchSize=NUM_IMAGES,
+ use_cuda = False,
+ prog=prog,
+ )
+ out_pth = os.path.join(tmpdirname, '%s Epoch 0 Predictions.anot' % annotation_name)
+ assert os.path.exists(out_pth), "Output file %s does not exist" % out_pth
+ with open(out_pth, 'r') as f:
+ pred_json = json.load(f)
+ e = pred_json['elements'][0]
+ assert len(e['values']) == NUM_IMAGES
+ for i in range(1, CUTOFF_IMAGES):
+ assert e['values'][-i] == 0, "Expected unknown/none label for cutoff images"
+ assert len(e['categories']) == len(labels)
+ assert len(e['user']['confidence']) == NUM_IMAGES
+ assert len(e['user']['categoryConfidence']) == NUM_IMAGES
+ assert len(e['user']['categoryConfidence'][0]) == len(labels)
+ assert len(e['user']['certainty']) == NUM_IMAGES
+ for i in range(1, CUTOFF_IMAGES):
+ assert e['user']['certainty'][-i] > 10000, "Expected certainty to be very high for unlabeled samples to ensure they occur last in the AL filmstrip (DSA)"
+ assert 'percentiles' in e['user']['certainty_info']
+ assert 'cdf' in e['user']['certainty_info']
+
+ validate_json_file(out_pth)
+
+ out_pth = os.path.join(tmpdirname, '%s Epoch 1.anot' % annotation_name)
+ assert os.path.exists(out_pth), "Output file %s does not exist" % out_pth
+ with open(out_pth, 'r') as f:
+ annotation_file = json.load(f)
+ e = annotation_file['elements'][0]
+ assert len(e['values']) == NUM_IMAGES
+ assert len(e['categories']) == len(labels)
+
+ validate_json_file(out_pth)
diff --git a/superpixel_classification/SuperpixelClassification/tests/test_tensorflow.py b/superpixel_classification/SuperpixelClassification/tests/test_tensorflow.py
new file mode 100644
index 0000000..1a40365
--- /dev/null
+++ b/superpixel_classification/SuperpixelClassification/tests/test_tensorflow.py
@@ -0,0 +1,93 @@
+import os
+import shutil
+import tempfile
+from unittest.mock import MagicMock
+
+import h5py
+import numpy as np
+import pytest
+
+# make pythonpath work out of the box - although your editor may complain
+import sys
+import os
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(os.path.dirname(SCRIPT_DIR))
+
+from SuperpixelClassificationBase import SuperpixelClassificationBase
+from SuperpixelClassificationTensorflow import SuperpixelClassificationTensorflow
+from progress_helper import ProgressHelper
+
+MNIST_IMAGE_SIZE=28
+COLOR_DIM = 3
+NUM_IMAGES = 64
+
+@pytest.fixture(scope="session")
+def create_sample_data():
+ global NUM_IMAGES
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ h5_path = os.path.join(tmpdirname, "test_data.h5")
+ images = np.random.randint(0, 255, size=(NUM_IMAGES, MNIST_IMAGE_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM), dtype=np.uint8)
+
+ with h5py.File(h5_path, 'w') as f:
+ f.create_dataset('images', data=images)
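+            # the last two superpixels are left out of used_indices; they are also marked unlabeled (value 0) in the test annotation below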
+ f.create_dataset('used_indices', data=np.arange(NUM_IMAGES - 2))
+
+        # we use yield so that the temporary directory is still open in the tests
+ yield h5_path
+
+def test_train_model(create_sample_data):
+ global NUM_IMAGES
+ h5_path = create_sample_data
+ base: SuperpixelClassificationBase
+ base = SuperpixelClassificationTensorflow()
+ base.feature_is_image = True
+ base.certainty = 'not batchbald' # same as using tensorflow
+
+ # Mock girder client
+ gc = MagicMock()
+ def mv_to_dst(_, dst):
+ return shutil.copy(h5_path, dst)
+ gc.downloadFile = MagicMock(side_effect=mv_to_dst)
+ def mv_to_src(_, src):
+ dst = os.path.dirname(os.path.dirname(h5_path))
+ return shutil.copy(src, dst)
+ gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value=True)
+
+ labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+ elem = {
+ 'girderId': 'test_girder_id',
+ 'categories': [
+ {"label": c} for c in labels
+ ],
+        'values':
+            np.random.randint(1, len(labels) - 1, size=(NUM_IMAGES - 2), dtype=np.uint8).tolist()
+            + [0, 0],  # last two images unlabeled
+ 'transform': {'matrix': [[1.0]]}
+ }
+
+ item = {'_id': 'test_h5_file', 'name': 'test'}
+ annotrec = {'_id': '1', '_version': 0, 'annotation': {'name': 'TorchTest'}}
+ items = [(item, annotrec, elem)]
+ with ProgressHelper( 'Superpixel Classification',
+ 'Test feature', False) as prog:
+ prog.progress(0)
+ prog.items(items)
+ modelFile, modelTrainingFile = base.trainModel(
+ annotationName="TorchTest",
+ batchSize = 4,
+ epochs = 1,
+ excludeLabelList = [],
+ features={'test_h5_file': {'_id': 'feature_id', 'name': 'test_h5_file'}},
+ gc=gc,
+ itemsAndAnnot=items,
+ labelList = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
+ modelFolderId="test_folder_id",
+ prog=prog,
+ randomInput = False,
+ trainingSplit = 0.5,
+ use_cuda = False,
+ )
+
+ assert os.path.exists(modelFile)
+ assert os.path.exists(modelTrainingFile)
\ No newline at end of file
diff --git a/superpixel_classification/SuperpixelClassification/tests/test_torch.py b/superpixel_classification/SuperpixelClassification/tests/test_torch.py
new file mode 100644
index 0000000..edb7dbc
--- /dev/null
+++ b/superpixel_classification/SuperpixelClassification/tests/test_torch.py
@@ -0,0 +1,94 @@
+import os
+import shutil
+import tempfile
+from unittest.mock import MagicMock
+
+import h5py
+import numpy as np
+import pytest
+
+# make pythonpath work out of the box - although your editor may complain
+import sys
+import os
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(os.path.dirname(SCRIPT_DIR))
+
+from SuperpixelClassificationBase import SuperpixelClassificationBase
+from SuperpixelClassificationTorch import SuperpixelClassificationTorch
+from progress_helper import ProgressHelper
+
+# currently, torch model only supports 100x100
+MNIST_IMAGE_SIZE=100
+COLOR_DIM = 3
+NUM_IMAGES = 64
+
+@pytest.fixture(scope="session")
+def create_sample_data():
+ global NUM_IMAGES
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ h5_path = os.path.join(tmpdirname, "test_data.h5")
+ images = np.random.randint(0, 255, size=(NUM_IMAGES, MNIST_IMAGE_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM), dtype=np.uint8)
+
+ with h5py.File(h5_path, 'w') as f:
+ f.create_dataset('images', data=images)
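+            # the last two superpixels are left out of used_indices; they are also marked unlabeled (value 0) in the test annotation below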
+ f.create_dataset('used_indices', data=np.arange(NUM_IMAGES - 2))
+
+        # we use yield so that the temporary directory is still open in the tests
+ yield h5_path
+
+def test_train_model(create_sample_data):
+ global NUM_IMAGES
+ h5_path = create_sample_data
+ base: SuperpixelClassificationBase
+ base = SuperpixelClassificationTorch()
+ base.feature_is_image = True
+ base.certainty = 'batchbald' # same as using torch
+
+ # Mock girder client
+ gc = MagicMock()
+ def mv_to_dst(_, dst):
+ return shutil.copy(h5_path, dst)
+ gc.downloadFile = MagicMock(side_effect=mv_to_dst)
+ def mv_to_src(_, src):
+ dst = os.path.dirname(os.path.dirname(h5_path))
+ return shutil.copy(src, dst)
+ gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value=True)
+
+ labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+ elem = {
+ 'girderId': 'test_girder_id',
+ 'categories': [
+ {"label": c} for c in labels
+ ],
+        'values':
+            np.random.randint(1, len(labels) - 1, size=(NUM_IMAGES - 2), dtype=np.uint8).tolist()
+            + [0, 0],  # last two images unlabeled
+ 'transform': {'matrix': [[1.0]]}
+ }
+
+ item = {'_id': 'test_h5_file', 'name': 'test'}
+ annotrec = {'_id': '1', '_version': 0, 'annotation': {'name': 'TorchTest'}}
+ items = [(item, annotrec, elem)]
+ with ProgressHelper( 'Superpixel Classification',
+ 'Test feature', False) as prog:
+ prog.progress(0)
+ prog.items(items)
+ modelFile, modelTrainingFile = base.trainModel(
+ annotationName="TorchTest",
+ batchSize = 4,
+ epochs = 1,
+ excludeLabelList = [],
+ features={'test_h5_file': {'_id': 'feature_id', 'name': 'test_h5_file'}},
+ gc=gc,
+ itemsAndAnnot=items,
+ labelList = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
+ modelFolderId="test_folder_id",
+ prog=prog,
+ randomInput = False,
+ trainingSplit = 0.5,
+ use_cuda = True,
+ )
+
+ assert os.path.exists(modelFile)
+ assert os.path.exists(modelTrainingFile)
diff --git a/superpixel_classification/SuperpixelClassification/tests/validate_json_annotation.py b/superpixel_classification/SuperpixelClassification/tests/validate_json_annotation.py
new file mode 100644
index 0000000..5f209ce
--- /dev/null
+++ b/superpixel_classification/SuperpixelClassification/tests/validate_json_annotation.py
@@ -0,0 +1,588 @@
+#!/usr/bin/env python
+'''
+This code is similar to girder_annotation/girder_large_image_annotation/models/annotation.py
+The purpose is to validate a JSON annotation file without having to use girder or large_image.
+'''
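+# Example usage: python validate_json_annotation.py --input out/superpixel.anot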
+import argparse
+import json
+import logging
+import os
+import sys
+import jsonschema
+from tqdm import tqdm
+
+import copy
+
+def extendSchema(base, add):
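+    # merge the 'add' schema into a deep copy of 'base': 'required' lists are unioned and 'properties' dicts are updated rather than replaced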
+ extend = copy.deepcopy(base)
+ for key in add:
+ if key == 'required' and 'required' in base:
+ extend[key] = sorted(set(extend[key]) | set(add[key]))
+ elif key != 'properties' and 'properties' in base:
+ extend[key] = add[key]
+ if 'properties' in add:
+ extend['properties'].update(add['properties'])
+ return extend
+
+
+colorSchema = {
+ 'type': 'string',
+ # We accept colors of the form
+ # #rrggbb six digit RRGGBB hex
+ # #rgb three digit RGB hex
+ # #rrggbbaa eight digit RRGGBBAA hex
+ # #rgba four digit RGBA hex
+ # rgb(255, 255, 255) rgb decimal triplet
+ # rgba(255, 255, 255, 1) rgba quad with RGB in the range [0-255] and
+ # alpha [0-1]
+ 'pattern': r'^(#([0-9a-fA-F]{3,4}|[0-9a-fA-F]{6}|[0-9a-fA-F]{8})|'
+ r'rgb\(\d+,\s*\d+,\s*\d+\)|'
+ r'rgba\(\d+,\s*\d+,\s*\d+,\s*(\d?\.|)\d+\))$',
+}
+
+transformArray = {
+ 'type': 'array',
+ 'items': {
+ 'type': 'array',
+ 'minItems': 2,
+ 'maxItems': 2,
+ },
+ 'minItems': 2,
+ 'maxItems': 2,
+ 'description': 'A 2D matrix representing the transform of an '
+ 'image overlay.',
+}
+
+
+colorRangeSchema = {
+ 'type': 'array',
+ 'items': colorSchema,
+ 'description': 'A list of colors',
+}
+
+rangeValueSchema = {
+ 'type': 'array',
+ 'items': {'type': 'number'},
+ 'description': 'A weakly monotonic list of range values',
+}
+
+userSchema = {
+ 'type': 'object',
+ 'additionalProperties': True,
+}
+
+labelSchema = {
+ 'type': 'object',
+ 'properties': {
+ 'value': {'type': 'string'},
+ 'visibility': {
+ 'type': 'string',
+ # TODO: change to True, False, None?
+ 'enum': ['hidden', 'always', 'onhover'],
+ },
+ 'fontSize': {
+ 'type': 'number',
+ 'exclusiveMinimum': 0,
+ },
+ 'color': colorSchema,
+ },
+ 'required': ['value'],
+ 'additionalProperties': False,
+}
+
+groupSchema = {'type': 'string'}
+
+baseElementSchema = {
+ 'type': 'object',
+ 'properties': {
+ 'id': {
+ 'type': 'string',
+ 'pattern': '^[0-9a-f]{24}$',
+ },
+ 'type': {'type': 'string'},
+ # schema free field for users to extend annotations
+ 'user': userSchema,
+ 'label': labelSchema,
+ 'group': groupSchema,
+ },
+ 'required': ['type'],
+ 'additionalProperties': True,
+}
+baseShapeSchema = extendSchema(baseElementSchema, {
+ 'properties': {
+ 'lineColor': colorSchema,
+ 'lineWidth': {
+ 'type': 'number',
+ 'minimum': 0,
+ },
+ },
+})
+
+
+pixelmapCategorySchema = {
+ 'type': 'object',
+ 'properties': {
+ 'fillColor': colorSchema,
+ 'strokeColor': colorSchema,
+ 'label': {
+ 'type': 'string',
+ 'description': 'A string representing the semantic '
+ 'meaning of regions of the map with '
+ 'the corresponding color.',
+ },
+ 'description': {
+ 'type': 'string',
+ 'description': 'A more detailed explanation of the '
+                           'meaning of this category.',
+ },
+ },
+ 'required': ['fillColor'],
+ 'additionalProperties': False,
+}
+
+_annotationSchema = {
+ 'type': 'object',
+ 'properties': {
+ 'value': colorSchema,
+ 'id': colorSchema,
+ 'label': {
+ 'type': 'string',
+ 'description': 'A string representing the semantic '
+ 'meaning of regions of the map with '
+ 'the corresponding color.',
+ },
+ 'description': {
+ 'type': 'string',
+ 'description': 'A more detailed explanation of the '
+                           'meaning of this category.',
+ },
+ },
+ 'required': ['fillColor'],
+ 'additionalProperties': False,
+}
+
+
+overlaySchema = extendSchema(baseElementSchema, {
+ 'properties': {
+ 'type': {
+ 'type': 'string',
+ 'enum': ['image'],
+ },
+ 'girderId': {
+ 'type': 'string',
+ 'pattern': '^[0-9a-f]{24}$',
+ 'description': 'Girder item ID containing the image to '
+ 'overlay.',
+ },
+ 'opacity': {
+ 'type': 'number',
+ 'minimum': 0,
+ 'maximum': 1,
+ 'description': 'Default opacity for this image overlay. Must '
+ 'be between 0 and 1. Defaults to 1.',
+ },
+ 'hasAlpha': {
+ 'type': 'boolean',
+ 'description':
+ 'If true, the image is treated assuming it has an alpha '
+ 'channel.',
+ },
+ 'transform': {
+ 'type': 'object',
+ 'description': 'Specification for an affine transform of the '
+ 'image overlay. Includes a 2D transform matrix, '
+ 'an X offset and a Y offset.',
+ 'properties': {
+ 'xoffset': {
+ 'type': 'number',
+ },
+ 'yoffset': {
+ 'type': 'number',
+ },
+ 'matrix': transformArray,
+ },
+ },
+ },
+ 'required': ['girderId', 'type'],
+ 'additionalProperties': False,
+ 'description': 'An image overlay on top of the base resource.',
+})
+
+
+pixelmapSchema = extendSchema(overlaySchema, {
+ 'properties': {
+ 'type': {
+ 'type': 'string',
+ 'enum': ['pixelmap'],
+ },
+ 'values': {
+ 'type': 'array',
+ 'items': {'type': 'integer'},
+ 'description': 'An array where the indices '
+ 'correspond to pixel values in the '
+ 'pixel map image and the values are '
+ 'used to look up the appropriate '
+ 'color in the categories property.',
+ },
+ 'categories': {
+ 'type': 'array',
+ 'items': pixelmapCategorySchema,
+ 'description': 'An array used to map between the '
+ 'values array and color values. '
+ 'Can also contain semantic '
+ 'information for color values.',
+ },
+ 'boundaries': {
+ 'type': 'boolean',
+ 'description': 'True if the pixelmap doubles pixel '
+ 'values such that even values are the '
+                           'fill and odd values are the stroke '
+ 'of each superpixel. If true, the '
+ 'length of the values array should be '
+ 'half of the maximum value in the '
+ 'pixelmap.',
+
+ },
+ },
+ 'required': ['values', 'categories', 'boundaries'],
+ 'additionalProperties': False,
+ 'description': 'A tiled pixelmap to overlay onto a base resource.',
+})
+
+bboxSchema = extendSchema(overlaySchema, {
+ 'properties': {
+ 'type': {
+ 'type': 'string',
+ 'enum': ['bboxmap'],
+ },
+ 'categories': {
+ 'type': 'array',
+ 'items': pixelmapCategorySchema,
+ 'description': 'An array used to map between the '
+ 'values array and color values. '
+ 'Can also contain semantic '
+ 'information for color values.',
+ },
+ 'annotations': {
+ 'type': 'array',
+ 'description': 'Value, id, and bounding box for each annotation',
+ 'items': {
+ 'type': 'object',
+ 'additionalProperties': False,
+ 'properties': {
+ 'value': {
+ 'type': 'integer',
+ },
+ 'id': {
+ 'type': 'integer',
+ },
+ 'bbox': {
+ 'type': 'array',
+ 'items': {'type': 'number'},
+ 'minItems': 4,
+ 'maxItems': 4,
+ 'description': 'Bounding box in the form '
+ '[left, top, right, bottom].',
+ },
+ }
+ }
+ },
+ 'boundaries': {
+ 'type': 'boolean',
+ 'description': 'True if the pixelmap doubles pixel '
+ 'values such that even values are the '
+                           'fill and odd values are the stroke '
+ 'of each superpixel. If true, the '
+ 'length of the values array should be '
+ 'half of the maximum value in the '
+ 'pixelmap.',
+
+ },
+ },
+ 'required': ['categories', 'boundaries', 'annotations'],
+ 'additionalProperties': True,
+ 'description': 'A tiled pixelmap to overlay onto a base resource.',
+})
+
+annotationElementSchema = {
+ # Shape subtypes are mutually exclusive, so for efficiency, don't use
+ # 'oneOf'
+ 'anyOf': [
+ pixelmapSchema,
+ bboxSchema,
+ ],
+}
+
+
+class AnnotationSchema:
+ annotationSchema = {
+ '$schema': 'http://json-schema.org/schema#',
+ 'type': 'object',
+ 'properties': {
+ 'name': {
+ 'type': 'string',
+ # TODO: Disallow empty?
+ 'minLength': 1,
+ },
+ 'description': {'type': 'string'},
+ 'display': {
+ 'type': 'object',
+ 'properties': {
+ 'visible': {
+ 'type': ['boolean', 'string'],
+ 'enum': ['new', True, False],
+ 'description': 'This advises viewers on when the '
+ 'annotation should be shown. If "new" (the default), '
+ 'show the annotation when it is first added to the '
+ "system. If false, don't show the annotation by "
+ 'default. If true, show the annotation when the item '
+ 'is displayed.',
+ },
+ },
+ },
+ 'attributes': {
+ 'type': 'object',
+ 'additionalProperties': True,
+ 'title': 'Image Attributes',
+ 'description': 'Subjective things that apply to the entire '
+ 'image.',
+ },
+ 'elements': {
+ 'type': 'array',
+ 'items': annotationElementSchema,
+ # We want to ensure unique element IDs, if they are set. If
+ # they are not set, we assign them from Mongo.
+ 'title': 'Image Markup',
+ 'description': 'Subjective things that apply to a '
+ 'spatial region.',
+ },
+ },
+ 'additionalProperties': False,
+ }
+
+
+
+ coordSchema = {
+ 'type': 'array',
+ # TODO: validate that z==0 for now
+ 'items': {
+ 'type': 'number',
+ },
+ 'minItems': 3,
+ 'maxItems': 3,
+ 'name': 'Coordinate',
+ # TODO: define origin for 3D images
+ 'description': 'An X, Y, Z coordinate tuple, in base layer pixel '
+ 'coordinates, where the origin is the upper-left.',
+ }
+ coordValueSchema = {
+ 'type': 'array',
+ 'items': {
+ 'type': 'number',
+ },
+ 'minItems': 4,
+ 'maxItems': 4,
+ 'name': 'CoordinateWithValue',
+ 'description': 'An X, Y, Z, value coordinate tuple, in base layer '
+ 'pixel coordinates, where the origin is the upper-left.',
+ }
+
+ colorSchema = {
+ 'type': 'string',
+ # We accept colors of the form
+ # #rrggbb six digit RRGGBB hex
+ # #rgb three digit RGB hex
+ # #rrggbbaa eight digit RRGGBBAA hex
+ # #rgba four digit RGBA hex
+ # rgb(255, 255, 255) rgb decimal triplet
+ # rgba(255, 255, 255, 1) rgba quad with RGB in the range [0-255] and
+ # alpha [0-1]
+ # TODO: make rgb and rgba spec validate that rgb is [0-255] and a is
+ # [0-1], rather than just checking if they are digits and such.
+ 'pattern': r'^(#([0-9a-fA-F]{3,4}|[0-9a-fA-F]{6}|[0-9a-fA-F]{8})|'
+ r'rgb\(\d+,\s*\d+,\s*\d+\)|'
+ r'rgba\(\d+,\s*\d+,\s*\d+,\s*(\d?\.|)\d+\))$',
+ }
+
+ colorRangeSchema = {
+ 'type': 'array',
+ 'items': colorSchema,
+ 'description': 'A list of colors',
+ }
+
+ rangeValueSchema = {
+ 'type': 'array',
+ 'items': {'type': 'number'},
+ 'description': 'A weakly monotonic list of range values',
+ }
+
+ userSchema = {
+ 'type': 'object',
+ 'additionalProperties': True,
+ }
+
+ labelSchema = {
+ 'type': 'object',
+ 'properties': {
+ 'value': {'type': 'string'},
+ 'visibility': {
+ 'type': 'string',
+ # TODO: change to True, False, None?
+ 'enum': ['hidden', 'always', 'onhover'],
+ },
+ 'fontSize': {
+ 'type': 'number',
+ 'exclusiveMinimum': 0,
+ },
+ 'color': colorSchema,
+ },
+ 'required': ['value'],
+ 'additionalProperties': False,
+ }
+
+ groupSchema = {'type': 'string'}
+
+ baseElementSchema = {
+ 'type': 'object',
+ 'properties': {
+ 'id': {
+ 'type': 'string',
+ 'pattern': '^[0-9a-f]{24}$',
+ },
+ 'type': {'type': 'string'},
+ # schema free field for users to extend annotations
+ 'user': userSchema,
+ 'label': labelSchema,
+ 'group': groupSchema,
+ },
+ 'required': ['type'],
+ 'additionalProperties': True,
+ }
+ baseShapeSchema = extendSchema(baseElementSchema, {
+ 'properties': {
+ 'lineColor': colorSchema,
+ 'lineWidth': {
+ 'type': 'number',
+ 'minimum': 0,
+ },
+ },
+ })
+
+ pointShapeSchema = extendSchema(baseShapeSchema, {
+ 'properties': {
+ 'type': {
+ 'type': 'string',
+ 'enum': ['point'],
+ },
+ 'center': coordSchema,
+ 'fillColor': colorSchema,
+ },
+ 'required': ['type', 'center'],
+ 'additionalProperties': False,
+ })
+
+ arrowShapeSchema = extendSchema(baseShapeSchema, {
+ 'properties': {
+ 'type': {
+ 'type': 'string',
+ 'enum': ['arrow'],
+ },
+ 'points': {
+ 'type': 'array',
+ 'items': coordSchema,
+ 'minItems': 2,
+ 'maxItems': 2,
+ },
+ 'fillColor': colorSchema,
+ },
+ 'description': 'The first point is the head of the arrow',
+ 'required': ['type', 'points'],
+ 'additionalProperties': False,
+ })
+
+ circleShapeSchema = extendSchema(baseShapeSchema, {
+ 'properties': {
+ 'type': {
+ 'type': 'string',
+ 'enum': ['circle'],
+ },
+ 'center': coordSchema,
+ 'radius': {
+ 'type': 'number',
+ 'minimum': 0,
+ },
+ 'fillColor': colorSchema,
+ },
+ 'required': ['type', 'center', 'radius'],
+ 'additionalProperties': False,
+ })
+
+ polylineShapeSchema = extendSchema(baseShapeSchema, {
+ 'properties': {
+ 'type': {
+ 'type': 'string',
+ 'enum': ['polyline'],
+ },
+ 'points': {
+ 'type': 'array',
+ 'items': coordSchema,
+ 'minItems': 2,
+ },
+ 'fillColor': colorSchema,
+ 'closed': {
+ 'type': 'boolean',
+ 'description': 'polyline is open if closed flag is '
+ 'not specified',
+ },
+ 'holes': {
+ 'type': 'array',
+ 'description':
+ 'If closed is true, this is a list of polylines that are '
+ 'treated as holes in the base polygon. These should not '
+ 'cross each other and should be contained within the base '
+ 'polygon.',
+ 'items': {
+ 'type': 'array',
+ 'items': coordSchema,
+ 'minItems': 3,
+ },
+ },
+ },
+ 'required': ['type', 'points'],
+ 'additionalProperties': False,
+ })
+
+
+def validate_annotation(annotation_dict):
+ validator = jsonschema.Draft6Validator(AnnotationSchema.annotationSchema)
+ validatorElement = jsonschema.Draft6Validator(AnnotationSchema.baseElementSchema)
+
+ validator.validate(annotation_dict)
+ for element in tqdm(annotation_dict['elements']):
+ validatorElement.validate(element)
+
+def validate_json_file(json_dst):
+ with open(json_dst, 'r') as f:
+ data = json.load(f)
+ validate_annotation(data)
+ # num_elem = len(data['elements'][0]['annotations'])
+ # if num_elem % 4 != 0:
+ # raise ValueError(f"Number of elements ({num_elem}) is not a multiple of 4")
+ # num_values = len(data['elements'][0]['annotations'])
+ # if int(num_elem / 4) != num_values:
+ # raise ValueError(f"Number of elements ({num_elem / 4}) does not match values ({num_values})")
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description='Validate a json annotation file')
+ parser.add_argument('--input', default=os.path.join("out", "superpixel.anot"), type=str,
+                        help='Name of input json file with a pixelmap annotation')
+ args = parser.parse_args()
+ # Call the function with the filenames
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+ if not os.path.exists(args.input):
+ logging.error(f"Annotation path {args.input} does not exist")
+ sys.exit(1)
+
+ validate_json_file(args.input)
+ logging.info("Done validating annotation ['%s']", args.input)
diff --git a/tools/inspect_image_feature_file.py b/tools/inspect_image_feature_file.py
new file mode 100644
index 0000000..a93d911
--- /dev/null
+++ b/tools/inspect_image_feature_file.py
@@ -0,0 +1,37 @@
+'''
+This script will open a feature file (.h5) and show a 3x3 grid of images.
+This tool is useful if you suspect that features are not extracted properly, for example due to erroneous mask values/indexing.
+'''
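+# Example usage: python tools/inspect_image_feature_file.py features.h5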
+
+import h5py
+import matplotlib.pyplot as plt
+import numpy as np
+import sys
+
+if len(sys.argv) > 1:
+ feature_file = sys.argv[1]
+else:
+ feature_file = "features.h5"
+
+# open the file
+with h5py.File(feature_file, "r") as f:
+ # get the images dataset
+ images = f["images"]
+ # get the first 9 images
+ images = images[:9]
+ # reshape the images to 3x3
+ #images = np.reshape(images, (3,3,100,100,3))
+ # transpose the images to 3x3
+ #images = np.transpose(images, (0,2,1,3,4))
+ # flatten the images to 9x100x100x3
+ #images = np.reshape(images, (9,100,100,3))
+
+    # plot the images in a 3x3 grid, hiding the axes of each subplot
+    for i in range(9):
+        plt.subplot(3, 3, i + 1)
+        plt.imshow(images[i])
+        plt.axis('off')
+        print(f"Image {i+1} is {images[i].shape}")
+    plt.show()