diff --git a/.dockerignore b/.dockerignore index 7797741..96a401f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,3 +1,5 @@ +**/tmp* +test_data .ruff_cache .tox *.egg-info diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3907571 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +**/benchmark_results diff --git a/superpixel_classification/SuperpixelClassification/SuperpixelClassification.xml b/superpixel_classification/SuperpixelClassification/SuperpixelClassification.xml index 38c7b77..4767aa9 100644 --- a/superpixel_classification/SuperpixelClassification/SuperpixelClassification.xml +++ b/superpixel_classification/SuperpixelClassification/SuperpixelClassification.xml @@ -61,7 +61,7 @@ Superpixel parameters gensuperpixels - generate-superpxiels + generate-superpixels If an image does not have an annotation with superpixels, generate one true @@ -100,6 +100,13 @@ true + + useCuda + usecuda + Whether or not to use GPU/cuda (true) or cpu (false). + + false + batchSize batchsize @@ -198,5 +205,12 @@ 4 The number of worker threads for superpixel and feature generation + + cutoff + cutoff + + 500 + Number of unannotated superpixels to use per slide for features, training and predictions + diff --git a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationBase.py b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationBase.py index cd82ded..a9d1353 100644 --- a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationBase.py +++ b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationBase.py @@ -204,7 +204,7 @@ def progCallback(step, count, total): print('Create superpixels for %s' % item['name']) imagePath = os.path.join(tempdir, item['name']) gc.downloadFile(item['largeImage']['fileId'], imagePath) - outImagePath = os.path.join(tempdir, 'superpixel.tiff') + outImagePath = os.path.join(tempdir, '%s.pixelmap.tiff' % item['name']) outAnnotationPath = os.path.join(tempdir, '%s.anot' % annotationName) if True: @@ -332,7 +332,7 @@ def createFeatureListFromPatchAndMaskList(self, patch_list, mask_list, maskvals_ ) return feature_list - def createFeaturesForItem(self, gc, item, elem, featureFolderId, fileName, patchSize, prog): + def createFeaturesForItem(self, gc, item, elem, featureFolderId, fileName, patchSize, prog, cutoff): import large_image print('Create feature', fileName) @@ -349,17 +349,35 @@ def createFeaturesForItem(self, gc, item, elem, featureFolderId, fileName, patch gc.downloadFile(maskItem['largeImage']['fileId'], maskPath) tsMask = large_image.open(maskPath) + num_values = len(elem['values']) + labeled_samples = set([i for i, x in enumerate(elem['values']) if x > 0]) + # background is used if we have a bounding box of 1 pixel in top left corner that is unlabeled. We do not want to extract features for that + has_background = elem['user']['bbox'][:4] == [0,0,1,1] + start_index = 1 if has_background else 0 + unlabeled_samples = [i for i, x in enumerate(elem['values'][start_index:], start=start_index) if x == 0] + + if num_values - len(labeled_samples) > cutoff: + # only select a subset of unlabeled samples, i.e., prune the feature list + random.shuffle(unlabeled_samples) + unlabeled_samples = unlabeled_samples[:cutoff] + indices = list(sorted(list(labeled_samples) + unlabeled_samples)) + with h5py.File(filePath, 'w') as fptr: batch_size = 1024 # TODO: Is this the best value? 
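+            # `indices` (built above) holds every labeled superpixel plus at most `cutoff`
+            # randomly chosen unlabeled ones; the batching below iterates over these indices
+            # instead of over all of elem['values'], and the selected positions are recorded
+            # in the 'used_indices' dataset written after the loop so later stages can map
+            # stored features back to their superpixels.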
- for batch_start in range(0, len(elem['values']), batch_size): - batch_list = elem['values'][batch_start: batch_start + batch_size] + total_size = len(indices) + for batch_start in range(0, total_size, batch_size): + #batch_list = elem['values'][batch_start: batch_start + batch_size] + batch_list = indices[batch_start: batch_start + batch_size] patch_list = [] mask_list = [] maskvals_list = [] - for idx, _ in enumerate(batch_list, start=batch_start): - prog.item_progress(item, 0.9 * idx / len(elem['values'])) - bbox = elem['user']['bbox'][idx * 4: idx * 4 + 4] + + for idx, i in enumerate(batch_list, start=batch_start): + prog.item_progress(item, 0.9 * idx / total_size) + bbox = elem['user']['bbox'][i * 4: i * 4 + 4] # use masked superpixel + if len(bbox) < 4: + pass patch = ts.getRegion( region=dict( left=int(bbox[0]), top=int(bbox[1]), @@ -384,7 +402,7 @@ def createFeaturesForItem(self, gc, item, elem, featureFolderId, fileName, patch if mask.shape[2] == 4: mask = mask[:, :, :-1] maskvals = [[val % 256, val // 256 % 256, val // 65536 % 256] - for val in [idx * 2, idx * 2 + 1]] + for val in [(i + 1) * 2, (i + 1) * 2 + 1]] patch_list.append(patch) mask_list.append(mask) maskvals_list.append(maskvals) @@ -409,6 +427,8 @@ def createFeaturesForItem(self, gc, item, elem, featureFolderId, fileName, patch (time.time() - starttime)), item['name']) del batch_list, patch_list, mask_list, maskvals_list, feature_list + used_indices_ds = fptr.create_dataset( + 'used_indices', data=np.array(indices), dtype='i') print(ds.shape, len(elem['values']), '%5.3f' % (time.time() - starttime), item['name']) prog.item_progress(item, 0.9) @@ -418,30 +438,38 @@ def createFeaturesForItem(self, gc, item, elem, featureFolderId, fileName, patch prog.item_progress(item, 1) return file - def createFeatures(self, gc, folderId, annotationName, featureFolderId, patchSize, numWorkers, - prog): - itemsAndAnnot = self.getItemsAndAnnotations(gc, folderId, annotationName) + def createFeatures(self, gc, folderId, annotationName, itemsAndAnnot, featureFolderId, patchSize, numWorkers, + prog, cutoff): + # itemsAndAnnot = self.getItemsAndAnnotations(gc, folderId, annotationName) prog.message('Creating features') prog.progress(0) prog.items([item for item, _, _ in itemsAndAnnot]) results = {} futures = [] + featureFiles = [ + f for item in gc.listItem(featureFolderId) for f in gc.listFile(item['_id']) + ] with concurrent.futures.ThreadPoolExecutor(max_workers=numWorkers) as executor: for item, _, elem in itemsAndAnnot: - bbox = elem['user']['bbox'] - hashval = repr(dict( - itemId=item['_id'], bbox=[int(v) for v in bbox], patchSize=patchSize)) - hashval = hashlib.new('sha256', hashval.encode()).hexdigest() - fileName = 'feature-%s.h5' % (hashval) - found = False - for existing in gc.listItem(featureFolderId, name=fileName): - results[item['_id']] = next(gc.listFile(existing['_id'], limit=1)) - found = True - break - if not found: - futures.append((item, executor.submit( - self.createFeaturesForItem, gc, item, elem, featureFolderId, fileName, - patchSize, prog))) + match = [ + f for f in featureFiles if + re.match('^%s.*[.]feature.h5$' % re.escape(item['name']), f['name']) + ] + if len(match): + results[item['_id']] = match[0] + else: # fallback to hash-based naming - generate features if necessary + bbox = elem['user']['bbox'] + hashval = repr(dict( + itemId=item['_id'], bbox=[int(v) for v in bbox], patchSize=patchSize)) + hashval = hashlib.new('sha256', hashval.encode()).hexdigest() + fileName = 'feature-%s.h5' % (hashval) + match = 
[f for f in featureFiles if f['name'] == fileName] + if len(match): + results[item['_id']] = match[0] + else: + futures.append((item, executor.submit( + self.createFeaturesForItem, gc, item, elem, featureFolderId, + '%s.feature.h5' % (item['name']), patchSize, prog, cutoff))) for item, future in futures: file = future.result() try: @@ -461,12 +489,20 @@ def trainModelAddItem(self, gc, record, item, annotrec, elem, feature, item['name'], annotrec['annotation']['name'], annotrec['_id'], annotrec['_version'])) featurePath = os.path.join(record['tempdir'], feature['name']) gc.downloadFile(feature['_id'], featurePath) + print(f"Downloaded '{feature['_id']}' to '{featurePath}'") with h5py.File(featurePath, 'r') as ffptr: fds = ffptr['images'] - for idx, labelnum in enumerate(elem['values']): - if labelnum and labelnum < len(elem['categories']): + if 'used_indices' in ffptr: + indices = ffptr['used_indices'] + else: + indices = range(len(elem['values'])) + skipped_excluded = 0 + for i,idx in enumerate(indices): + labelnum = elem['values'][idx] + if 0 < labelnum < len(elem['categories']): labelname = elem['categories'][labelnum]['label'] if labelname in excludeLabelList: + skipped_excluded += 1 continue if labelname not in record['groups']: record['groups'][labelname] = elem['categories'][labelnum] @@ -475,7 +511,7 @@ def trainModelAddItem(self, gc, record, item, annotrec, elem, feature, labelname = labelList[labelnum - 1] else: continue - patch = fds[idx] + patch = fds[i] if not record['ds']: record['ds'] = record['fptr'].create_dataset( 'images', (1,) + patch.shape, maxshape=(None,) + patch.shape, @@ -494,11 +530,11 @@ def trainModelAddItem(self, gc, record, item, annotrec, elem, feature, record['lastlog'] = time.time() print(record['ds'].shape, record['counts'], '%5.3f' % (time.time() - record['starttime'])) + print(f"Skipped {skipped_excluded} samples with labels that were excluded") - def trainModel(self, gc, folderId, annotationName, features, modelFolderId, + def trainModel(self, gc, annotationName, itemsAndAnnot, features, modelFolderId, batchSize, epochs, trainingSplit, randomInput, labelList, - excludeLabelList, prog): - itemsAndAnnot = self.getItemsAndAnnotations(gc, folderId, annotationName) + excludeLabelList, use_cuda, prog): with tempfile.TemporaryDirectory(dir=os.getcwd()) as tempdir: trainingPath = os.path.join(tempdir, 'training.h5') with h5py.File(trainingPath, 'w') as fptr: @@ -526,7 +562,7 @@ def trainModel(self, gc, folderId, annotationName, features, modelFolderId, prog.progress(1) if not record['ds']: print('No labeled data') - return + return None, None record['labelds'] = fptr.create_dataset( 'labels', (len(record['labelvals']),), dtype=int) record['labelds'] = np.array(record['labelvals'], dtype=int) @@ -536,7 +572,7 @@ def trainModel(self, gc, folderId, annotationName, features, modelFolderId, prog.progress(0) history, modelPath = self.trainModelDetails( record, annotationName, batchSize, epochs, itemsAndAnnot, prog, tempdir, - trainingSplit) + trainingSplit, use_cuda) modTrainingPath = os.path.join(tempdir, '%s ModTraining Epoch %d.h5' % ( annotationName, self.getCurrentEpoch(itemsAndAnnot))) @@ -551,16 +587,16 @@ def trainModel(self, gc, folderId, annotationName, features, modelFolderId, for attempt in tenacity.Retrying(stop=tenacity.stop_after_attempt(self.uploadRetries)): with attempt: modelFile = gc.uploadFileToFolder(modelFolderId, modelPath) - print('Saved model') + print(f'Saved model to {modelFolderId}') for attempt in 
tenacity.Retrying(stop=tenacity.stop_after_attempt(self.uploadRetries)): with attempt: modTrainingFile = gc.uploadFileToFolder(modelFolderId, modTrainingPath) - print('Saved modTraining') + print(f'Saved modTraining to {modelFolderId}') return modelFile, modTrainingFile - def predictLabelsForItem(self, gc, annotationName, annotationFolderId, tempdir, model, item, + def predictLabelsForItem(self, gc, annotationName, tempdir, model, item, annotrec, elem, feature, curEpoch, userId, labels, groups, - makeHeatmaps, radius, magnification, certainty, batchSize, prog): + makeHeatmaps, radius, magnification, certainty, batchSize, use_cuda, prog): import al_bench.factory print('Predicting %s' % (item['name'])) @@ -571,6 +607,8 @@ def predictLabelsForItem(self, gc, annotationName, annotationFolderId, tempdir, # Figure out which samples are already labeled labeled_samples: NDArray[np.int_] = np.nonzero(np.array(elem['values'])) + number_annotations = len(elem['values']) + tiny = np.finfo(np.float32).tiny print(f'{labeled_samples = }') print(f'certainty_type = {certainty!r}') @@ -581,9 +619,17 @@ def predictLabelsForItem(self, gc, annotationName, annotationFolderId, tempdir, # In case we are computing batchbald compCertainty.set_batchbald_num_samples(16) compCertainty.set_batchbald_batch_size(100) - compCertainty.set_batchbald_excluded_samples(labeled_samples) + #compCertainty.set_batchbald_excluded_samples(labeled_samples) with h5py.File(featurePath, 'r') as ffptr: + if 'used_indices' in ffptr: + used_indices = set(list(ffptr['used_indices'])) + else: + used_indices = set(range(number_annotations)) + all_indices = set(range(number_annotations)) + unused_indices = list(sorted(all_indices.difference(used_indices))) + compCertainty.set_batchbald_excluded_samples(np.array(unused_indices)) + prog.item_progress(item, 0) # Create predicted annotation annot = copy.deepcopy(annotrec) @@ -592,21 +638,29 @@ def predictLabelsForItem(self, gc, annotationName, annotationFolderId, tempdir, annot['elements'][0]['categories'] = [groups[key] for key in labels] ds = ffptr['images'] prog.item_progress(item, 0.05) - catWeights, predictions = self.predictLabelsForItemDetails( - batchSize, ds, item, model, prog) - catWeights = np.array(catWeights) - predictions = np.array(predictions) + _catWeights, _predictions, indices = self.predictLabelsForItemDetails( + batchSize, ds, np.array(list(used_indices), dtype=np.int64), item, model, use_cuda, prog) + # expand catWeights and predictions to be length of elem['values'] instead of just `cutoff` samples + # then copy in results from predictions + catWeights = np.zeros((number_annotations,) + _catWeights.shape[1:], dtype=np.float32 if str(_catWeights.dtype).endswith("32") else np.float64) + predictions = np.zeros((number_annotations,) + _predictions.shape[1:], dtype=np.float32 if str(_predictions.dtype).endswith("32") else np.float64) + for cw,p,idx in zip(_catWeights, _predictions, indices): + catWeights[idx] = cw + predictions[idx] = p + print_fully('predictions', predictions) prog.item_progress(item, 0.7) # compCertainty needs catWeights to have shape (num_superpixels, # bayesian_samples, num_classes) if 'batchbald' is selected, otherwise the # shape should be (num_superpixels, num_classes). 
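+        # (for the torch batchbald path this is (num_superpixels, bayesian_samples, num_classes),
+        # e.g. 12 MC-dropout samples for the Bayesian vector model shown in the torch backend;
+        # the tensorflow path yields (num_superpixels, num_classes)); adding `tiny` below keeps
+        # the all-zero rows left for unused indices away from exact zero, presumably to avoid
+        # degenerate values inside the certainty computation.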
- print_fully('catWeights', catWeights) # Ask compCertainty to compute certainties - cert = compCertainty.from_numpy_array(catWeights) + cert = compCertainty.from_numpy_array(catWeights + tiny) + print_fully('catWeights', catWeights) + # After the call to compCertainty, those numbers that end up as values for # annot's keys 'values', 'confidence', 'categoryConfidence', and 'certainty' # should have shape (num_superpixels, num_classes). + print_fully('cert', cert) scores = cert[certainty]['scores'] print_fully('scores', scores) @@ -617,14 +671,28 @@ epsilon = 1e-50 predictions = np.log(catWeights + epsilon) cats = np.argmax(catWeights, axis=-1) - indices = np.arange(cats.shape[0]) - conf = catWeights[indices, cats[indices]] + # 0 means we didn't make a prediction, so increment by one + #cats[indices] += 1 + conf = catWeights[list(all_indices), cats[np.arange(cats.shape[0])]] print_fully('cats', cats) print_fully('conf', conf) + # give unused_indices the highest possible confidence so that they show up last in the active learning UI + # (because it sorts by confidence in descending order) + scores[unused_indices] = np.finfo(scores.dtype).max + # additionally, ensure that labels that are already labeled also end up last or late in the recommendations + # for the DSA UI, this prevents labeled samples from being shown again to the user + scores[labeled_samples] = np.finfo(scores.dtype).max + + cats = cats.tolist() conf = conf.tolist() - # Should this be from predictions for from catWeights?!!! + + # Should this be from predictions or from catWeights?!!!
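+            # replace any -inf entries in predictions with the smallest finite value for the
+            # dtype so the per-category confidences stored on the annotation below stay finite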
+ predictions[np.isneginf(predictions)] = np.finfo(predictions.dtype).min catConf = predictions.tolist() scores = scores.tolist() annot['elements'][0]['values'] = cats @@ -761,10 +829,10 @@ def makeHeatmapsForItem(self, gc, annotationName, userId, tempdir, radius, item, 'fileId': item['largeImage']['fileId'], 'userId': userId})) - def predictLabels(self, gc, folderId, annotationName, features, modelFolderId, + def predictLabels(self, gc, folderId, annotationName, itemsAndAnnot, features, modelFolderId, annotationFolderId, saliencyMaps, radius, magnification, - certainty, batchSize, prog): - itemsAndAnnot = self.getItemsAndAnnotations(gc, folderId, annotationName) + certainty, batchSize, use_cuda, prog): + #itemsAndAnnot = self.getItemsAndAnnotations(gc, folderId, annotationName) curEpoch = self.getCurrentEpoch(itemsAndAnnot) folder = gc.getFolder(folderId) userId = folder['creatorId'] @@ -779,7 +847,7 @@ def predictLabels(self, gc, folderId, annotationName, features, modelFolderId, modelFile = next(gc.listFile(item['_id'], limit=1)) break if not modelFile: - print('No model file found') + print(f'No model file found in {modelFolderId}') return print(modelFile['name'], item) modelPath = os.path.join(tempdir, modelFile['name']) @@ -792,7 +860,7 @@ def predictLabels(self, gc, folderId, annotationName, features, modelFolderId, modTrainingFile = next(gc.listFile(item['_id'], limit=1)) break if not modTrainingFile: - print('No modTraining file found') + print(f'No modTraining file found in {modelFolderId}') return print(modTrainingFile['name'], item) modTrainingPath = os.path.join(tempdir, modTrainingFile['name']) @@ -823,20 +891,26 @@ def predictLabels(self, gc, folderId, annotationName, features, modelFolderId, if item['_id'] not in features: continue self.predictLabelsForItem( - gc, annotationName, annotationFolderId, tempdir, model, item, annotrec, elem, + gc, annotationName, tempdir, model, item, annotrec, elem, features.get(item['_id']), curEpoch, userId, labels, groups, saliencyMaps, - radius, magnification, certainty, batchSize, prog) + radius, magnification, certainty, batchSize, use_cuda, prog) prog.progress(1) - def main(self, args): + def main(self, args, gc = None): self.feature_is_image = args.feature != 'vector' self.certainty = args.certainty print('\n>> CLI Parameters ...\n') pprint.pprint(vars(args)) - gc = girder_client.GirderClient(apiUrl=args.girderApiUrl) - gc.token = args.girderToken + if gc is None: + gc = girder_client.GirderClient(apiUrl=args.girderApiUrl) + gc.token = args.girderToken + gc.authenticate('admin', 'password') + + # check to make sure we have access to server + if not [x for x in list(gc.listCollection()) if x['name'] == 'Active Learning']: + raise Exception("Unable to authenticate with girder") with ProgressHelper( 'Superpixel Classification', 'Superpixel classification', args.progress) as prog: @@ -845,16 +919,24 @@ def main(self, args): gc, args.images, args.annotationName, args.radius, args.magnification, args.annotationDir, args.numWorkers, prog) + itemsAndAnnot = self.getItemsAndAnnotations(gc, args.images, args.annotationName) + print("Creating features...") features = self.createFeatures( - gc, args.images, args.annotationName, args.features, args.patchSize, - args.numWorkers, prog) + gc, args.images, args.annotationName, itemsAndAnnot, args.features, args.patchSize, + args.numWorkers, prog, args.cutoff) + print("Done creating features...") if args.train: + print("Training...") self.trainModel( - gc, args.images, args.annotationName, features, 
args.modeldir, args.batchSize, - args.epochs, args.split, args.randominput, args.labels, args.exclude, prog) + gc, args.annotationName, itemsAndAnnot, features, args.modeldir, args.batchSize, + args.epochs, args.split, args.randominput, args.labels, args.exclude, args.useCuda, prog) + print("Done training...") + print("Predicting labels...") self.predictLabels( - gc, args.images, args.annotationName, features, args.modeldir, args.annotationDir, - args.heatmaps, args.radius, args.magnification, args.certainty, args.batchSize, + gc, args.images, args.annotationName, itemsAndAnnot, features, args.modeldir, args.annotationDir, + args.heatmaps, args.radius, args.magnification, args.certainty, args.batchSize, args.useCuda, prog) + print("Done predicting labels...") + print("Done, exiting") diff --git a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTensorflow.py b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTensorflow.py index 0af02d8..e50cd8a 100644 --- a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTensorflow.py +++ b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTensorflow.py @@ -3,6 +3,7 @@ from typing import Optional import h5py +import numpy as np import tensorflow as tf from SuperpixelClassificationBase import SuperpixelClassificationBase @@ -35,33 +36,56 @@ class SuperpixelClassificationTensorflow(SuperpixelClassificationBase): def __init__(self): self.training_optimal_batchsize: Optional[int] = None self.prediction_optimal_batchsize: Optional[int] = None + self.use_cuda = False def trainModelDetails(self, record, annotationName, batchSize, epochs, itemsAndAnnot, prog, - tempdir, trainingSplit): - # print(f'Tensorflow trainModelDetails(batchSize={batchSize}, ...)') - # make model - num_classes = len(record['labels']) - model = tf.keras.Sequential([ - tf.keras.layers.Rescaling(1.0 / 255), - tf.keras.layers.Conv2D(16, 3, padding='same', activation='relu'), - tf.keras.layers.MaxPooling2D(), - tf.keras.layers.Conv2D(32, 3, padding='same', activation='relu'), - tf.keras.layers.MaxPooling2D(), - tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu'), - tf.keras.layers.MaxPooling2D(), - tf.keras.layers.Flatten(), - # tf.keras.layers.Dropout(0.2), - tf.keras.layers.Dense(128, activation='relu'), - tf.keras.layers.Dense(num_classes)]) - prog.progress(0.2) - model.compile(optimizer='adam', - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), - metrics=['accuracy']) + tempdir, trainingSplit, use_cuda): + self.use_cuda = use_cuda + + # Enable GPU memory growth globally to avoid precondition errors + gpus = tf.config.list_physical_devices('GPU') + if gpus and self.use_cuda: + try: + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + except RuntimeError as e: + print(f"Could not set memory growth: {e}") + if not self.use_cuda: + tf.config.set_visible_devices([], 'GPU') + device = "gpu" if use_cuda else "cpu" + print(f"Using device: {device}") + + # Dataset preparation (outside strategy scope) + ds_h5 = record['ds'] + labelds_h5 = record['labelds'] + # Fully load to memory and break h5py reference + ds_numpy = np.array(ds_h5[:]) + labelds_numpy = np.array(labelds_h5[:]) + + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + num_classes = len(record['labels']) + model = tf.keras.Sequential([ + tf.keras.layers.Rescaling(1.0 / 255), + tf.keras.layers.Conv2D(16, 3, padding='same', activation='relu'), + 
tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Conv2D(32, 3, padding='same', activation='relu'), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu'), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dense(num_classes)]) + prog.progress(0.2) + model.compile(optimizer='adam', + loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=['accuracy']) + prog.progress(0.7) - # generate split - full_ds = tf.data.Dataset.from_tensor_slices((record['ds'], record['labelds'])) - full_ds = full_ds.shuffle(1000) # add seed=123 ? - count = len(full_ds) + # generate split using numpy arrays + full_ds = tf.data.Dataset.from_tensor_slices((ds_numpy, labelds_numpy)) + full_ds = full_ds.shuffle(1000) + count = len(ds_numpy) train_size = int(count * trainingSplit) if batchSize < 1: batchSize = self.findOptimalBatchSize(model, full_ds, training=True) @@ -85,24 +109,53 @@ def trainModelDetails(self, record, annotationName, batchSize, epochs, itemsAndA self.saveModel(model, modelPath) return history, modelPath + def _get_device(self, use_cuda): + if tf.config.list_physical_devices('GPU') and use_cuda: + return '/GPU:0' + return '/CPU:0' + def predictLabelsForItemDetails( - self, batchSize, ds: h5py._hl.dataset.Dataset, item, model, prog, + self, batchSize, ds: h5py._hl.dataset.Dataset, indices, item, model, use_cuda, prog, ): - # print(f'Tensorflow predictLabelsForItemDetails(batchSize={batchSize}, ...)') if batchSize < 1: batchSize = self.findOptimalBatchSize( model, tf.data.Dataset.from_tensor_slices(ds), training=False, ) print(f'Optimal batch size for prediction = {batchSize}') - predictions = model.predict( - ds, - batch_size=batchSize, - callbacks=[_LogTensorflowProgress( - prog, (ds.shape[0] + batchSize - 1) // batchSize, 0.05, 0.35, item)]) - prog.item_progress(item, 0.4) - # softmax to scale to 0 to 1 - catWeights = tf.nn.softmax(predictions) - return catWeights, predictions + + device = self._get_device(use_cuda) + with tf.device(device): + # Create a dataset that pairs the data with their indices + dataset = tf.data.Dataset.from_tensor_slices((ds, indices)) + dataset = dataset.batch(batchSize) + + # Initialize arrays to store results + all_predictions = [] + all_cat_weights = [] + all_indices = [] + + # Iterate through batches manually to keep track of indices + for data, batch_indices in dataset: + batch_predictions = model.predict( + data, + batch_size=batchSize, + verbose=0) # Set verbose=0 to avoid multiple progress bars + + # Apply softmax to scale to 0 to 1 + batch_cat_weights = tf.nn.softmax(batch_predictions) + + all_predictions.append(batch_predictions) + all_cat_weights.append(batch_cat_weights) + all_indices.append(batch_indices) + + prog.item_progress(item, 0.4) + + # Concatenate all results + predictions = tf.concat(all_predictions, axis=0) + catWeights = tf.concat(all_cat_weights, axis=0) + final_indices = tf.concat(all_indices, axis=0) + + return catWeights.numpy(), predictions.numpy(), final_indices.numpy().astype(np.int64) def findOptimalBatchSize(self, model, ds, training) -> int: if training and self.training_optimal_batchsize is not None: diff --git a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTorch.py b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTorch.py index e06d247..e8acb68 100644 --- 
a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTorch.py +++ b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTorch.py @@ -66,12 +66,10 @@ class _BayesianPatchTorchModel(bbald.consistent_mc_dropout.BayesianModule): # A Bayesian model that takes patches (2-dimensional shape) rather than vectors # (1-dimensional shape) as input. It is useful when feature != 'vector' and # SuperpixelClassificationBase.certainty == 'batchbald'. - def __init__(self, num_classes: int) -> None: + def __init__(self, num_classes: int, device : torch.device) -> None: # Set `self.device` as early as possible so that other code does not lock out # what we want. - self.device: str = torch.device( - ('cuda' if torch.cuda.is_available() and torch.cuda.device_count() > 0 else 'cpu'), - ) + self.device : torch.device = device # print(f'Initial model.device = {self.device}') super(_BayesianPatchTorchModel, self).__init__() @@ -134,18 +132,16 @@ class _VectorTorchModel(torch.nn.Module): # (2-dimensional shape) as input. It is useful when feature == 'vector' and # SuperpixelClassificationBase.certainty != 'batchbald'. - def __init__(self, input_dim: int, num_classes: int) -> None: + def __init__(self, input_dim: int, num_classes: int, device : torch.device) -> None: # Set `self.device` as early as possible so that other code does not lock out # what we want. - self.device: str = torch.device( - ('cuda' if torch.cuda.is_available() and torch.cuda.device_count() > 0 else 'cpu'), - ) + self.device: torch.device = device # print(f'Initial model.device = {self.device}') super(_VectorTorchModel, self).__init__() self.input_dim: int = input_dim self.num_classes: int = num_classes - self.fc: torch.Module = torch.nn.Linear(input_dim, num_classes) + self.fc: torch.Linear = torch.nn.Linear(input_dim, num_classes) def forward(self, input: torch.Tensor) -> torch.Tensor: # TODO: Is torch.mul appropriate here? @@ -161,20 +157,18 @@ class _BayesianVectorTorchModel(bbald.consistent_mc_dropout.BayesianModule): # (2-dimensional shape) as input. It is useful when feature == 'vector' and # SuperpixelClassificationBase.certainty == 'batchbald'. - def __init__(self, input_dim: int, num_classes: int) -> None: + def __init__(self, input_dim: int, num_classes: int, device : torch.device) -> None: # Set `self.device` as early as possible so that other code does not lock out # what we want. - self.device: str = torch.device( - ('cuda' if torch.cuda.is_available() and torch.cuda.device_count() > 0 else 'cpu'), - ) + self.device = device # print(f'Initial model.device = {self.device}') super(_BayesianVectorTorchModel, self).__init__() self.input_dim: int = input_dim self.num_classes: int = num_classes self.bayesian_samples: int = 12 - self.fc: torch.Module = torch.nn.Linear(input_dim, num_classes) - self.fc_drop: torch.Module = bbald.consistent_mc_dropout.ConsistentMCDropout() + self.fc: torch.Linear = torch.nn.Linear(input_dim, num_classes) + self.fc_drop: torch.ConsistentMCDropout = bbald.consistent_mc_dropout.ConsistentMCDropout() def mc_forward_impl(self, input: torch.Tensor) -> torch.Tensor: # TODO: Is torch.mul appropriate here? 
@@ -311,14 +305,17 @@ def trainModelDetails( prog: ProgressHelper, tempdir: str, trainingSplit: float, + cuda : bool, ): + device = torch.device("cuda" if cuda else "cpu") + print(f"Using device: {device}") # make model num_classes: int = len(record['labels']) model: torch.nn.Module if self.feature_is_image: # Feature is patch if self.certainty == 'batchbald': - model = _BayesianPatchTorchModel(num_classes) + model = _BayesianPatchTorchModel(num_classes, device) else: mesg = 'Expected torch model for input of type image to be Bayesian' raise ValueError(mesg) @@ -326,9 +323,9 @@ def trainModelDetails( # Feature is vector input_dim: int = record['ds'].shape[1] if self.certainty == 'batchbald': - model = _BayesianVectorTorchModel(input_dim, num_classes) + model = _BayesianVectorTorchModel(input_dim, num_classes, device) else: - model = _VectorTorchModel(input_dim, num_classes) + model = _VectorTorchModel(input_dim, num_classes, device) model.to(model.device) # print(f'Torch trainModelDetails(batchSize={batchSize}, ...)') @@ -348,6 +345,7 @@ def trainModelDetails( val_ds: torch.utils.data.TensorDataset train_dl: torch.utils.data.DataLoader val_dl: torch.utils.data.DataLoader + prog.message('Loading features for model training') train_arg1 = ( torch.from_numpy(record['ds'][train_indices].transpose((0, 3, 2, 1))) if self.feature_is_image @@ -507,7 +505,7 @@ def fitModel( return history def predictLabelsForItemDetails( - self, batchSize: int, ds_h5, item, model: torch.nn.Module, prog: ProgressHelper, + self, batchSize: int, ds_h5, indices, item, model: torch.nn.Module, use_cuda : bool, prog: ProgressHelper, ): # print(f'Torch predictLabelsForItemDetails(batchSize={batchSize}, ...)') num_superpixels: int = ds_h5.shape[0] @@ -517,6 +515,9 @@ def predictLabelsForItemDetails( num_classes: int = model.num_classes # print(f'{num_classes = }') + # also set on model.device, ideally + #device = torch.device("cuda" if use_cuda else "cpu") + callbacks = [ _LogTorchProgress(prog, 1 + (num_superpixels - 1) // batchSize, 0.05, 0.35, item), ] @@ -532,12 +533,13 @@ def predictLabelsForItemDetails( for cb in callbacks: cb.on_predict_begin(logs=logs) + # ds also needs to have information about the indices so that we can shuffle the data but still link it to an index ds: torch.utils.data.TensorDataset = torch.utils.data.TensorDataset( ( torch.from_numpy(np.array(ds_h5).transpose((0, 3, 2, 1))) if self.feature_is_image else torch.from_numpy(np.array(ds_h5)) - ), + ), torch.from_numpy(indices), ) if batchSize < 1: batchSize = self.findOptimalBatchSize(model, ds, training=False) @@ -545,6 +547,7 @@ def predictLabelsForItemDetails( dl: torch.utils.data.DataLoader = torch.utils.data.DataLoader(ds, batch_size=batchSize) predictions: NDArray[np.float_] = np.zeros((num_superpixels, bayesian_samples, num_classes)) catWeights: NDArray[np.float_] = np.zeros((num_superpixels, bayesian_samples, num_classes)) + outIndices: NDArray[np.int64] = np.zeros(num_superpixels, dtype=np.int64) with torch.no_grad(): model.eval() # Tell torch that we will be doing predictions row: int = 0 @@ -567,6 +570,8 @@ def predictLabelsForItemDetails( catWeights_raw = torch.nn.functional.softmax(predictions_raw, dim=-1) predictions[row:new_row, :, :] = predictions_raw.detach().cpu().numpy() catWeights[row:new_row, :, :] = catWeights_raw.detach().cpu().numpy() + outIndices[row:new_row] = data[1].detach().cpu().numpy().astype(np.int64)[:] + row = new_row for cb in callbacks: cb.on_predict_batch_end(i) @@ -574,7 +579,7 @@ def predictLabelsForItemDetails( 
cb.on_predict_end({'outputs': predictions}) prog.item_progress(item, 0.4) # scale to units - return catWeights, predictions + return catWeights, predictions, outIndices def findOptimalBatchSize( self, model: torch.nn.Module, ds: torch.utils.data.TensorDataset, training: bool, @@ -651,9 +656,14 @@ def add_safe_globals(self): def loadModel(self, modelPath): self.add_safe_globals() - model = torch.load(modelPath) - model.eval() - return model + try: + model = torch.load(modelPath, weights_only=False) + model.eval() + return model + except Exception as e: + print(f"Unable to load {modelPath}") + raise + def saveModel(self, model, modelPath): self.add_safe_globals() diff --git a/superpixel_classification/SuperpixelClassification/benchmarks/benchmark_torch.py b/superpixel_classification/SuperpixelClassification/benchmarks/benchmark_torch.py new file mode 100644 index 0000000..617ae86 --- /dev/null +++ b/superpixel_classification/SuperpixelClassification/benchmarks/benchmark_torch.py @@ -0,0 +1,193 @@ +''' Benchmark script for the SuperpixelClassificationTorch class +Originally written by feeding "tests/test_torch.py" to ChatGPT and asking for a benchmarking using timeit. +''' +import shutil +import numpy as np +import h5py +import os +import tempfile +import timeit +from unittest.mock import MagicMock +import csv +import matplotlib.pyplot as plt +from datetime import datetime + +from IPython.utils.path import ensure_dir_exists +from more_itertools.more import side_effect +from superpixel_classification.SuperpixelClassification.SuperpixelClassificationBase import SuperpixelClassificationBase +from superpixel_classification.SuperpixelClassification.SuperpixelClassificationTorch import SuperpixelClassificationTorch +from superpixel_classification.SuperpixelClassification.progress_helper import ProgressHelper + +import argparse + +def parse_args(): + parser = argparse.ArgumentParser(description="Benchmark SuperpixelClassificationTorch.") + parser.add_argument('--mnist-image-size', type=int, default=100, help='patchsize of individual images') + parser.add_argument('--color-dim', type=int, default=3, help='Number of color channels') + parser.add_argument('--image-sizes', default=list(map(int, [1e3, 1e4])), help='Output path for the pyramidal TIF file') + parser.add_argument('--epochs', default=3, type=int, help='Number of epochs to train') + parser.add_argument('--out-dir', default='benchmark_results', type=str, help='default output directory for benchmark results') + + return parser.parse_args() + + +def create_sample_data(num_images, tmpdir, image_size, color_dim): + h5_path = os.path.join(tmpdir, "test_data.h5") + images = np.random.randint(0, 255, size=(num_images, image_size, image_size, color_dim), dtype=np.uint8) + + with h5py.File(h5_path, 'w') as f: + f.create_dataset('images', data=images) + f.create_dataset('used_indices', data=np.arange(num_images - 2)) + + return h5_path + +def train_model(num_images, num_epochs, h5_path): + base: SuperpixelClassificationBase = SuperpixelClassificationTorch() + base.feature_is_image = True + base.certainty = 'batchbald' + + # Mock girder client + gc = MagicMock() + def mv_to_dst(_, dst): + return shutil.copy(h5_path, dst) + gc.downloadFile = MagicMock(side_effect=mv_to_dst) + def mv_to_src(_, src): + dst = os.path.dirname(os.path.dirname(h5_path)) + return shutil.copy(src, dst) + gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value=True) + + labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] + elem = { + 'girderId': 
'test_girder_id', + 'categories': [ + {"label": c} for c in labels + ], + 'values': + [] \ + + np.random.randint(1, len(labels) - 1, size=(num_images - 2), dtype=np.uint8).tolist() + + [0, 0], # last two images unlabeled + 'transform': {'matrix': [[1.0]]} + } + + item = {'_id': 'test_h5_file', 'name': 'test'} + annotrec = {'_id': '1', '_version': 0, 'annotation': {'name': 'TorchTest'}} + items = [(item, annotrec, elem)] + + with ProgressHelper('Superpixel Classification', 'Test training', True) as prog: + prog.progress(0) + prog.items(items) + modelFile, modelTrainingFile = base.trainModel( + gc=gc, + annotationName="TorchTest", + itemsAndAnnot=items, + features={'test_h5_file': {'_id': 'feature_id', 'name': 'test_h5_file'}}, + modelFolderId="test_folder_id", + batchSize=4, + epochs=1, + trainingSplit=0.5, + randomInput=False, + labelList='', + excludeLabelList=[], + prog=prog, + use_cuda=True, + ) + + return modelFile, modelTrainingFile + +def create_benchmark_plot(results, out_dir): + plt.figure(figsize=(12, 6)) + + # Number of image sizes and runs + n_sizes = len(results) + n_runs = len(results[0]['times']) + + # Create positions for bars + ind = np.arange(n_sizes) + width = 0.25 # Width of bars + + # Plot bars for each run + for i in range(n_runs): + times = [result['times'][i] for result in results] + plt.bar(ind + i*width, times, width, label=f'Run {i+1}') + + plt.xlabel('Number of Images') + plt.ylabel('Time (seconds)') + plt.title('Model Training Benchmark Times') + + # Set x-axis labels + plt.xticks(ind + width, [str(result['num_images']) for result in results]) + + plt.legend() + plt.tight_layout() + + # Save plot + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + dst_pth = os.path.join(out_dir, f'benchmark_results_{timestamp}.png') + plt.savefig(dst_pth) + plt.close() + + return dst_pth + +def main(): + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + args = parse_args() + ensure_dir_exists(args.out_dir) + csv_filename = os.path.join(args.out_dir, f'benchmark_results_{timestamp}.csv') + results = [] + + # Write CSV header + with open(csv_filename, 'w', newline='') as csvfile: + writer = csv.writer(csvfile) + writer.writerow(['Num Images', 'Run 1', 'Run 2', 'Run 3', 'Average', 'Best']) + + for num_images in args.image_sizes: + print(f"\nBenchmarking with NUM_IMAGES = {num_images}") + with tempfile.TemporaryDirectory() as tmpdir: + h5_path = create_sample_data(num_images, tmpdir, args.mnist_image_size, args.color_dim) + timer = timeit.Timer(lambda: train_model(num_images, args.epochs, h5_path)) + + try: + times = timer.repeat(repeat=3, number=1) + avg_time = sum(times) / len(times) + best_time = min(times) + + # Store results for plotting + results.append({ + 'num_images': num_images, + 'times': times, + 'average': avg_time, + 'best': best_time + }) + + # Write results to CSV + with open(csv_filename, 'a', newline='') as csvfile: + writer = csv.writer(csvfile) + writer.writerow([ + num_images, + round(times[0], 3), + round(times[1], 3), + round(times[2], 3), + round(avg_time, 3), + round(best_time, 3) + ]) + + print(f"Times for each run (seconds): {[round(t, 3) for t in times]}") + print(f"Average time (seconds): {round(avg_time, 3)}") + print(f"Best time (seconds): {round(best_time, 3)}") + + except Exception as e: + print(f"Error during benchmark: {str(e)}") + # Write error to CSV + with open(csv_filename, 'a', newline='') as csvfile: + writer = csv.writer(csvfile) + writer.writerow([num_images, f"Error: {str(e)}", "", "", "", ""]) + finally: + shutil.rmtree(tmpdir) 
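+    # `results` accumulates one entry per image count with the three timeit runs; it feeds
+    # the CSV rows written above and the grouped bar chart rendered below.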
+ + # Create and save the plot + out_file = create_benchmark_plot(results, args.out_dir) + print(f"\nResults saved to {csv_filename}") + print(f"Plot saved as {out_file}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/superpixel_classification/SuperpixelClassification/tests/generate_MNIST_image.py b/superpixel_classification/SuperpixelClassification/tests/generate_MNIST_image.py new file mode 100644 index 0000000..9d7e121 --- /dev/null +++ b/superpixel_classification/SuperpixelClassification/tests/generate_MNIST_image.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python +''' +Generate a .tiff with numbers from MNIST +''' + +import os +import argparse +import random + +import numpy as np +import pandas as pd +import tifffile +from PIL import Image +from torchvision.datasets import MNIST + +def parse_args(): + # Parse arguments + parser = argparse.ArgumentParser(description="Generate a pyramidal MNIST image.") + parser.add_argument('--root_dataset_path', type=str, default="/data/aza4423_anders/mnist", help='Path to download and store MNIST dataset') + #parser.add_argument('--num_images', type=int, default=244 * 244, help='Number of random MNIST images to use') + parser.add_argument('--num_images', type=int, default=4, help='Number of random MNIST images to use') + parser.add_argument('--output_path', type=str, default="/data/aza4423_anders/aml-dsa/mnist_pyramid.tif", help='Output path for the pyramidal TIF file') + parser.add_argument('--test', default=False, type=bool, action=argparse.BooleanOptionalAction, + metavar='T', + help='whether to use test MNIST or train' + ) + + args = parser.parse_args() + + return args + +def d_to_rgb(d): + r = d & 0xFF + g = (d >> 8) & 0xFF + b = (d >> 16) & 0xFF + return [r, g, b] + + +def create_mnist_image(root_dataset_path=".", num_images=100, output_path="./out", test=False, start_value=0): + # verify that num_images has a square root; otherwise we'd have to insert blank tiles for the uneven grid + assert num_images % np.sqrt(num_images) == 0 + + # Download MNIST (if not already downloaded) + dataset = MNIST(root=root_dataset_path, train=not test, download=True) + + # Select N random MNIST images (each image is PIL.Image in mode "L") + # (Make the number square-rootable) + num_images = num_images # Number of images from argument + # oversample if we want more images than the length of MNIST + if num_images > len(dataset): + indices = random.choices(range(len(dataset)), k=num_images) + else: + indices = list(range(num_images)) + random.shuffle(indices) + + #indices = random.sample(range(len(dataset)), num_images) + mnist_images = [np.array(dataset[i][0]) for i in indices] # each is 28x28, uint8 + mnist_labels = [np.array(dataset[i][1]) for i in indices] + + # Arrange the images in a grid (so num_images should be a number with an integer root) + tile_rows, tile_cols = int(np.sqrt(num_images)), int(np.sqrt(num_images)) + tile_h, tile_w = mnist_images[0].shape # typically 28x28 + grid_h, grid_w = tile_rows * tile_h, tile_cols * tile_w + base_image = np.zeros((grid_h, grid_w, 3), dtype=np.uint8) + pm_image = np.zeros((grid_h, grid_w, 3), dtype=np.uint8) + + for idx, img in enumerate(mnist_images): + r = idx // tile_cols + c = idx % tile_cols + # convert img to RGB + rgb_img = np.stack([img, img, img], axis=-1) + base_image[r*tile_h:(r+1)*tile_h, c*tile_w:(c+1)*tile_w, :] = rgb_img + + value_img = np.zeros((tile_h, tile_w, 3), dtype=np.uint8) + i = (idx + 1) * 2 + rgb = d_to_rgb(i + start_value) + value_img[1:-1, 1:-1] = rgb + rgb = 
d_to_rgb(i + start_value + 1) + value_img[0, :] = rgb + value_img[-1, :] = rgb + value_img[:, 0] = rgb + value_img[:, -1] = rgb + + pm_image[r*tile_h:(r+1)*tile_h, c*tile_w:(c+1)*tile_w, :] = value_img + + + # Note: We assume that the base level corresponds to 40x magnification. + # Now, build a pyramid (list of downsampled images). + pyramid_pm = [pm_image] + pm_current = pm_image.copy() + + pyramid = [base_image] + current = base_image.copy() + # Continue downsampling by a factor of 2 until one dimension becomes very small. + while min(current.shape) >= 64: + # Use Pillow to resize (ANTIALIAS gives good quality downsampling) + im = Image.fromarray(current) + new_w, new_h = current.shape[1] // 2, current.shape[0] // 2 + if new_w < 1 or new_h < 1: + break + im_resized = im.resize((new_w, new_h)) + current = np.array(im_resized) + pyramid.append(current) + + im = Image.fromarray(pm_image) + new_w, new_h = pm_current.shape[1] // 2, pm_current.shape[0] // 2 + if new_w < 1 or new_h < 1: + break + im_resized = im.resize((new_w, new_h)) + pm_current = np.array(im_resized) + pyramid_pm.append(current) + + # Save the image as a pyramidal TIFF. + # The base image is the main image and the pyramid list (excluding the base) is saved as subIFDs. + output_filename = output_path # Use the output path from argument + if os.path.dirname(output_filename): + os.makedirs(os.path.dirname(output_filename), exist_ok=True) + if os.path.exists(output_filename): + os.remove(output_filename) + + with tifffile.TiffWriter(output_filename, bigtiff=False) as tif: + tif.write(pyramid[0], + tile=(tile_w * 4, tile_h * 4), + photometric='RGB', + description='Whole-slide MNIST image at 40x magnification', + subifds=pyramid[1:]) + print(f"Pyramidal TIFF saved as {output_filename}") + + output_filename_pm = output_filename + ".pixelmap.tiff" # Use the output path from argument + if os.path.dirname(output_filename_pm): + os.makedirs(os.path.dirname(output_filename_pm), exist_ok=True) + if os.path.exists(output_filename_pm): + os.remove(output_filename_pm) + with tifffile.TiffWriter(output_filename_pm, bigtiff=False) as tif: + tif.write(pyramid_pm[0], + tile=(tile_w * 4, tile_h * 4), + photometric='RGB', + description='Pixelmap for Whole-slide MNIST image at 40x magnification', + subifds=pyramid_pm[1:]) + print(f"Pyramidal TIFF saved as {output_filename_pm}") + + # generate a corresponding CSV "cells" file + # with headers "x,y,w,h" for each image + csv_filename = output_filename + "_cells.csv" + with open(csv_filename, 'w') as f: + f.write("x,y,w,h,value\n") + i = 0 + for r in range(tile_rows): + for c in range(tile_cols): + x, y = c * tile_w, r * tile_h + f.write(f"{x},{y},{tile_w},{tile_h},{mnist_labels[i]}\n") + i += 1 + df = pd.read_csv(csv_filename, header=0) + print(f"Annotation file saved as {csv_filename}") + return output_filename, output_filename_pm, df + +if __name__ == "__main__": + _args = parse_args() + create_mnist_image(_args.root_dataset_path, _args.num_images, _args.output_path, _args.test) diff --git a/superpixel_classification/SuperpixelClassification/tests/test_feature_extract.py b/superpixel_classification/SuperpixelClassification/tests/test_feature_extract.py new file mode 100644 index 0000000..2c17864 --- /dev/null +++ b/superpixel_classification/SuperpixelClassification/tests/test_feature_extract.py @@ -0,0 +1,218 @@ +import os +import shutil +import sys +import tempfile +from unittest.mock import MagicMock + +import h5py +import large_image +import numpy as np +import pytest + +# make pythonpath work 
out of the box - although your editor may complain +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(os.path.dirname(SCRIPT_DIR)) + +from SuperpixelClassificationBase import SuperpixelClassificationBase +from progress_helper import ProgressHelper +from tests.generate_MNIST_image import create_mnist_image + +from xdg_base_dirs import ( xdg_cache_home, ) + +NUM_IMAGES = 64 + +@pytest.fixture(scope="session") +def create_sample_data(): + global NUM_IMAGES + with tempfile.TemporaryDirectory() as tmpdirname: + tiff_path = os.path.join(tmpdirname, "test_mnist.tiff") + #tiff_path_pm = os.path.join(tmpdirname, "test_mnist.tiff.pixelmap.tiff") + + tiff_path, tiff_path_pm, labels = create_mnist_image( + root_dataset_path=xdg_cache_home(), + num_images=NUM_IMAGES, + output_path=tiff_path, + test=False, + ) + # 0 is background + labels['value'] = labels['value'] + 1 + + # we use yield so that the temporarydirectory is still open in the tests + yield tiff_path, tiff_path_pm, NUM_IMAGES, labels + +MNIST_IMAGE_SIZE=28 +COLOR_DIM = 3 + +def test_cutoff(create_sample_data): + global MNIST_IMAGE_SIZE, COLOR_DIM + test_image_pth, test_image_pth_pm, num_images, labels = create_sample_data + base = SuperpixelClassificationBase() + + # Create test data + item = { + 'name': test_image_pth, + 'largeImage': {'fileId': 'test_image_id'} + } + + # Mock girder client + gc = MagicMock() + def mv_to_dst(_, dst): + if "pixelmap" in dst: + if not os.path.exists(dst): + return shutil.copy(test_image_pth_pm, dst) + else: + if not os.path.exists(dst): + return shutil.copy(test_image_pth, dst) + return None + gc.downloadFile = MagicMock(side_effect=mv_to_dst) + gc.getItem = MagicMock(return_value={'name': test_image_pth_pm, 'largeImage': {'fileId': 'foobar'}}) + def mv_to_src(_, src): + dst = os.path.dirname(test_image_pth) + return shutil.copy(src, dst) + gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value={'_id': 'test_file_id'}) + #gc.uploadFileToFolder = MagicMock(return_value={'_id': 'test_file_id'}) + + bboxes = [[x, y, w + x, y + h] for _, (x, y, w, h) in labels[['x', 'y', 'w', 'h']].iterrows()] + + elem = { + 'girderId': 'test_girder_id', + 'values': + [] \ + + list(labels['value'])[:-2] + + [0, 0], # last two images unlabeled + 'user': { + 'bbox': [item for sublist in bboxes for item in sublist] + }, + 'transform': {'matrix': [[1.0]]} + } + + filename = 'test_features.h5' + h5_file = os.path.join(os.path.dirname(test_image_pth), filename) + if os.path.exists(h5_file): + os.remove(h5_file) + + assert not os.path.exists(h5_file) + + cutoff = 1 + with ProgressHelper( 'Superpixel Classification', + 'Test feature', False) as prog: + prog.progress(0) + prog.items([item]) + result = base.createFeaturesForItem( + gc=gc, + item=item, + elem=elem, + featureFolderId='test_folder_id', + fileName=filename, + patchSize=MNIST_IMAGE_SIZE, + prog=prog, + cutoff=cutoff, + ) + + assert os.path.exists(h5_file), f"Output file {h5_file} does not exist" + with h5py.File(h5_file, 'r') as ffptr: + assert 'images' in ffptr + assert ffptr['images'].shape == (NUM_IMAGES - cutoff, MNIST_IMAGE_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM) + assert len(ffptr['used_indices']) == NUM_IMAGES - cutoff # number of labeled - cutoff + +def test_create_features_for_item(create_sample_data): + global MNIST_IMAGE_SIZE, COLOR_DIM + test_image_pth, test_image_pth_pm, num_images, labels = create_sample_data + base = SuperpixelClassificationBase() + + # Create test data + item = { + 'name': test_image_pth, + 'largeImage': 
{'fileId': 'test_image_id'} + } + + # Mock girder client + gc = MagicMock() + def mv_to_dst(_, dst): + if "pixelmap" in dst: + if not os.path.exists(dst): + return shutil.copy(test_image_pth_pm, dst) + else: + if not os.path.exists(dst): + return shutil.copy(test_image_pth, dst) + return None + gc.downloadFile = MagicMock(side_effect=mv_to_dst) + gc.getItem = MagicMock(return_value={'name': test_image_pth_pm, 'largeImage': {'fileId': 'foobar'}}) + def mv_to_src(_, src): + dst = os.path.dirname(test_image_pth) + return shutil.copy(src, dst) + gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value={'_id': 'test_file_id'}) + #gc.uploadFileToFolder = MagicMock(return_value={'_id': 'test_file_id'}) + + bboxes = [[x, y, w + x, y + h] for _, (x, y, w, h) in labels[['x', 'y', 'w', 'h']].iterrows()] + + elem = { + 'girderId': 'test_girder_id', + 'values': + [] \ + + list(labels['value'])[:-2] + + [0, 0], # last two images unlabeled + 'user': { + 'bbox': [item for sublist in bboxes for item in sublist] + }, + 'transform': {'matrix': [[1.0]]} + } + + filename = 'test_features.h5' + h5_file = os.path.join(os.path.dirname(test_image_pth), filename) + if os.path.exists(h5_file): + os.remove(h5_file) + + assert not os.path.exists(h5_file) + + with ProgressHelper( 'Superpixel Classification', + 'Test feature', False) as prog: + prog.progress(0) + prog.items([item]) + result = base.createFeaturesForItem( + gc=gc, + item=item, + elem=elem, + featureFolderId='test_folder_id', + fileName=filename, + patchSize=MNIST_IMAGE_SIZE, + prog=prog, + cutoff=9999 + ) + + assert os.path.exists(h5_file), f"Output file {h5_file} does not exist" + with h5py.File(h5_file, 'r') as ffptr: + assert 'images' in ffptr + assert ffptr['images'].shape == (num_images, MNIST_IMAGE_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM) + feature_img = ffptr['images'][0] + # open test_image_pth using coordinates [x,y,w,h] from elem['user']['bbox'][:4] and make sure it's pixel-equal with first_img + x, y, x2, y2 = elem['user']['bbox'][:4] + ts = large_image.getTileSource(test_image_pth) + orig_image = ts.getRegion( + region=dict(left=x, top=y, right=x2, bottom=y2), + format=large_image.tilesource.TILE_FORMAT_NUMPY + )[0] + orig_image = orig_image.astype(feature_img.dtype) + print(orig_image.dtype) + np.testing.assert_array_equal(orig_image, feature_img) + + # also check that the last image matches + feature_img = ffptr['images'][-1] + x, y, x2, y2 = elem['user']['bbox'][-4:] + ts = large_image.getTileSource(test_image_pth) + orig_image = ts.getRegion( + region=dict(left=x, top=y, right=x2, bottom=y2), + format=large_image.tilesource.TILE_FORMAT_NUMPY + )[0] + orig_image = orig_image.astype(feature_img.dtype) + print(orig_image.dtype) + np.testing.assert_array_equal(orig_image, feature_img) + + assert 'used_indices' in ffptr + assert len(ffptr['used_indices']) == num_images + + # Assertions + assert result == h5_file + assert gc.downloadFile.call_count == 2 # Called for both image and mask + assert gc.getItem.call_count == 1 + assert gc.uploadFileToFolder.call_count == 1 diff --git a/superpixel_classification/SuperpixelClassification/tests/test_full_training_cycle.py b/superpixel_classification/SuperpixelClassification/tests/test_full_training_cycle.py new file mode 100644 index 0000000..03c6b8a --- /dev/null +++ b/superpixel_classification/SuperpixelClassification/tests/test_full_training_cycle.py @@ -0,0 +1,524 @@ +''' +This file contains tests for a full training cycle: extracting superpixels, training and evaluation. +The "cycle" is: + 1. 
generate NUM_WSIS different whole slide images using numbers from MNIST. + 2. extract features from said images. + 3. train a model on the features. + 4. evaluate the model on the features. +We expect an accuracy of at least 90%. + +This test is to verify that the training cycle works as expected. +Since there is batching involved, we want to use a larger number of samples instead of just a quick mini-test, as found in the other files. +''' +import argparse +import glob +import json +import os +import re +import shutil +import sys +import tempfile +from unittest.mock import MagicMock + +import numpy as np +import pytest +from xdg_base_dirs import (xdg_cache_home, ) + +# make pythonpath work out of the box - although your editor may complain +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(os.path.dirname(SCRIPT_DIR)) + +from SuperpixelClassificationBase import SuperpixelClassificationBase +from SuperpixelClassificationTensorflow import SuperpixelClassificationTensorflow +from SuperpixelClassificationTorch import SuperpixelClassificationTorch +from tests.generate_MNIST_image import create_mnist_image + +NUM_WSIS = 2 +MNIST_IMAGE_SIZE = 28 +NUM_IMAGES_PER_WSI = 10 ** 2 +COLOR_DIM = 3 +PATCH_SIZE = 100 # only size compatible with pytorch model for the time being (since there are hardcoded sizes in the definition of the model) +NUM_EPOCHS = 5 + +@pytest.fixture(scope="function") +def create_sample_data(request): + global NUM_WSIS, NUM_IMAGES_PER_WSI + wsi_paths, pm_paths, list_labels = [], [], [] + with tempfile.TemporaryDirectory() as tmpdirname: + for i in range(NUM_WSIS): + tiff_path = os.path.join(tmpdirname, f"test_mnist_{i}.tiff") + + tiff_path, tiff_path_pm, labels = create_mnist_image( + root_dataset_path=xdg_cache_home(), + num_images=NUM_IMAGES_PER_WSI, + output_path=tiff_path, + test=False, + start_value = request.param + ) + # where labels['value'] == 0, put 10 instead, since 0 will be reserved for unlabeled + labels.loc[labels['value'] == 0, 'value'] = 10 + + wsi_paths.append(tiff_path) + pm_paths.append(tiff_path_pm) + list_labels.append(labels) + + # we use yield so that the temporarydirectory is still open in the tests + yield wsi_paths, pm_paths, NUM_WSIS, list_labels + +@pytest.mark.skipif("RUNALL" not in os.environ, reason="this is a slow test (~5-10 min), run only if you want to") +@pytest.mark.parametrize('create_sample_data', [0], indirect=True) +def test_main_pytorch(create_sample_data): + global NUM_WSIS, PATCH_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM, NUM_EPOCHS + tiff_paths, tiff_path_pms, num_images, labels = create_sample_data + base: SuperpixelClassificationBase = SuperpixelClassificationTorch() + + annotation_name = 'torchMNISTtest' + config = dict( + annotationDir = 'annotationdir', + annotationName = annotation_name, + batchSize = int(np.sqrt(NUM_IMAGES_PER_WSI)), # one row of the wsi at a time + certainty = 'batchbald', + cutoff = 600000, # plenty of space to allow all training samples + epochs = NUM_EPOCHS, + exclude = [], + feature = 'patch', + features = 'featuredir', + gensuperpixels = False, + girderApiUrl = 'http://localhost:8080/api/v1', + girderToken = '', + heatmaps = False, + images = 'imagedir', + labels = '', + magnification = 40.0, + modeldir = '', + numWorkers = 1, + patchSize = PATCH_SIZE, + radius = MNIST_IMAGE_SIZE, + randominput = False, + split = 0.7, + train = True, + useCuda = True, + progress = True, + ) + args = argparse.Namespace(**config) + + mnist_labels = ['default', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'] 
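+    # category index 0 ('default') stands in for unlabeled superpixels; the fixture remapped
+    # MNIST digit 0 to value 10 above, so it maps onto the final '0' entry of this list.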
+ + items = [] + for i in range(NUM_WSIS): + bboxes = [[x, y, w + x, y + h] for _, (x, y, w, h) in labels[i][['x', 'y', 'w', 'h']].iterrows()] + elem = { + 'girderId': f'test_girder_id{i}', + 'categories': [ + {"label": c} for c in mnist_labels + ], + 'values': labels[i]['value'].tolist(), + 'user': { + 'bbox': [item for sublist in bboxes for item in sublist] + }, + 'transform': {'matrix': [[1.0]]} + } + item = { + '_id': f'test_file{i}', + 'name': os.path.basename(tiff_paths[i]), + 'largeImage': {'fileId': f'test_image_id{i}'}, + } + mask_item = { + '_id': f'test_file{i}', + 'name': '.tiff'.join(os.path.basename(tiff_path_pms[i]).split('.tiff')[:-1]), + 'largeImage': {'fileId': f'test_mask_id{i}'}, + } + annotrec = { + '_id': f'test_file{i}', + '_version': 0, + 'annotation': {'name': 'TorchTest'}, + } + items.append((item, annotrec, elem)) + + + gc = MagicMock() + base.getItemsAndAnnotations = MagicMock(return_value=items) + + with tempfile.TemporaryDirectory() as tmpdirname: + def mv_to_dst(req_pth : str, dst : str): + if req_pth.startswith("test_"): + for f in tiff_paths + tiff_path_pms: + dpath = os.path.join(dst, os.path.basename(f)) + if not os.path.exists(dpath) and os.path.basename(f) == os.path.basename(dst): + shutil.copy(f, dst) + print(f"Copied {f} to {dst}") + elif req_pth.startswith("feature"): + feature_files = glob.glob(os.path.join(tmpdirname, "*feature.h5")) + for f in feature_files: + dpath = os.path.join(dst, os.path.basename(f)) + if not os.path.exists(dpath) and os.path.basename(f) == os.path.basename(dst): + shutil.copy(f, dst) + print(f"Copied {f} to {dst}") + elif req_pth.endswith("model"): + model_file = glob.glob(os.path.join(tmpdirname, f"*Model *{0}.pth"))[0] + shutil.copy(model_file, dst) + elif "modtraining" in req_pth: + model_file = glob.glob(os.path.join(tmpdirname, f"*ModTraining *{0}.h5"))[0] + shutil.copy(model_file, dst) + else: + print(f"Received unknown request path '{req_pth}'") + return {} + + gc.downloadFile = MagicMock(side_effect=mv_to_dst) + def mv_to_src(req, src, reference=None): + shutil.copy(src, tmpdirname) + print(f"Copied {src} to {tmpdirname}") + # each WSI gets two separate .anot files. 
The below if statement gives them unique filenames so we can reference later + if src.endswith(".anot"): + # extract the number at the end of req, which can look like "testfile1" or "testfile1000" + m = re.search(r'(\d+)$', req) + num = int(m.group(1)) + s = os.path.basename(src).replace(".anot", f"_{num}.myanot") + shutil.copy(src, os.path.join(tmpdirname, s)) + print(f"Also copied {s} to {tmpdirname}") + return {'_id': 'feature', 'name': os.path.basename(src)} + gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value=True) + + gc.getItem = MagicMock(return_value=mask_item) + + gc.listResource = MagicMock(return_value=[dict(name=f"{annotation_name}model", _id = 'model'), dict(name=f"{annotation_name}modtraining", _id = 'modtraining')]) + gc.uploadFileToItem = MagicMock(side_effect=mv_to_src, return_value=True) + gc.getFolder = MagicMock(return_value=dict(name='test_folder', creatorId='creatorId', _id='test_folder_id')) + + def list_file(req: str, limit: int = 0) -> iter: + if "modtraining" in req: + return iter([dict(name=req, _id = 'modtraining')]) + else: + return iter([dict(name=req, _id='model')]) + gc.listFile = MagicMock(side_effect=list_file) + + base.main(args, gc) + + for file in sorted(glob.glob(os.path.join(tmpdirname, f"*Predictions*.myanot"))): + assert os.path.exists(file) + with open(file, 'r') as f: + pred_json = json.load(f) + e = pred_json['elements'][0] + assert len(e['values']) == NUM_IMAGES_PER_WSI + + assert len(e['user']['bbox']) == NUM_IMAGES_PER_WSI * 4 # 4 is for x,y,w,h + + assert len(e['categories']) == len(mnist_labels) - 1 # -1 because we don't have a default category + assert len(e['user']['confidence']) == NUM_IMAGES_PER_WSI + + # compare e['values'] to labels['values'], to make sure we've trained a valid model + # the order of the values is shuffled in the annotation file, the ordering is in e['categories'] + file_num = int(file.split('Predictions_')[-1].split('.myanot')[0]) + predicted_labels = np.array([e['categories'][c]['label'] for c in e['values']]) + matches = (predicted_labels == np.array(list(map(str, labels[file_num]['value'])))) + similarity = matches.sum() / len(matches) + expected_min_accuracy = 0.75 + assert similarity > expected_min_accuracy, f"File {file}: Similarity between predicted values and GT is {similarity}, expected > {expected_min_accuracy}" + print(f"Similarity between predicted values and GT is {similarity}") + +@pytest.mark.skipif("RUNALL" not in os.environ, reason="this is a slow test (~1-10 min), run only if you want to") +@pytest.mark.parametrize('create_sample_data', [0], indirect=True) +def test_main_tf(create_sample_data): + global NUM_WSIS, PATCH_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM, NUM_EPOCHS + tiff_paths, tiff_path_pms, num_images, labels = create_sample_data + base: SuperpixelClassificationBase = SuperpixelClassificationTensorflow() + + annotation_name = 'tensorflowMNISTtest' + config = dict( + annotationDir = 'annotationdir', + annotationName = annotation_name, + batchSize = int(np.sqrt(NUM_IMAGES_PER_WSI)), # one row of the wsi at a time + certainty = 'confidence', + cutoff = 600000, # plenty of space to allow all training samples + epochs = NUM_EPOCHS, + exclude = [], + feature = 'patch', + features = 'featuredir', + gensuperpixels = False, + girderApiUrl = 'http://localhost:8080/api/v1', + girderToken = '', + heatmaps = False, + images = 'imagedir', + labels = '', + magnification = 40.0, + modeldir = 'modeldir', + numWorkers = 1, + patchSize = PATCH_SIZE, + radius = MNIST_IMAGE_SIZE, + randominput = 
False, + split = 0.7, + train = True, + useCuda = False, + progress = True, + ) + args = argparse.Namespace(**config) + + mnist_labels = ['default', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'] + + items = [] + for i in range(NUM_WSIS): + bboxes = [[x, y, w + x, y + h] for _, (x, y, w, h) in labels[i][['x', 'y', 'w', 'h']].iterrows()] + elem = { + 'girderId': f'test_girder_id{i}', + 'categories': [ + {"label": c} for c in mnist_labels + ], + 'values': labels[i]['value'].tolist(), + 'user': { + 'bbox': [item for sublist in bboxes for item in sublist] + }, + 'transform': {'matrix': [[1.0]]} + } + item = { + '_id': f'test_file{i}', + 'name': os.path.basename(tiff_paths[i]), + 'largeImage': {'fileId': f'test_image_id{i}'}, + } + mask_item = { + '_id': f'test_file{i}', + 'name': '.tiff'.join(os.path.basename(tiff_path_pms[i]).split('.tiff')[:-1]), + 'largeImage': {'fileId': f'test_mask_id{i}'}, + } + annotrec = { + '_id': f'test_file{i}', + '_version': 0, + 'annotation': {'name': 'TorchTest'}, + } + items.append((item, annotrec, elem)) + + + gc = MagicMock() + base.getItemsAndAnnotations = MagicMock(return_value=items) + + with tempfile.TemporaryDirectory() as tmpdirname: + def mv_to_dst(req_pth : str, dst : str): + if req_pth.startswith("test_"): + for f in tiff_paths + tiff_path_pms: + dpath = os.path.join(dst, os.path.basename(f)) + if not os.path.exists(dpath) and os.path.basename(f) == os.path.basename(dst): + shutil.copy(f, dst) + print(f"MockDownload: Copied {f} to {dst}") + elif req_pth.startswith("feature"): + feature_files = glob.glob(os.path.join(tmpdirname, "*feature.h5")) + for f in feature_files: + dpath = os.path.join(dst, os.path.basename(f)) + if not os.path.exists(dpath) and os.path.basename(f) == os.path.basename(dst): + shutil.copy(f, dst) + print(f"MockDownload: Copied {f} to {dst}") + elif req_pth.endswith("model"): + model_file = glob.glob(os.path.join(tmpdirname, f"*Model *{0}.h5"))[0] + shutil.copy(model_file, dst) + elif "modtraining" in req_pth: + model_file = glob.glob(os.path.join(tmpdirname, f"*ModTraining *{0}.h5"))[0] + shutil.copy(model_file, dst) + else: + raise RuntimeError(f"Received unknown request path '{req_pth}'") + return {} + + gc.downloadFile = MagicMock(side_effect=mv_to_dst) + def mv_to_src(req, src, reference=None): + shutil.copy(src, tmpdirname) + print(f"MockUpload: Copied {src} to {tmpdirname}") + # each WSI gets two separate .anot files. 
The below if statement gives them unique filenames so we can reference later + if src.endswith(".anot"): + # extract the number at the end of req, which can look like "testfile1" or "testfile1000" + m = re.search(r'(\d+)$', req) + num = int(m.group(1)) + s = os.path.basename(src).replace(".anot", f"_{num}.myanot") + shutil.copy(src, os.path.join(tmpdirname, s)) + print(f"Also copied {s} to {tmpdirname}") + return {'_id': 'feature', 'name': os.path.basename(src)} + gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value=True) + + gc.getItem = MagicMock(return_value=mask_item) + + modelName = f"{annotation_name} Model Epoch 0.h5" + modTrainingName = f"{annotation_name} ModTraining Epoch 0.h5" + gc.listResource = MagicMock(return_value=[dict(name=modelName, _id = 'model'), dict(name=modTrainingName, _id = 'modtraining')]) + gc.uploadFileToItem = MagicMock(side_effect=mv_to_src, return_value=True) + gc.getFolder = MagicMock(return_value=dict(name='test_folder', creatorId='creatorId', _id='test_folder_id')) + + def list_file(req: str, limit: int = 0) -> iter: + if "modtraining" in req: + return iter([dict(name=modTrainingName, _id = 'modtraining')]) + else: + return iter([dict(name=modelName, _id='model')]) + gc.listFile = MagicMock(side_effect=list_file) + + base.main(args, gc) + + for file in sorted(glob.glob(os.path.join(tmpdirname, f"*Predictions*.myanot"))): + assert os.path.exists(file) + with open(file, 'r') as f: + pred_json = json.load(f) + e = pred_json['elements'][0] + assert len(e['values']) == NUM_IMAGES_PER_WSI + + assert len(e['user']['bbox']) == NUM_IMAGES_PER_WSI * 4 # 4 is for x,y,w,h + + assert len(e['categories']) == len(mnist_labels) - 1 # exclude the default category + assert len(e['user']['confidence']) == NUM_IMAGES_PER_WSI + + # compare e['values'] to labels['values'], to make sure we've trained a valid model + # the order of the values is shuffled in the annotation file, the ordering is in e['categories'] + file_num = int(file.split('Predictions_')[-1].split('.myanot')[0]) + predicted_labels = np.array([e['categories'][c]['label'] for c in e['values']]) + matches = (predicted_labels == np.array(list(map(str, labels[file_num]['value'])))) + similarity = matches.sum() / len(matches) + expected_min_accuracy = 0.75 + assert similarity > expected_min_accuracy, f"File {file}: Similarity between predicted values and GT is {similarity}, expected > {expected_min_accuracy}" + print(f"Similarity between predicted values and GT is {similarity}") + +@pytest.mark.skipif("RUNALL" not in os.environ, reason="this is a slow test (~1-10 min), run only if you want to") +@pytest.mark.parametrize('create_sample_data', [2], indirect=True) +def test_main_tf_with_background(create_sample_data): + global NUM_WSIS, PATCH_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM, NUM_EPOCHS + tiff_paths, tiff_path_pms, num_images, labels = create_sample_data + base: SuperpixelClassificationBase = SuperpixelClassificationTensorflow() + + annotation_name = 'tensorflowMNISTtest' + config = dict( + annotationDir = 'annotationdir', + annotationName = annotation_name, + batchSize = int(np.sqrt(NUM_IMAGES_PER_WSI)), # one row of the wsi at a time + certainty = 'confidence', + cutoff = 600000, # plenty of space to allow all training samples + epochs = NUM_EPOCHS, + exclude = [], + feature = 'patch', + features = 'featuredir', + gensuperpixels = False, + girderApiUrl = 'http://localhost:8080/api/v1', + girderToken = '', + heatmaps = False, + images = 'imagedir', + labels = '', + magnification = 40.0, + modeldir = 
'modeldir', + numWorkers = 1, + patchSize = PATCH_SIZE, + radius = MNIST_IMAGE_SIZE, + randominput = False, + split = 0.7, + train = True, + useCuda = False, + progress = True, + ) + args = argparse.Namespace(**config) + + mnist_labels = ['default', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'] + + items = [] + for i in range(NUM_WSIS): + bboxes = [[x, y, w + x, y + h] for _, (x, y, w, h) in labels[i][['x', 'y', 'w', 'h']].iterrows()] + elem = { + 'girderId': f'test_girder_id{i}', + 'categories': [ + {"label": c} for c in mnist_labels + ], + 'values': [0] + labels[i]['value'].tolist(), + 'user': { + 'bbox': [0,0,1,1] + [item for sublist in bboxes for item in sublist] + }, + 'transform': {'matrix': [[1.0]]} + } + item = { + '_id': f'test_file{i}', + 'name': os.path.basename(tiff_paths[i]), + 'largeImage': {'fileId': f'test_image_id{i}'}, + } + mask_item = { + '_id': f'test_file{i}', + 'name': '.tiff'.join(os.path.basename(tiff_path_pms[i]).split('.tiff')[:-1]), + 'largeImage': {'fileId': f'test_mask_id{i}'}, + } + annotrec = { + '_id': f'test_file{i}', + '_version': 0, + 'annotation': {'name': 'TorchTest'}, + } + items.append((item, annotrec, elem)) + + + gc = MagicMock() + base.getItemsAndAnnotations = MagicMock(return_value=items) + + with tempfile.TemporaryDirectory() as tmpdirname: + def mv_to_dst(req_pth : str, dst : str): + if req_pth.startswith("test_"): + for f in tiff_paths + tiff_path_pms: + dpath = os.path.join(dst, os.path.basename(f)) + if not os.path.exists(dpath) and os.path.basename(f) == os.path.basename(dst): + shutil.copy(f, dst) + print(f"MockDownload: Copied {f} to {dst}") + elif req_pth.startswith("feature"): + feature_files = glob.glob(os.path.join(tmpdirname, "*feature.h5")) + for f in feature_files: + dpath = os.path.join(dst, os.path.basename(f)) + if not os.path.exists(dpath) and os.path.basename(f) == os.path.basename(dst): + shutil.copy(f, dst) + print(f"MockDownload: Copied {f} to {dst}") + elif req_pth.endswith("model"): + model_file = glob.glob(os.path.join(tmpdirname, f"*Model *{0}.h5"))[0] + shutil.copy(model_file, dst) + elif "modtraining" in req_pth: + model_file = glob.glob(os.path.join(tmpdirname, f"*ModTraining *{0}.h5"))[0] + shutil.copy(model_file, dst) + else: + raise RuntimeError(f"Received unknown request path '{req_pth}'") + return {} + + gc.downloadFile = MagicMock(side_effect=mv_to_dst) + def mv_to_src(req, src, reference=None): + shutil.copy(src, tmpdirname) + print(f"MockUpload: Copied {src} to {tmpdirname}") + # each WSI gets two separate .anot files. 
The below if statement gives them unique filenames so we can reference later + if src.endswith(".anot"): + # extract the number at the end of req, which can look like "testfile1" or "testfile1000" + m = re.search(r'(\d+)$', req) + num = int(m.group(1)) + s = os.path.basename(src).replace(".anot", f"_{num}.myanot") + shutil.copy(src, os.path.join(tmpdirname, s)) + print(f"Also copied {s} to {tmpdirname}") + return {'_id': 'feature', 'name': os.path.basename(src)} + gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value=True) + + gc.getItem = MagicMock(return_value=mask_item) + + modelName = f"{annotation_name} Model Epoch 0.h5" + modTrainingName = f"{annotation_name} ModTraining Epoch 0.h5" + gc.listResource = MagicMock(return_value=[dict(name=modelName, _id = 'model'), dict(name=modTrainingName, _id = 'modtraining')]) + gc.uploadFileToItem = MagicMock(side_effect=mv_to_src, return_value=True) + gc.getFolder = MagicMock(return_value=dict(name='test_folder', creatorId='creatorId', _id='test_folder_id')) + + def list_file(req: str, limit: int = 0) -> iter: + if "modtraining" in req: + return iter([dict(name=modTrainingName, _id = 'modtraining')]) + else: + return iter([dict(name=modelName, _id='model')]) + gc.listFile = MagicMock(side_effect=list_file) + + base.main(args, gc) + + for file in sorted(glob.glob(os.path.join(tmpdirname, f"*Predictions*.myanot"))): + assert os.path.exists(file) + with open(file, 'r') as f: + pred_json = json.load(f) + e = pred_json['elements'][0] + assert len(e['values']) == NUM_IMAGES_PER_WSI + 1 + + assert len(e['user']['bbox']) == (NUM_IMAGES_PER_WSI + 1) * 4 # 4 is for x,y,w,h + + assert len(e['categories']) == len(mnist_labels) - 1 # exclude the default category + assert len(e['user']['confidence']) == (NUM_IMAGES_PER_WSI + 1) + + # compare e['values'] to labels['values'], to make sure we've trained a valid model + # the order of the values is shuffled in the annotation file, the ordering is in e['categories'] + file_num = int(file.split('Predictions_')[-1].split('.myanot')[0]) + predicted_labels = np.array([e['categories'][c]['label'] for c in e['values']]) + assert e['values'][0] == 0, "Background should have prediction 0" + matches = (predicted_labels == np.array([e['values'][0]] + list(map(str, labels[file_num]['value'])))) + similarity = matches.sum() / len(matches) + expected_min_accuracy = 0.75 + assert similarity > expected_min_accuracy, f"File {file}: Similarity between predicted values and GT is {similarity}, expected > {expected_min_accuracy}" + print(f"Similarity between predicted values and GT is {similarity}") diff --git a/superpixel_classification/SuperpixelClassification/tests/test_gen_superpixels.py b/superpixel_classification/SuperpixelClassification/tests/test_gen_superpixels.py new file mode 100644 index 0000000..5fc814f --- /dev/null +++ b/superpixel_classification/SuperpixelClassification/tests/test_gen_superpixels.py @@ -0,0 +1,164 @@ +import os +import shutil +import sys +import tempfile +from unittest.mock import MagicMock + +import h5py +import large_image +import numpy as np +import pytest +from PIL.Image import Image +from tifffile import tifffile + +# make pythonpath work out of the box - although your editor may complain +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(os.path.dirname(SCRIPT_DIR)) + +from SuperpixelClassificationBase import SuperpixelClassificationBase +from progress_helper import ProgressHelper +from tests.generate_MNIST_image import create_mnist_image + +from 
xdg_base_dirs import ( xdg_cache_home, ) + +NUM_IMAGES : int = 9 +IMAGE_SIZE : int = 16 # 16 is the smallest tile size for .TIFFs, although we could operate within a single tile, too. +COLOR_DIM = 3 + + +def d_to_rgb(d): + r = d & 0xFF + g = (d >> 8) & 0xFF + b = (d >> 16) & 0xFF + return [r, g, b] + +@pytest.fixture(scope="session") +def create_sample_data(): + ''' + Create a sample WSI for testing. + ''' + global NUM_IMAGES, IMAGE_SIZE + num_images = NUM_IMAGES + with tempfile.TemporaryDirectory() as tmpdirname: + output_filename = os.path.join(tmpdirname, "test.tiff") + + if os.path.dirname(output_filename): + os.makedirs(os.path.dirname(output_filename), exist_ok=True) + if os.path.exists(output_filename): + os.remove(output_filename) + + # Arrange the images in a grid (so num_images should be a number with an integer root) + tile_rows, tile_cols = int(np.sqrt(num_images)), int(np.sqrt(num_images)) + tile_h, tile_w = 16, 16 + grid_h, grid_w = tile_rows * tile_h, tile_cols * tile_w + base_image = np.zeros((grid_h, grid_w, 3), dtype=np.uint8) + + vals = np.array([0, 127, 255], dtype=np.uint8) + colors = np.stack(np.meshgrid(vals, vals, vals), axis=-1).reshape(-1, 3)[:NUM_IMAGES] + images = np.tile(colors[:, None, None, :], (1, IMAGE_SIZE, IMAGE_SIZE, 1)) + + for idx, img in enumerate(images): + r = idx // tile_cols + c = idx % tile_cols + base_image[r*tile_h:(r+1)*tile_h, c*tile_w:(c+1)*tile_w, :] = img + + pyramid = [base_image] + current = base_image.copy() + while min(current.shape) >= 64: + # Use Pillow to resize (ANTIALIAS gives good quality downsampling) + im = Image.fromarray(current) + new_w, new_h = current.shape[1] // 2, current.shape[0] // 2 + if new_w < 1 or new_h < 1: + break + im_resized = im.resize((new_w, new_h)) + current = np.array(im_resized) + pyramid.append(current) + + # Save the image as a pyramidal TIFF. + # The base image is the main image and the pyramid list (excluding the base) is saved as subIFDs. 
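+        # (writing the reduced-resolution levels as subIFDs is what lets tile readers
+        # such as large_image treat the file as a multi-resolution whole-slide image)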
+ if os.path.dirname(output_filename): + os.makedirs(os.path.dirname(output_filename), exist_ok=True) + if os.path.exists(output_filename): + os.remove(output_filename) + + with tifffile.TiffWriter(output_filename, bigtiff=False) as tif: + tif.write(pyramid[0], + tile=(tile_w * 4, tile_h * 4), + photometric='RGB', + description='Whole-slide MNIST image at 40x magnification', + subifds=pyramid[1:]) + print(f"Pyramidal TIFF saved as {output_filename}") + + # we use yield so that the temporarydirectory is still open in the tests + yield output_filename, images + +def test_gen_superpixel(create_sample_data): + global IMAGE_SIZE, COLOR_DIM + test_image_pth, test_images = create_sample_data + base = SuperpixelClassificationBase() + + # Create test data + item = { + "_id": "test_item_id", + 'largeImage': {'fileId': 'test_image_id'}, + 'name': test_image_pth, + } + + # Mock girder client + gc = MagicMock() + def mv_to_dst(_, dst): + if not os.path.exists(os.path.join(dst, test_image_pth)): + shutil.copy(test_image_pth, dst) + print(">>> Copied file from", test_image_pth, "to", dst) + return None + gc.downloadFile = MagicMock(side_effect=mv_to_dst) + gc.getItem = MagicMock(return_value={'name': test_image_pth, 'largeImage': {'fileId': 'foobar'}}) + def mv_to_src(_, src): + dst = os.path.dirname(test_image_pth) + if not os.path.exists(os.path.join(dst, src)): + shutil.copy(src, dst) + print(">>> Copied file from", src, "to", dst) + return {'itemId': 'uploaded_item_id'} + gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value={'_id': 'test_file_id'}) + #gc.uploadFileToFolder = MagicMock(return_value={'_id': 'test_file_id'}) + + #bboxes = [[x, y, w + x, y + h] for _, (x, y, w, h) in labels[['x', 'y', 'w', 'h']].iterrows()] + bboxes = [[x, x, x + IMAGE_SIZE, x + IMAGE_SIZE] for x in range(0, NUM_IMAGES, IMAGE_SIZE)] + + with ProgressHelper( 'Superpixel Classification', + 'Test feature', False) as prog: + prog.progress(0) + prog.items([item]) + result = base.createSuperpixelsForItem( + gc=gc, + annotationName="TorchTest", + item=item, + radius=IMAGE_SIZE, + magnification=40, + annotationFolderId='annotation_folder_id', + userId="user_id", + prog=prog, + ) + + out_pixelmap_file = os.path.join(os.path.dirname(test_image_pth), '%s.pixelmap.tiff' % item['name']) + assert os.path.exists(out_pixelmap_file), f"Output file {out_pixelmap_file} does not exist" + x, y, x2, y2 = 0, 0, IMAGE_SIZE, IMAGE_SIZE + ts = large_image.getTileSource(test_image_pth) + orig_image = ts.getRegion( + region=dict(left=x, top=y, right=x2, bottom=y2), + format=large_image.tilesource.TILE_FORMAT_NUMPY + )[0] + # test that all values in orig_image is equal to 1 + # TODO: waiting for another PR: want this to be 1 + assert np.all(orig_image == 0) + + feature_img = test_images[-1] + x, y, x2, y2 = IMAGE_SIZE * (IMAGE_SIZE - 1), IMAGE_SIZE * (IMAGE_SIZE - 1), IMAGE_SIZE * IMAGE_SIZE, IMAGE_SIZE * IMAGE_SIZE + ts = large_image.getTileSource(test_image_pth) + orig_image = ts.getRegion( + region=dict(left=x, top=y, right=x2, bottom=y2), + format=large_image.tilesource.TILE_FORMAT_NUMPY + )[0] + orig_image = orig_image.astype(feature_img.dtype) + # TODO: same as TODO above + assert np.all(orig_image == NUM_IMAGES - 1) \ No newline at end of file diff --git a/superpixel_classification/SuperpixelClassification/tests/test_predict.py b/superpixel_classification/SuperpixelClassification/tests/test_predict.py new file mode 100644 index 0000000..9341a90 --- /dev/null +++ 
b/superpixel_classification/SuperpixelClassification/tests/test_predict.py
@@ -0,0 +1,254 @@
+import json
+import os
+import shutil
+import tempfile
+from unittest.mock import MagicMock
+
+import h5py
+import numpy as np
+import pytest
+import torch
+
+# make pythonpath work out of the box - although your editor may complain
+import sys
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(os.path.dirname(SCRIPT_DIR))
+
+from SuperpixelClassificationBase import SuperpixelClassificationBase
+from SuperpixelClassificationTorch import SuperpixelClassificationTorch, _BayesianPatchTorchModel
+from progress_helper import ProgressHelper
+from tests.validate_json_annotation import validate_json_file
+
+# currently, torch model only supports 100x100
+MNIST_IMAGE_SIZE = 100
+COLOR_DIM = 3
+NUM_IMAGES = 64
+CUTOFF_IMAGES = 2
+
+@pytest.fixture(scope="session")
+def create_sample_data():
+    global NUM_IMAGES, CUTOFF_IMAGES
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        h5_path = os.path.join(tmpdirname, "test_data.h5")
+
+        images = np.random.randint(0, 255, size=(NUM_IMAGES - CUTOFF_IMAGES, MNIST_IMAGE_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM), dtype=np.uint8)
+        indices = np.arange(NUM_IMAGES - CUTOFF_IMAGES)
+        assert images.shape[0] == indices.shape[0]
+
+        with h5py.File(h5_path, 'w') as f:
+            f.create_dataset('images', data=images)
+            f.create_dataset('used_indices', data=indices, dtype='i')
+
+        # we use yield so that the TemporaryDirectory is still open in the tests
+        yield h5_path
+
+'''
+This test checks predictions on a dataset that is labeled with only two of the ten categories.
+'''
+def test_subset_labels(create_sample_data):
+    global NUM_IMAGES, CUTOFF_IMAGES
+    h5_path = create_sample_data
+    base: SuperpixelClassificationBase = SuperpixelClassificationTorch()
+    base.certainty = 'batchbald'
+    base.feature_is_image = True
+    # Mock girder client
+    gc = MagicMock()
+    def mv_to_dst(_, dst):
+        return shutil.copy(h5_path, dst)
+    gc.downloadFile = MagicMock(side_effect=mv_to_dst)
+    gc.uploadFileToItem = MagicMock()
+
+    feature = {
+        '_id': '0',
+        'name': 'my_test_feature'
+    }
+    labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+    annotrec = {
+        'annotation': {
+            'attributes': {},
+            'name': 'TorchTest',
+        },
+    }
+
+    # make a list alternating the values 1 and 3, with NUM_IMAGES entries
+    value_list = [1, 3] * (NUM_IMAGES // 2)
+
+    elem = {
+        "type": "pixelmap",
+        "girderId": "6838aab654f0ca783ff03871",
+        "transform": {"matrix": [[1.0, 0], [0, 1.0]]},
+        'values': value_list,
+        'categories': [{"label": k, "fillColor": "rgba(0,0,0,0)"} for k in labels],
+        "boundaries": True,
+        "id": "myid",
+        'user': { },
+    }
+
+    groups = { k: {"label": k, "fillColor": "rgba(0,0,0,0)", "strokeColor": "rgba(0,0,0,0)" } for k in labels }
+
+    device = torch.device("cpu")
+    model = _BayesianPatchTorchModel(len(labels), device)
+    model.device = device
+
+    items = [(feature, annotrec, elem)]
+    item = {'_id': 0, 'name': 'my_item', 'largeImage': {'fileId': 'test_image_id'}}
+    with ProgressHelper('Superpixel Classification',
+                        'Test feature', False) as prog:
+        prog.progress(0)
+        prog.items(items)
+
+        annotation_name = 'testannotation'
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            base.predictLabelsForItem(
+                gc=gc,
+                annotationName=annotation_name,
+                tempdir=tmpdirname,
+                model=model,
+                item=item,
+                annotrec=annotrec,
+                elem=elem,
+                feature=feature,
+                curEpoch=0,
+                userId='user_id',
+                labels=labels,
+                groups=groups,
+                makeHeatmaps=False,
+                radius=-1,
magnification=40.0, + certainty='batchbald', + batchSize=NUM_IMAGES, + use_cuda = False, + prog=prog, + ) + out_pth = os.path.join(tmpdirname, '%s Epoch 0 Predictions.anot' % annotation_name) + assert os.path.exists(out_pth), "Output file %s does not exist" % out_pth + with open(out_pth, 'r') as f: + pred_json = json.load(f) + e = pred_json['elements'][0] + assert len(e['values']) == NUM_IMAGES + for i in range(1, CUTOFF_IMAGES): + assert e['values'][-i] == 0, "Expected unknown/none label for cutoff images" + assert len(e['categories']) == len(labels) + assert len(e['user']['confidence']) == NUM_IMAGES + assert len(e['user']['categoryConfidence']) == NUM_IMAGES + assert len(e['user']['categoryConfidence'][0]) == len(labels) + assert len(e['user']['certainty']) == NUM_IMAGES + for i in range(1, CUTOFF_IMAGES): + assert e['user']['certainty'][-i] > 10000, "Expected certainty to be very high for unlabeled samples to ensure they occur last in the AL filmstrip (DSA)" + assert 'percentiles' in e['user']['certainty_info'] + assert 'cdf' in e['user']['certainty_info'] + + validate_json_file(out_pth) + + out_pth = os.path.join(tmpdirname, '%s Epoch 1.anot' % annotation_name) + assert os.path.exists(out_pth), "Output file %s does not exist" % out_pth + with open(out_pth, 'r') as f: + annotation_file = json.load(f) + e = annotation_file['elements'][0] + assert len(e['values']) == NUM_IMAGES + assert len(e['categories']) == len(labels) + + validate_json_file(out_pth) + +def test_predict_unlabeled_with_cutoff(create_sample_data): + global NUM_IMAGES, CUTOFF_IMAGES + h5_path = create_sample_data + base: SuperpixelClassificationBase = SuperpixelClassificationTorch() + base.certainty = 'batchbald' + base.feature_is_image = True + # Mock girder client + gc = MagicMock() + def mv_to_dst(_, dst): + return shutil.copy(h5_path, dst) + gc.downloadFile = MagicMock(side_effect=mv_to_dst) + gc.uploadFileToItem = MagicMock() + + feature = { + '_id': '0', + 'name': 'my_test_feature' + } + labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] + annotrec = { + 'annotation': { + 'attributes': {}, + 'name': 'TorchTest', + }, + } + + elem = { + "type": "pixelmap", + "girderId": "6838aab654f0ca783ff03871", + "transform": {"matrix": [[1.0, 0], [0, 1.0]]}, + 'values': [0] * NUM_IMAGES, + 'categories' : [{"label": k, "fillColor": "rgba(0,0,0,0)"} for k in labels], + "boundaries": True, + "id": "myid", + 'user': { }, + } + + groups = { k: {"label": k, "fillColor": "rgba(0,0,0,0)", "strokeColor": "rgba(0,0,0,0)" } for k in labels } + + device = torch.device("cpu") + model = _BayesianPatchTorchModel(len(labels), device) + model.device = device + + items = [(feature, annotrec, elem)] + item = {'_id': 0, 'name': 'my_item', 'largeImage': {'fileId': 'test_image_id'}} + with ProgressHelper( 'Superpixel Classification', + 'Test feature', False) as prog: + prog.progress(0) + prog.items(items) + + annotation_name = 'testannotation' + with tempfile.TemporaryDirectory() as tmpdirname: + base.predictLabelsForItem( + gc=gc, + annotationName=annotation_name, + tempdir=tmpdirname, + model=model, + item=item, + annotrec=annotrec, + elem=elem, + feature=feature, + curEpoch=0, + userId='user_id', + labels=labels, + groups=groups, + makeHeatmaps=False, + radius=-1, + magnification=40.0, + certainty='batchbald', + batchSize=NUM_IMAGES, + use_cuda = False, + prog=prog, + ) + out_pth = os.path.join(tmpdirname, '%s Epoch 0 Predictions.anot' % annotation_name) + assert os.path.exists(out_pth), "Output file %s does not exist" % out_pth + with 
open(out_pth, 'r') as f: + pred_json = json.load(f) + e = pred_json['elements'][0] + assert len(e['values']) == NUM_IMAGES + for i in range(1, CUTOFF_IMAGES): + assert e['values'][-i] == 0, "Expected unknown/none label for cutoff images" + assert len(e['categories']) == len(labels) + assert len(e['user']['confidence']) == NUM_IMAGES + assert len(e['user']['categoryConfidence']) == NUM_IMAGES + assert len(e['user']['categoryConfidence'][0]) == len(labels) + assert len(e['user']['certainty']) == NUM_IMAGES + for i in range(1, CUTOFF_IMAGES): + assert e['user']['certainty'][-i] > 10000, "Expected certainty to be very high for unlabeled samples to ensure they occur last in the AL filmstrip (DSA)" + assert 'percentiles' in e['user']['certainty_info'] + assert 'cdf' in e['user']['certainty_info'] + + validate_json_file(out_pth) + + out_pth = os.path.join(tmpdirname, '%s Epoch 1.anot' % annotation_name) + assert os.path.exists(out_pth), "Output file %s does not exist" % out_pth + with open(out_pth, 'r') as f: + annotation_file = json.load(f) + e = annotation_file['elements'][0] + assert len(e['values']) == NUM_IMAGES + assert len(e['categories']) == len(labels) + + validate_json_file(out_pth) diff --git a/superpixel_classification/SuperpixelClassification/tests/test_tensorflow.py b/superpixel_classification/SuperpixelClassification/tests/test_tensorflow.py new file mode 100644 index 0000000..1a40365 --- /dev/null +++ b/superpixel_classification/SuperpixelClassification/tests/test_tensorflow.py @@ -0,0 +1,93 @@ +import os +import shutil +import tempfile +from unittest.mock import MagicMock + +import h5py +import numpy as np +import pytest + +# make pythonpath work out of the box - although your editor may complain +import sys +import os +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(os.path.dirname(SCRIPT_DIR)) + +from SuperpixelClassificationBase import SuperpixelClassificationBase +from SuperpixelClassificationTensorflow import SuperpixelClassificationTensorflow +from progress_helper import ProgressHelper + +MNIST_IMAGE_SIZE=28 +COLOR_DIM = 3 +NUM_IMAGES = 64 + +@pytest.fixture(scope="session") +def create_sample_data(): + global NUM_IMAGES + with tempfile.TemporaryDirectory() as tmpdirname: + h5_path = os.path.join(tmpdirname, "test_data.h5") + images = np.random.randint(0, 255, size=(NUM_IMAGES, MNIST_IMAGE_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM), dtype=np.uint8) + + with h5py.File(h5_path, 'w') as f: + f.create_dataset('images', data=images) + f.create_dataset('used_indices', data=np.arange(NUM_IMAGES - 2)) + + # we use yield so that that the temporarydirectory is still open in the tests + yield h5_path + +def test_train_model(create_sample_data): + global NUM_IMAGES + h5_path = create_sample_data + base: SuperpixelClassificationBase + base = SuperpixelClassificationTensorflow() + base.feature_is_image = True + base.certainty = 'not batchbald' # same as using tensorflow + + # Mock girder client + gc = MagicMock() + def mv_to_dst(_, dst): + return shutil.copy(h5_path, dst) + gc.downloadFile = MagicMock(side_effect=mv_to_dst) + def mv_to_src(_, src): + dst = os.path.dirname(os.path.dirname(h5_path)) + return shutil.copy(src, dst) + gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value=True) + + labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] + elem = { + 'girderId': 'test_girder_id', + 'categories': [ + {"label": c} for c in labels + ], + 'values': + [] \ + + np.random.randint(1, len(labels) - 1, size=(NUM_IMAGES - 2), dtype=np.uint8).tolist() 
+ + [0, 0], # last two images unlabeled + 'transform': {'matrix': [[1.0]]} + } + + item = {'_id': 'test_h5_file', 'name': 'test'} + annotrec = {'_id': '1', '_version': 0, 'annotation': {'name': 'TorchTest'}} + items = [(item, annotrec, elem)] + with ProgressHelper( 'Superpixel Classification', + 'Test feature', False) as prog: + prog.progress(0) + prog.items(items) + modelFile, modelTrainingFile = base.trainModel( + annotationName="TorchTest", + batchSize = 4, + epochs = 1, + excludeLabelList = [], + features={'test_h5_file': {'_id': 'feature_id', 'name': 'test_h5_file'}}, + gc=gc, + itemsAndAnnot=items, + labelList = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], + modelFolderId="test_folder_id", + prog=prog, + randomInput = False, + trainingSplit = 0.5, + use_cuda = False, + ) + + assert os.path.exists(modelFile) + assert os.path.exists(modelTrainingFile) \ No newline at end of file diff --git a/superpixel_classification/SuperpixelClassification/tests/test_torch.py b/superpixel_classification/SuperpixelClassification/tests/test_torch.py new file mode 100644 index 0000000..edb7dbc --- /dev/null +++ b/superpixel_classification/SuperpixelClassification/tests/test_torch.py @@ -0,0 +1,94 @@ +import os +import shutil +import tempfile +from unittest.mock import MagicMock + +import h5py +import numpy as np +import pytest + +# make pythonpath work out of the box - although your editor may complain +import sys +import os +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(os.path.dirname(SCRIPT_DIR)) + +from SuperpixelClassificationBase import SuperpixelClassificationBase +from SuperpixelClassificationTorch import SuperpixelClassificationTorch +from progress_helper import ProgressHelper + +# currently, torch model only supports 100x100 +MNIST_IMAGE_SIZE=100 +COLOR_DIM = 3 +NUM_IMAGES = 64 + +@pytest.fixture(scope="session") +def create_sample_data(): + global NUM_IMAGES + with tempfile.TemporaryDirectory() as tmpdirname: + h5_path = os.path.join(tmpdirname, "test_data.h5") + images = np.random.randint(0, 255, size=(NUM_IMAGES, MNIST_IMAGE_SIZE, MNIST_IMAGE_SIZE, COLOR_DIM), dtype=np.uint8) + + with h5py.File(h5_path, 'w') as f: + f.create_dataset('images', data=images) + f.create_dataset('used_indices', data=np.arange(NUM_IMAGES - 2)) + + # we use yield so that that the temporarydirectory is still open in the tests + yield h5_path + +def test_train_model(create_sample_data): + global NUM_IMAGES + h5_path = create_sample_data + base: SuperpixelClassificationBase + base = SuperpixelClassificationTorch() + base.feature_is_image = True + base.certainty = 'batchbald' # same as using torch + + # Mock girder client + gc = MagicMock() + def mv_to_dst(_, dst): + return shutil.copy(h5_path, dst) + gc.downloadFile = MagicMock(side_effect=mv_to_dst) + def mv_to_src(_, src): + dst = os.path.dirname(os.path.dirname(h5_path)) + return shutil.copy(src, dst) + gc.uploadFileToFolder = MagicMock(side_effect=mv_to_src, return_value=True) + + labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] + elem = { + 'girderId': 'test_girder_id', + 'categories': [ + {"label": c} for c in labels + ], + 'values': + [] \ + + np.random.randint(1, len(labels) - 1, size=(NUM_IMAGES - 2), dtype=np.uint8).tolist() + + [0, 0], # last two images unlabeled + 'transform': {'matrix': [[1.0]]} + } + + item = {'_id': 'test_h5_file', 'name': 'test'} + annotrec = {'_id': '1', '_version': 0, 'annotation': {'name': 'TorchTest'}} + items = [(item, annotrec, elem)] + with ProgressHelper( 'Superpixel 
Classification', + 'Test feature', False) as prog: + prog.progress(0) + prog.items(items) + modelFile, modelTrainingFile = base.trainModel( + annotationName="TorchTest", + batchSize = 4, + epochs = 1, + excludeLabelList = [], + features={'test_h5_file': {'_id': 'feature_id', 'name': 'test_h5_file'}}, + gc=gc, + itemsAndAnnot=items, + labelList = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], + modelFolderId="test_folder_id", + prog=prog, + randomInput = False, + trainingSplit = 0.5, + use_cuda = True, + ) + + assert os.path.exists(modelFile) + assert os.path.exists(modelTrainingFile) diff --git a/superpixel_classification/SuperpixelClassification/tests/validate_json_annotation.py b/superpixel_classification/SuperpixelClassification/tests/validate_json_annotation.py new file mode 100644 index 0000000..5f209ce --- /dev/null +++ b/superpixel_classification/SuperpixelClassification/tests/validate_json_annotation.py @@ -0,0 +1,588 @@ +#!/usr/bin/env python +''' +This code is similar to girder_annotation/girder_large_image_annotation/models/annotation.py +The meaning is to validate the json annotation file without having to use girder or large_image +''' +import argparse +import json +import logging +import os +import sys +import jsonschema +from tqdm import tqdm + +import copy + +def extendSchema(base, add): + extend = copy.deepcopy(base) + for key in add: + if key == 'required' and 'required' in base: + extend[key] = sorted(set(extend[key]) | set(add[key])) + elif key != 'properties' and 'properties' in base: + extend[key] = add[key] + if 'properties' in add: + extend['properties'].update(add['properties']) + return extend + + +colorSchema = { + 'type': 'string', + # We accept colors of the form + # #rrggbb six digit RRGGBB hex + # #rgb three digit RGB hex + # #rrggbbaa eight digit RRGGBBAA hex + # #rgba four digit RGBA hex + # rgb(255, 255, 255) rgb decimal triplet + # rgba(255, 255, 255, 1) rgba quad with RGB in the range [0-255] and + # alpha [0-1] + 'pattern': r'^(#([0-9a-fA-F]{3,4}|[0-9a-fA-F]{6}|[0-9a-fA-F]{8})|' + r'rgb\(\d+,\s*\d+,\s*\d+\)|' + r'rgba\(\d+,\s*\d+,\s*\d+,\s*(\d?\.|)\d+\))$', +} + +transformArray = { + 'type': 'array', + 'items': { + 'type': 'array', + 'minItems': 2, + 'maxItems': 2, + }, + 'minItems': 2, + 'maxItems': 2, + 'description': 'A 2D matrix representing the transform of an ' + 'image overlay.', +} + + +colorRangeSchema = { + 'type': 'array', + 'items': colorSchema, + 'description': 'A list of colors', +} + +rangeValueSchema = { + 'type': 'array', + 'items': {'type': 'number'}, + 'description': 'A weakly monotonic list of range values', +} + +userSchema = { + 'type': 'object', + 'additionalProperties': True, +} + +labelSchema = { + 'type': 'object', + 'properties': { + 'value': {'type': 'string'}, + 'visibility': { + 'type': 'string', + # TODO: change to True, False, None? 
+ 'enum': ['hidden', 'always', 'onhover'], + }, + 'fontSize': { + 'type': 'number', + 'exclusiveMinimum': 0, + }, + 'color': colorSchema, + }, + 'required': ['value'], + 'additionalProperties': False, +} + +groupSchema = {'type': 'string'} + +baseElementSchema = { + 'type': 'object', + 'properties': { + 'id': { + 'type': 'string', + 'pattern': '^[0-9a-f]{24}$', + }, + 'type': {'type': 'string'}, + # schema free field for users to extend annotations + 'user': userSchema, + 'label': labelSchema, + 'group': groupSchema, + }, + 'required': ['type'], + 'additionalProperties': True, +} +baseShapeSchema = extendSchema(baseElementSchema, { + 'properties': { + 'lineColor': colorSchema, + 'lineWidth': { + 'type': 'number', + 'minimum': 0, + }, + }, +}) + + +pixelmapCategorySchema = { + 'type': 'object', + 'properties': { + 'fillColor': colorSchema, + 'strokeColor': colorSchema, + 'label': { + 'type': 'string', + 'description': 'A string representing the semantic ' + 'meaning of regions of the map with ' + 'the corresponding color.', + }, + 'description': { + 'type': 'string', + 'description': 'A more detailed explanation of the ' + 'meaining of this category.', + }, + }, + 'required': ['fillColor'], + 'additionalProperties': False, +} + +_annotationSchema = { + 'type': 'object', + 'properties': { + 'value': colorSchema, + 'id': colorSchema, + 'label': { + 'type': 'string', + 'description': 'A string representing the semantic ' + 'meaning of regions of the map with ' + 'the corresponding color.', + }, + 'description': { + 'type': 'string', + 'description': 'A more detailed explanation of the ' + 'meaining of this category.', + }, + }, + 'required': ['fillColor'], + 'additionalProperties': False, +} + + +overlaySchema = extendSchema(baseElementSchema, { + 'properties': { + 'type': { + 'type': 'string', + 'enum': ['image'], + }, + 'girderId': { + 'type': 'string', + 'pattern': '^[0-9a-f]{24}$', + 'description': 'Girder item ID containing the image to ' + 'overlay.', + }, + 'opacity': { + 'type': 'number', + 'minimum': 0, + 'maximum': 1, + 'description': 'Default opacity for this image overlay. Must ' + 'be between 0 and 1. Defaults to 1.', + }, + 'hasAlpha': { + 'type': 'boolean', + 'description': + 'If true, the image is treated assuming it has an alpha ' + 'channel.', + }, + 'transform': { + 'type': 'object', + 'description': 'Specification for an affine transform of the ' + 'image overlay. Includes a 2D transform matrix, ' + 'an X offset and a Y offset.', + 'properties': { + 'xoffset': { + 'type': 'number', + }, + 'yoffset': { + 'type': 'number', + }, + 'matrix': transformArray, + }, + }, + }, + 'required': ['girderId', 'type'], + 'additionalProperties': False, + 'description': 'An image overlay on top of the base resource.', +}) + + +pixelmapSchema = extendSchema(overlaySchema, { + 'properties': { + 'type': { + 'type': 'string', + 'enum': ['pixelmap'], + }, + 'values': { + 'type': 'array', + 'items': {'type': 'integer'}, + 'description': 'An array where the indices ' + 'correspond to pixel values in the ' + 'pixel map image and the values are ' + 'used to look up the appropriate ' + 'color in the categories property.', + }, + 'categories': { + 'type': 'array', + 'items': pixelmapCategorySchema, + 'description': 'An array used to map between the ' + 'values array and color values. 
' + 'Can also contain semantic ' + 'information for color values.', + }, + 'boundaries': { + 'type': 'boolean', + 'description': 'True if the pixelmap doubles pixel ' + 'values such that even values are the ' + 'fill and odd values the are stroke ' + 'of each superpixel. If true, the ' + 'length of the values array should be ' + 'half of the maximum value in the ' + 'pixelmap.', + + }, + }, + 'required': ['values', 'categories', 'boundaries'], + 'additionalProperties': False, + 'description': 'A tiled pixelmap to overlay onto a base resource.', +}) + +bboxSchema = extendSchema(overlaySchema, { + 'properties': { + 'type': { + 'type': 'string', + 'enum': ['bboxmap'], + }, + 'categories': { + 'type': 'array', + 'items': pixelmapCategorySchema, + 'description': 'An array used to map between the ' + 'values array and color values. ' + 'Can also contain semantic ' + 'information for color values.', + }, + 'annotations': { + 'type': 'array', + 'description': 'Value, id, and bounding box for each annotation', + 'items': { + 'type': 'object', + 'additionalProperties': False, + 'properties': { + 'value': { + 'type': 'integer', + }, + 'id': { + 'type': 'integer', + }, + 'bbox': { + 'type': 'array', + 'items': {'type': 'number'}, + 'minItems': 4, + 'maxItems': 4, + 'description': 'Bounding box in the form ' + '[left, top, right, bottom].', + }, + } + } + }, + 'boundaries': { + 'type': 'boolean', + 'description': 'True if the pixelmap doubles pixel ' + 'values such that even values are the ' + 'fill and odd values the are stroke ' + 'of each superpixel. If true, the ' + 'length of the values array should be ' + 'half of the maximum value in the ' + 'pixelmap.', + + }, + }, + 'required': ['categories', 'boundaries', 'annotations'], + 'additionalProperties': True, + 'description': 'A tiled pixelmap to overlay onto a base resource.', +}) + +annotationElementSchema = { + # Shape subtypes are mutually exclusive, so for efficiency, don't use + # 'oneOf' + 'anyOf': [ + pixelmapSchema, + bboxSchema, + ], +} + + +class AnnotationSchema: + annotationSchema = { + '$schema': 'http://json-schema.org/schema#', + 'type': 'object', + 'properties': { + 'name': { + 'type': 'string', + # TODO: Disallow empty? + 'minLength': 1, + }, + 'description': {'type': 'string'}, + 'display': { + 'type': 'object', + 'properties': { + 'visible': { + 'type': ['boolean', 'string'], + 'enum': ['new', True, False], + 'description': 'This advises viewers on when the ' + 'annotation should be shown. If "new" (the default), ' + 'show the annotation when it is first added to the ' + "system. If false, don't show the annotation by " + 'default. If true, show the annotation when the item ' + 'is displayed.', + }, + }, + }, + 'attributes': { + 'type': 'object', + 'additionalProperties': True, + 'title': 'Image Attributes', + 'description': 'Subjective things that apply to the entire ' + 'image.', + }, + 'elements': { + 'type': 'array', + 'items': annotationElementSchema, + # We want to ensure unique element IDs, if they are set. If + # they are not set, we assign them from Mongo. 
+ 'title': 'Image Markup', + 'description': 'Subjective things that apply to a ' + 'spatial region.', + }, + }, + 'additionalProperties': False, + } + + + + coordSchema = { + 'type': 'array', + # TODO: validate that z==0 for now + 'items': { + 'type': 'number', + }, + 'minItems': 3, + 'maxItems': 3, + 'name': 'Coordinate', + # TODO: define origin for 3D images + 'description': 'An X, Y, Z coordinate tuple, in base layer pixel ' + 'coordinates, where the origin is the upper-left.', + } + coordValueSchema = { + 'type': 'array', + 'items': { + 'type': 'number', + }, + 'minItems': 4, + 'maxItems': 4, + 'name': 'CoordinateWithValue', + 'description': 'An X, Y, Z, value coordinate tuple, in base layer ' + 'pixel coordinates, where the origin is the upper-left.', + } + + colorSchema = { + 'type': 'string', + # We accept colors of the form + # #rrggbb six digit RRGGBB hex + # #rgb three digit RGB hex + # #rrggbbaa eight digit RRGGBBAA hex + # #rgba four digit RGBA hex + # rgb(255, 255, 255) rgb decimal triplet + # rgba(255, 255, 255, 1) rgba quad with RGB in the range [0-255] and + # alpha [0-1] + # TODO: make rgb and rgba spec validate that rgb is [0-255] and a is + # [0-1], rather than just checking if they are digits and such. + 'pattern': r'^(#([0-9a-fA-F]{3,4}|[0-9a-fA-F]{6}|[0-9a-fA-F]{8})|' + r'rgb\(\d+,\s*\d+,\s*\d+\)|' + r'rgba\(\d+,\s*\d+,\s*\d+,\s*(\d?\.|)\d+\))$', + } + + colorRangeSchema = { + 'type': 'array', + 'items': colorSchema, + 'description': 'A list of colors', + } + + rangeValueSchema = { + 'type': 'array', + 'items': {'type': 'number'}, + 'description': 'A weakly monotonic list of range values', + } + + userSchema = { + 'type': 'object', + 'additionalProperties': True, + } + + labelSchema = { + 'type': 'object', + 'properties': { + 'value': {'type': 'string'}, + 'visibility': { + 'type': 'string', + # TODO: change to True, False, None? 
+ 'enum': ['hidden', 'always', 'onhover'], + }, + 'fontSize': { + 'type': 'number', + 'exclusiveMinimum': 0, + }, + 'color': colorSchema, + }, + 'required': ['value'], + 'additionalProperties': False, + } + + groupSchema = {'type': 'string'} + + baseElementSchema = { + 'type': 'object', + 'properties': { + 'id': { + 'type': 'string', + 'pattern': '^[0-9a-f]{24}$', + }, + 'type': {'type': 'string'}, + # schema free field for users to extend annotations + 'user': userSchema, + 'label': labelSchema, + 'group': groupSchema, + }, + 'required': ['type'], + 'additionalProperties': True, + } + baseShapeSchema = extendSchema(baseElementSchema, { + 'properties': { + 'lineColor': colorSchema, + 'lineWidth': { + 'type': 'number', + 'minimum': 0, + }, + }, + }) + + pointShapeSchema = extendSchema(baseShapeSchema, { + 'properties': { + 'type': { + 'type': 'string', + 'enum': ['point'], + }, + 'center': coordSchema, + 'fillColor': colorSchema, + }, + 'required': ['type', 'center'], + 'additionalProperties': False, + }) + + arrowShapeSchema = extendSchema(baseShapeSchema, { + 'properties': { + 'type': { + 'type': 'string', + 'enum': ['arrow'], + }, + 'points': { + 'type': 'array', + 'items': coordSchema, + 'minItems': 2, + 'maxItems': 2, + }, + 'fillColor': colorSchema, + }, + 'description': 'The first point is the head of the arrow', + 'required': ['type', 'points'], + 'additionalProperties': False, + }) + + circleShapeSchema = extendSchema(baseShapeSchema, { + 'properties': { + 'type': { + 'type': 'string', + 'enum': ['circle'], + }, + 'center': coordSchema, + 'radius': { + 'type': 'number', + 'minimum': 0, + }, + 'fillColor': colorSchema, + }, + 'required': ['type', 'center', 'radius'], + 'additionalProperties': False, + }) + + polylineShapeSchema = extendSchema(baseShapeSchema, { + 'properties': { + 'type': { + 'type': 'string', + 'enum': ['polyline'], + }, + 'points': { + 'type': 'array', + 'items': coordSchema, + 'minItems': 2, + }, + 'fillColor': colorSchema, + 'closed': { + 'type': 'boolean', + 'description': 'polyline is open if closed flag is ' + 'not specified', + }, + 'holes': { + 'type': 'array', + 'description': + 'If closed is true, this is a list of polylines that are ' + 'treated as holes in the base polygon. 
These should not '
+                'cross each other and should be contained within the base '
+                'polygon.',
+            'items': {
+                'type': 'array',
+                'items': coordSchema,
+                'minItems': 3,
+            },
+        },
+    },
+    'required': ['type', 'points'],
+    'additionalProperties': False,
+})
+
+
+def validate_annotation(annotation_dict):
+    validator = jsonschema.Draft6Validator(AnnotationSchema.annotationSchema)
+    validatorElement = jsonschema.Draft6Validator(AnnotationSchema.baseElementSchema)
+
+    validator.validate(annotation_dict)
+    for element in tqdm(annotation_dict['elements']):
+        validatorElement.validate(element)
+
+def validate_json_file(json_dst):
+    with open(json_dst, 'r') as f:
+        data = json.load(f)
+    validate_annotation(data)
+    # num_elem = len(data['elements'][0]['annotations'])
+    # if num_elem % 4 != 0:
+    #     raise ValueError(f"Number of elements ({num_elem}) is not a multiple of 4")
+    # num_values = len(data['elements'][0]['annotations'])
+    # if int(num_elem / 4) != num_values:
+    #     raise ValueError(f"Number of elements ({num_elem / 4}) does not match values ({num_values})")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Validate a json annotation file')
+    parser.add_argument('--input', default=os.path.join("out", "superpixel.anot"), type=str,
+                        help='Name of input json file with a pixelmap annotation')
+    args = parser.parse_args()
+    # Validate the given annotation file
+    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+    if not os.path.exists(args.input):
+        logging.error(f"Annotation path {args.input} does not exist")
+        sys.exit(1)
+
+    validate_json_file(args.input)
+    logging.info("Done validating annotation ['%s']", args.input)
diff --git a/tools/inspect_image_feature_file.py b/tools/inspect_image_feature_file.py
new file mode 100644
index 0000000..a93d911
--- /dev/null
+++ b/tools/inspect_image_feature_file.py
@@ -0,0 +1,37 @@
+'''
+This script will open a feature file (.h5) and show a 3x3 grid of images.
+This tool is useful if you suspect that features are not extracted properly, for example due to erroneous mask values/indexing.
+'''
+
+import sys
+
+import h5py
+import matplotlib.pyplot as plt
+
+if len(sys.argv) > 1:
+    feature_file = sys.argv[1]
+else:
+    feature_file = "features.h5"
+
+# open the file and read the first 9 images from the 'images' dataset
+with h5py.File(feature_file, "r") as f:
+    images = f["images"][:9]
+
+# plot the images in a 3x3 grid
+for i in range(len(images)):
+    plt.subplot(3, 3, i + 1)
+    plt.imshow(images[i])
+    # hide the axes so only the patches are visible
+    plt.axis('off')
+    print(f"Image {i+1} is {images[i].shape}")
+plt.show()
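+
+# Example usage (the .h5 path here is just illustrative; pass your own feature file):
+#   python tools/inspect_image_feature_file.py features.h5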