Source code for tiatoolbox.models.dataset.info
# ***** BEGIN GPL LICENSE BLOCK *****
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# The Original Code is Copyright (C) 2021, TIA Centre, University of Warwick
# All rights reserved.
# ***** END GPL LICENSE BLOCK *****
import os
from abc import ABC, abstractmethod
from pathlib import Path
from tiatoolbox import rcParam
from tiatoolbox.utils.misc import download_data, grab_files_from_dir, unzip_data
class DatasetInfoABC(ABC):
    """Abstract interface describing the contents of a dataset.

    Every concrete subclass has to define all of the attributes listed
    below; a subclass leaving any of them undefined cannot be instantiated.

    Attributes:
        inputs (list): Paths, each one pointing to a sample image.
        labels (list): `int` labels, where the label at a given index belongs
            to the sample stored at the same index of `inputs`.
        label_names (dict): Possible name associated with each label value.

    """

    @property
    @abstractmethod
    def inputs(self):
        """List of paths to the sample images; must be defined by subclasses."""
        raise NotImplementedError

    @property
    @abstractmethod
    def labels(self):
        """List of `int` labels aligned with `inputs`; must be defined by subclasses."""
        raise NotImplementedError

    @property
    @abstractmethod
    def label_names(self):
        """Dict of label value to label name; must be defined by subclasses."""
        raise NotImplementedError
class KatherPatchDataset(DatasetInfoABC):
    """Define a class for holding the Kather dataset information.

    Args:
        save_dir_path (str or None): Path to the directory containing the
            Kather dataset. It is assumed to have the same form as right
            after the archive is extracted, i.e. one sub-directory per label
            name holding the `*.tif` images. If the argument is `None`, the
            dataset is looked up at
            `rcParam["TIATOOLBOX_HOME"]/dataset/kather100k-validation` and is
            downloaded and extracted into `rcParam["TIATOOLBOX_HOME"]/dataset`
            when that directory is missing.

    Attributes:
        inputs (list): A list of paths where each path points to a sample image.
        labels (list): A list of `int` where each is the label of the sample at
            the same index.
        label_names (dict): A dict indicates the possible associate name of each
            label value.

    Raises:
        ValueError: If the dataset directory does not exist, or if it does not
            contain any `*.tif` image under the label sub-directories.

    """

    # Pre-defined so that the abstract properties of the base class count as
    # implemented; the actual values are assigned in `__init__`.
    inputs = None
    labels = None
    label_names = None

    def __init__(
        self,
        save_dir_path=None,
    ):
        # The position of a name in this list is the integer label assigned
        # to the images found in the sub-directory of the same name.
        label_names = [
            "BACK",
            "NORM",
            "DEB",
            "TUM",
            "ADI",
            "MUC",
            "MUS",
            "STR",
            "LYM",
        ]

        if save_dir_path is None:  # pragma: no cover
            download_dir = Path(rcParam["TIATOOLBOX_HOME"], "dataset")
            save_dir_path = Path(download_dir, "kather100k-validation")
            # Check the extracted dataset itself rather than the shared
            # download directory: the latter may already exist (e.g. created
            # for another dataset), which must not prevent the download.
            if not save_dir_path.exists():
                save_zip_path = os.path.join(download_dir, "Kather.zip")
                url = (
                    "https://tiatoolbox.dcs.warwick.ac.uk/datasets"
                    "/kather100k-train-nonorm-subset-20k.zip"
                )
                download_data(url, save_zip_path)
                unzip_data(save_zip_path, download_dir)

        # Checked outside of the branch above so that a failed download or
        # extraction is reported as well.
        save_dir_path = Path(save_dir_path)
        if not save_dir_path.exists():
            raise ValueError(f"Dataset does not exist at `{save_dir_path}`")

        uid_name_map = {}
        inputs = []
        labels = []
        for label_id, label_name in enumerate(label_names):
            paths = grab_files_from_dir(
                f"{save_dir_path}/{label_name}/", file_types="*.tif"
            )
            # Sort within each label so the sample order is deterministic.
            paths = sorted(paths)
            inputs.extend(paths)
            labels.extend([label_id] * len(paths))
            uid_name_map[label_id] = label_name

        # An existing but empty (or corrupted) dataset directory would
        # otherwise yield a dataset without any sample.
        if not inputs:
            raise ValueError(
                f"No `*.tif` image found in the dataset at `{save_dir_path}`"
            )

        self.label_names = uid_name_map
        self.inputs = inputs
        self.labels = labels