src.fairreckitlib.data.set.dataset_registry
This module contains the data registry class.
Classes:
DataRegistry: registry for available datasets after processing them into a standard format.
This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)
"""This module contains the data registry class.

Classes:

    DataRegistry: registry for available datasets after processing them into a standard format.

This program has been developed by students from the bachelor Computer Science at
Utrecht University within the Software Project course.
© Copyright Utrecht University (Department of Information and Computing Sciences)
"""

import os
from typing import Any, Dict, List, Optional

from ...core.io.io_utility import save_yml
from .dataset_config_parser import DatasetConfigParser
from .dataset_constants import DATASET_CONFIG_FILE
from .dataset import Dataset
from .processor.dataset_processor_lfm1b import DatasetProcessorLFM1B
from .processor.dataset_processor_lfm2b import DatasetProcessorLFM2B
from .processor.dataset_processor_lfm360k import DatasetProcessorLFM360K
from .processor.dataset_processor_ml100k import DatasetProcessorML100K
from .processor.dataset_processor_ml25m import DatasetProcessorML25M

DATASET_LFM_1B = 'LFM-1B'
DATASET_LFM_2B = 'LFM-2B'
DATASET_LFM_360K = 'LFM-360K'
DATASET_ML_100K = 'ML-100K'
DATASET_ML_25M = 'ML-25M'


class DataRegistry:
    """Data Registry with available datasets.

    The data directory is expected to exist; an IOError is raised when it
    does not. Each subdirectory is considered to store a single dataset.
    The name of the subdirectory needs to be exactly the same as one of the
    available processors to trigger automatic data processing.

    Public methods:

    get_available_processors
    get_available_sets
    get_info
    get_set
    """

    def __init__(self, data_dir: str, verbose: bool = True):
        """Construct the data registry and scan for available datasets.

        Args:
            data_dir: path to the directory that contains the datasets.
            verbose: whether the dataset parser should give verbose output.

        Raises:
            IOError: when the specified data directory does not exist.
        """
        if not os.path.isdir(data_dir):
            raise IOError('Unable to construct data registry from an unknown directory')

        self.registry: Dict[str, Dataset] = {}
        self.processors = {
            DATASET_LFM_1B: DatasetProcessorLFM1B,
            DATASET_LFM_2B: DatasetProcessorLFM2B,
            DATASET_LFM_360K: DatasetProcessorLFM360K,
            DATASET_ML_100K: DatasetProcessorML100K,
            DATASET_ML_25M: DatasetProcessorML25M
        }

        for file in os.listdir(data_dir):
            file_name = os.fsdecode(file)
            dataset_dir = os.path.join(data_dir, file_name)
            # skip all entries that are not a directory
            if not os.path.isdir(dataset_dir):
                continue

            config_file_path = os.path.join(dataset_dir, DATASET_CONFIG_FILE)
            if not os.path.isfile(config_file_path):
                # no stored configuration yet: run the matching processor (if any);
                # resolve the processor class once instead of a second dict lookup
                processor = self.processors.get(file_name)
                if processor is None:
                    print('Unknown dataset processor:', file_name)
                    continue

                config = processor(dataset_dir, file_name).run()
                if config is None:
                    print('Processing dataset failed:', file_name)
                    continue

                # persist the generated configuration so the next scan can parse it
                save_yml(config_file_path, config.to_yml_format())
            else:
                parser = DatasetConfigParser(verbose)
                config = parser.parse_dataset_config_from_yml(
                    dataset_dir,
                    DATASET_CONFIG_FILE,
                    self.get_available_sets()
                )
                if config is None:
                    print('Parsing dataset configuration failed:', file_name)
                    continue

            self.registry[config.dataset_name] = Dataset(dataset_dir, config)

    def get_available_processors(self) -> List[str]:
        """Get the names of the available processors in the registry.

        Returns:
            a list of data processor names.
        """
        # iterating a dict yields its keys in insertion order
        return list(self.processors)

    def get_available_sets(self) -> List[str]:
        """Get the names of the available datasets in the registry.

        Returns:
            a list of dataset names.
        """
        return list(self.registry)

    def get_info(self) -> Dict[str, Dict[str, Any]]:
        """Get the matrices' information for each available dataset.

        Returns:
            a dictionary where the key corresponds to the dataset name and
            the value corresponds to the matrices' information dictionary.
        """
        return {
            dataset_name: dataset.get_matrices_info()
            for dataset_name, dataset in self.registry.items()
        }

    def get_set(self, dataset_name: str) -> Optional[Dataset]:
        """Get the dataset with the specified name.

        Args:
            dataset_name: name of the dataset to retrieve.

        Returns:
            the retrieved set or None when not present.
        """
        return self.registry.get(dataset_name)
class DataRegistry:
    """Registry that exposes the datasets found in a data directory.

    Construction raises an IOError when the data directory does not exist.
    Every subdirectory of the data directory is treated as one dataset; a
    subdirectory whose name matches one of the known processors is processed
    automatically when it has no stored configuration yet.

    Public methods:

    get_available_processors
    get_available_sets
    get_info
    get_set
    """

    def __init__(self, data_dir: str, verbose: bool=True):
        """Construct the data registry and scan for available datasets.

        Args:
            data_dir: path to the directory that contains the datasets.
            verbose: whether the dataset parser should give verbose output.

        Raises:
            IOError: when the specified data directory does not exist.
        """
        if not os.path.isdir(data_dir):
            raise IOError('Unable to construct data registry from an unknown directory')

        self.registry = {}
        self.processors = {
            DATASET_LFM_1B: DatasetProcessorLFM1B,
            DATASET_LFM_2B: DatasetProcessorLFM2B,
            DATASET_LFM_360K: DatasetProcessorLFM360K,
            DATASET_ML_100K: DatasetProcessorML100K,
            DATASET_ML_25M: DatasetProcessorML25M
        }

        for entry in os.listdir(data_dir):
            entry_name = os.fsdecode(entry)
            entry_dir = os.path.join(data_dir, entry_name)
            # only directories can hold a dataset
            if not os.path.isdir(entry_dir):
                continue

            config_file_path = os.path.join(entry_dir, DATASET_CONFIG_FILE)
            if os.path.isfile(config_file_path):
                # a stored configuration exists, so parse it
                config = DatasetConfigParser(verbose).parse_dataset_config_from_yml(
                    entry_dir,
                    DATASET_CONFIG_FILE,
                    self.get_available_sets()
                )
                if config is None:
                    print('Parsing dataset configuration failed:', entry_name)
                    continue
            else:
                # no configuration yet: process the raw dataset when possible
                processor_class = self.processors.get(entry_name)
                if processor_class is None:
                    print('Unknown dataset processor:', entry_name)
                    continue

                config = processor_class(entry_dir, entry_name).run()
                if config is None:
                    print('Processing dataset failed:', entry_name)
                    continue

                save_yml(config_file_path, config.to_yml_format())

            self.registry[config.dataset_name] = Dataset(entry_dir, config)

    def get_available_processors(self) -> List[str]:
        """Get the names of the available processors in the registry.

        Returns:
            a list of data processor names.
        """
        return [processor_name for processor_name in self.processors]

    def get_available_sets(self) -> List[str]:
        """Get the names of the available datasets in the registry.

        Returns:
            a list of dataset names.
        """
        return [dataset_name for dataset_name in self.registry]

    def get_info(self) -> Dict[str, Dict[str, Any]]:
        """Get the matrices' information for each available dataset.

        Returns:
            a dictionary where the key corresponds to the dataset name and
            the value corresponds to the matrices' information dictionary.
        """
        info = {}
        for name in self.registry:
            info[name] = self.registry[name].get_matrices_info()

        return info

    def get_set(self, dataset_name: str) -> Optional[Dataset]:
        """Get the dataset with the specified name.

        Args:
            dataset_name: name of the dataset to retrieve.

        Returns:
            the retrieved set or None when not present.
        """
        if dataset_name in self.registry:
            return self.registry[dataset_name]
        return None
Data Registry with available datasets.
The data directory is expected to exist; an IOError is raised when it does not. Each subdirectory is considered to store a single dataset. The name of the subdirectory needs to be exactly the same as the name of one of the available processors to trigger automatic data processing.
Public methods:
get_available_processors get_available_sets get_info get_set
def __init__(self, data_dir: str, verbose: bool=True):
    """Construct the data registry and scan for available datasets.

    Args:
        data_dir: path to the directory that contains the datasets.
        verbose: whether the dataset parser should give verbose output.

    Raises:
        IOError: when the specified data directory does not exist.
    """
    if not os.path.isdir(data_dir):
        raise IOError('Unable to construct data registry from an unknown directory')

    self.registry = {}
    self.processors = {
        DATASET_LFM_1B: DatasetProcessorLFM1B,
        DATASET_LFM_2B: DatasetProcessorLFM2B,
        DATASET_LFM_360K: DatasetProcessorLFM360K,
        DATASET_ML_100K: DatasetProcessorML100K,
        DATASET_ML_25M: DatasetProcessorML25M
    }

    for entry in os.listdir(data_dir):
        entry_name = os.fsdecode(entry)
        entry_dir = os.path.join(data_dir, entry_name)
        # only directories can hold a dataset
        if not os.path.isdir(entry_dir):
            continue

        config_file_path = os.path.join(entry_dir, DATASET_CONFIG_FILE)
        if os.path.isfile(config_file_path):
            # a stored configuration exists, so parse it
            config = DatasetConfigParser(verbose).parse_dataset_config_from_yml(
                entry_dir,
                DATASET_CONFIG_FILE,
                self.get_available_sets()
            )
            if config is None:
                print('Parsing dataset configuration failed:', entry_name)
                continue
        else:
            # no configuration yet: process the raw dataset when possible
            processor_class = self.processors.get(entry_name)
            if processor_class is None:
                print('Unknown dataset processor:', entry_name)
                continue

            config = processor_class(entry_dir, entry_name).run()
            if config is None:
                print('Processing dataset failed:', entry_name)
                continue

            save_yml(config_file_path, config.to_yml_format())

        self.registry[config.dataset_name] = Dataset(entry_dir, config)
Construct the data registry and scan for available datasets.
Args: data_dir: path to the directory that contains the datasets. verbose: whether the dataset parser should give verbose output.
Raises: IOError: when the specified data directory does not exist.
def get_available_processors(self) -> List[str]:
    """Get the names of the available processors in the registry.

    Returns:
        a list of data processor names.
    """
    # iterating a dict yields its keys in insertion order; list() collects
    # them directly instead of a manual append loop
    return list(self.processors)
Get the names of the available processors in the registry.
Returns: a list of data processor names.
def get_available_sets(self) -> List[str]:
    """Get the names of the available datasets in the registry.

    Returns:
        a list of dataset names.
    """
    # iterating a dict yields its keys in insertion order; list() collects
    # them directly instead of a manual append loop
    return list(self.registry)
Get the names of the available datasets in the registry.
Returns: a list of dataset names.
def get_info(self) -> Dict[str, Dict[str, Any]]:
    """Get the matrices' information for each available dataset.

    Returns:
        a dictionary where the key corresponds to the dataset name and
        the value corresponds to the matrices' information dictionary.
    """
    # a dict comprehension replaces the manual build-and-assign loop
    return {
        dataset_name: dataset.get_matrices_info()
        for dataset_name, dataset in self.registry.items()
    }
Get the matrices' information for each available dataset.
Returns: a dictionary where the key corresponds to the dataset name and the value corresponds to the matrices' information dictionary.
def get_set(self, dataset_name: str) -> Optional[Dataset]:
    """Get the dataset with the specified name.

    Args:
        dataset_name: name of the dataset to retrieve.

    Returns:
        the retrieved set or None when not present.
    """
    if dataset_name in self.registry:
        return self.registry[dataset_name]
    return None
Get the dataset with the specified name.
Args: dataset_name: name of the dataset to retrieve.
Returns: the retrieved set or None when not present.