src.fairreckitlib.data.set.dataset_registry

This module contains the data registry class.

Classes:

DataRegistry: registry for available datasets after processing them into a standard format.

This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)

  1"""This module contains the data registry class.
  2
  3Classes:
  4
  5    DataRegistry: registry for available datasets after processing them into a standard format.
  6
  7This program has been developed by students from the bachelor Computer Science at
  8Utrecht University within the Software Project course.
  9© Copyright Utrecht University (Department of Information and Computing Sciences)
 10"""
 11
 12import os
 13from typing import Any, Dict, List, Optional
 14
 15from ...core.io.io_utility import save_yml
 16from .dataset_config_parser import DatasetConfigParser
 17from .dataset_constants import DATASET_CONFIG_FILE
 18from .dataset import Dataset
 19from .processor.dataset_processor_lfm1b import DatasetProcessorLFM1B
 20from .processor.dataset_processor_lfm2b import DatasetProcessorLFM2B
 21from .processor.dataset_processor_lfm360k import DatasetProcessorLFM360K
 22from .processor.dataset_processor_ml100k import DatasetProcessorML100K
 23from .processor.dataset_processor_ml25m import DatasetProcessorML25M
 24
 25DATASET_LFM_1B = 'LFM-1B'
 26DATASET_LFM_2B = 'LFM-2B'
 27DATASET_LFM_360K = 'LFM-360K'
 28DATASET_ML_100K = 'ML-100K'
 29DATASET_ML_25M = 'ML-25M'
 30
 31
 32class DataRegistry:
 33    """Data Registry with available datasets.
 34
 35    The data directory is expected to exist or will raise an IOError.
 36    Each subdirectory is considered to store a single dataset. The name of
 37    the subdirectory needs to be exactly the same as one of the available
 38    processors to trigger automatic data processing.
 39
 40    Public methods:
 41
 42    get_available_processors
 43    get_available_sets
 44    get_info
 45    get_set
 46    """
 47
 48    def __init__(self, data_dir: str, verbose: bool=True):
 49        """Construct the data registry and scan for available datasets.
 50
 51        Args:
 52            data_dir: path to the directory that contains the datasets.
 53            verbose: whether the dataset parser should give verbose output.
 54
 55        Raises:
 56            IOError: when the specified data directory does not exist.
 57        """
 58        if not os.path.isdir(data_dir):
 59            raise IOError('Unable to construct data registry from an unknown directory')
 60
 61        self.registry = {}
 62        self.processors = {
 63            DATASET_LFM_1B: DatasetProcessorLFM1B,
 64            DATASET_LFM_2B: DatasetProcessorLFM2B,
 65            DATASET_LFM_360K: DatasetProcessorLFM360K,
 66            DATASET_ML_100K: DatasetProcessorML100K,
 67            DATASET_ML_25M: DatasetProcessorML25M
 68        }
 69
 70        for file in os.listdir(data_dir):
 71            file_name = os.fsdecode(file)
 72            dataset_dir = os.path.join(data_dir, file_name)
 73            # skip all entries that are not a directory
 74            if not os.path.isdir(dataset_dir):
 75                continue
 76
 77            config_file_path = os.path.join(dataset_dir, DATASET_CONFIG_FILE)
 78            if not os.path.isfile(config_file_path):
 79                if self.processors.get(file_name) is None:
 80                    print('Unknown dataset processor:', file_name)
 81                    continue
 82
 83                config = self.processors[file_name](dataset_dir, file_name).run()
 84                if config is None:
 85                    print('Processing dataset failed:', file_name)
 86                    continue
 87
 88                save_yml(config_file_path, config.to_yml_format())
 89            else:
 90                parser = DatasetConfigParser(verbose)
 91                config = parser.parse_dataset_config_from_yml(
 92                    dataset_dir,
 93                    DATASET_CONFIG_FILE,
 94                    self.get_available_sets()
 95                )
 96                if config is None:
 97                    print('Parsing dataset configuration failed:', file_name)
 98                    continue
 99
100            self.registry[config.dataset_name] = Dataset(dataset_dir, config)
101
102    def get_available_processors(self) -> List[str]:
103        """Get the names of the available processors in the registry.
104
105        Returns:
106            a list of data processor names.
107        """
108        processor_names = []
109
110        for processor_name in self.processors:
111            processor_names.append(processor_name)
112
113        return processor_names
114
115    def get_available_sets(self) -> List[str]:
116        """Get the names of the available datasets in the registry.
117
118        Returns:
119            a list of dataset names.
120        """
121        dataset_names = []
122
123        for dataset_name in self.registry:
124            dataset_names.append(dataset_name)
125
126        return dataset_names
127
128    def get_info(self) -> Dict[str, Dict[str, Any]]:
129        """Get the matrices' information for each available dataset.
130
131        Returns:
132            a dictionary where the key corresponds to the dataset name and
133                the value corresponds to the matrices' information dictionary.
134        """
135        info = {}
136
137        for dataset_name, dataset in self.registry.items():
138            info[dataset_name] = dataset.get_matrices_info()
139
140        return info
141
142    def get_set(self, dataset_name: str) -> Optional[Dataset]:
143        """Get the dataset with the specified name.
144
145        Args:
146            dataset_name: name of the dataset to retrieve.
147
148        Returns:
149            the retrieved set or None when not present.
150        """
151        return self.registry.get(dataset_name)
class DataRegistry:
    """Data Registry with available datasets.

    The data directory is expected to exist or will raise an IOError.
    Each subdirectory is considered to store a single dataset. The name of
    the subdirectory needs to be exactly the same as one of the available
    processors to trigger automatic data processing.

    Public methods:

    get_available_processors
    get_available_sets
    get_info
    get_set
    """

    def __init__(self, data_dir: str, verbose: bool=True):
        """Construct the data registry and scan for available datasets.

        Args:
            data_dir: path to the directory that contains the datasets.
            verbose: whether the dataset parser should give verbose output.

        Raises:
            IOError: when the specified data directory does not exist.
        """
        if not os.path.isdir(data_dir):
            raise IOError('Unable to construct data registry from an unknown directory')

        self.registry = {}
        self.processors = {
            DATASET_LFM_1B: DatasetProcessorLFM1B,
            DATASET_LFM_2B: DatasetProcessorLFM2B,
            DATASET_LFM_360K: DatasetProcessorLFM360K,
            DATASET_ML_100K: DatasetProcessorML100K,
            DATASET_ML_25M: DatasetProcessorML25M
        }

        for entry in os.listdir(data_dir):
            entry_name = os.fsdecode(entry)
            dataset_dir = os.path.join(data_dir, entry_name)
            # a dataset can only live in a subdirectory
            if not os.path.isdir(dataset_dir):
                continue

            config_path = os.path.join(dataset_dir, DATASET_CONFIG_FILE)
            if os.path.isfile(config_path):
                # a stored configuration exists: parse it
                config = DatasetConfigParser(verbose).parse_dataset_config_from_yml(
                    dataset_dir,
                    DATASET_CONFIG_FILE,
                    self.get_available_sets()
                )
                if config is None:
                    print('Parsing dataset configuration failed:', entry_name)
                    continue
            else:
                # no configuration yet: run the matching processor
                processor = self.processors.get(entry_name)
                if processor is None:
                    print('Unknown dataset processor:', entry_name)
                    continue

                config = processor(dataset_dir, entry_name).run()
                if config is None:
                    print('Processing dataset failed:', entry_name)
                    continue

                save_yml(config_path, config.to_yml_format())

            self.registry[config.dataset_name] = Dataset(dataset_dir, config)

    def get_available_processors(self) -> List[str]:
        """Get the names of the available processors in the registry.

        Returns:
            a list of data processor names.
        """
        return [processor_name for processor_name in self.processors]

    def get_available_sets(self) -> List[str]:
        """Get the names of the available datasets in the registry.

        Returns:
            a list of dataset names.
        """
        return [dataset_name for dataset_name in self.registry]

    def get_info(self) -> Dict[str, Dict[str, Any]]:
        """Get the matrices' information for each available dataset.

        Returns:
            a dictionary where the key corresponds to the dataset name and
                the value corresponds to the matrices' information dictionary.
        """
        return {
            name: dataset.get_matrices_info()
            for name, dataset in self.registry.items()
        }

    def get_set(self, dataset_name: str) -> Optional[Dataset]:
        """Get the dataset with the specified name.

        Args:
            dataset_name: name of the dataset to retrieve.

        Returns:
            the retrieved set or None when not present.
        """
        return self.registry.get(dataset_name)

Data Registry with available datasets.

The data directory is expected to exist or will raise an IOError. Each subdirectory is considered to store a single dataset. The name of the subdirectory needs to be exactly the same as one of the available processors to trigger automatic data processing.

Public methods:

`get_available_processors`, `get_available_sets`, `get_info`, `get_set`

DataRegistry(data_dir: str, verbose: bool = True)
 49    def __init__(self, data_dir: str, verbose: bool=True):
 50        """Construct the data registry and scan for available datasets.
 51
 52        Args:
 53            data_dir: path to the directory that contains the datasets.
 54            verbose: whether the dataset parser should give verbose output.
 55
 56        Raises:
 57            IOError: when the specified data directory does not exist.
 58        """
 59        if not os.path.isdir(data_dir):
 60            raise IOError('Unable to construct data registry from an unknown directory')
 61
 62        self.registry = {}
 63        self.processors = {
 64            DATASET_LFM_1B: DatasetProcessorLFM1B,
 65            DATASET_LFM_2B: DatasetProcessorLFM2B,
 66            DATASET_LFM_360K: DatasetProcessorLFM360K,
 67            DATASET_ML_100K: DatasetProcessorML100K,
 68            DATASET_ML_25M: DatasetProcessorML25M
 69        }
 70
 71        for file in os.listdir(data_dir):
 72            file_name = os.fsdecode(file)
 73            dataset_dir = os.path.join(data_dir, file_name)
 74            # skip all entries that are not a directory
 75            if not os.path.isdir(dataset_dir):
 76                continue
 77
 78            config_file_path = os.path.join(dataset_dir, DATASET_CONFIG_FILE)
 79            if not os.path.isfile(config_file_path):
 80                if self.processors.get(file_name) is None:
 81                    print('Unknown dataset processor:', file_name)
 82                    continue
 83
 84                config = self.processors[file_name](dataset_dir, file_name).run()
 85                if config is None:
 86                    print('Processing dataset failed:', file_name)
 87                    continue
 88
 89                save_yml(config_file_path, config.to_yml_format())
 90            else:
 91                parser = DatasetConfigParser(verbose)
 92                config = parser.parse_dataset_config_from_yml(
 93                    dataset_dir,
 94                    DATASET_CONFIG_FILE,
 95                    self.get_available_sets()
 96                )
 97                if config is None:
 98                    print('Parsing dataset configuration failed:', file_name)
 99                    continue
100
101            self.registry[config.dataset_name] = Dataset(dataset_dir, config)

Construct the data registry and scan for available datasets.

Args: `data_dir` — path to the directory that contains the datasets; `verbose` — whether the dataset parser should give verbose output.

Raises: IOError: when the specified data directory does not exist.

def get_available_processors(self) -> List[str]:
103    def get_available_processors(self) -> List[str]:
104        """Get the names of the available processors in the registry.
105
106        Returns:
107            a list of data processor names.
108        """
109        processor_names = []
110
111        for processor_name in self.processors:
112            processor_names.append(processor_name)
113
114        return processor_names

Get the names of the available processors in the registry.

Returns: a list of data processor names.

def get_available_sets(self) -> List[str]:
116    def get_available_sets(self) -> List[str]:
117        """Get the names of the available datasets in the registry.
118
119        Returns:
120            a list of dataset names.
121        """
122        dataset_names = []
123
124        for dataset_name in self.registry:
125            dataset_names.append(dataset_name)
126
127        return dataset_names

Get the names of the available datasets in the registry.

Returns: a list of dataset names.

def get_info(self) -> Dict[str, Dict[str, Any]]:
129    def get_info(self) -> Dict[str, Dict[str, Any]]:
130        """Get the matrices' information for each available dataset.
131
132        Returns:
133            a dictionary where the key corresponds to the dataset name and
134                the value corresponds to the matrices' information dictionary.
135        """
136        info = {}
137
138        for dataset_name, dataset in self.registry.items():
139            info[dataset_name] = dataset.get_matrices_info()
140
141        return info

Get the matrices' information for each available dataset.

Returns: a dictionary where the key corresponds to the dataset name and the value corresponds to the matrices' information dictionary.

def get_set(self, dataset_name: str) -> Optional[src.fairreckitlib.data.set.dataset.Dataset]:
143    def get_set(self, dataset_name: str) -> Optional[Dataset]:
144        """Get the dataset with the specified name.
145
146        Args:
147            dataset_name: name of the dataset to retrieve.
148
149        Returns:
150            the retrieved set or None when not present.
151        """
152        return self.registry.get(dataset_name)

Get the dataset with the specified name.

Args: dataset_name: name of the dataset to retrieve.

Returns: the retrieved set or None when not present.