src.fairreckitlib.data.set.processor.dataset_processor_base

This module contains the base functionality shared by all dataset processors.

Classes:

DatasetProcessorBase: the base class for dataset processors.

This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)

  1"""This module contains the base functionality shared by all dataset processors.
  2
  3Classes:
  4
  5    DatasetProcessorBase: the base class for dataset processors.
  6
  7This program has been developed by students from the bachelor Computer Science at
  8Utrecht University within the Software Project course.
  9© Copyright Utrecht University (Department of Information and Computing Sciences)
 10"""
 11
 12from abc import ABCMeta, abstractmethod
 13from typing import Callable, Dict, List, Optional, Tuple
 14
 15from ..dataset_config import DatasetConfig, DatasetMatrixConfig, DatasetTableConfig
 16
 17
 18class DatasetProcessorBase(metaclass=ABCMeta):
 19    """DataProcessor base class for all FairRecKit datasets.
 20
 21    Datasets are preprocessed so that they will be of a recognized standard format
 22    on the other side. A configuration file is produced in the resulting dataset
 23    directory that stores the metadata for achieving this. For further information
 24    it is advised to take a look at the Dataset(Config) class.
 25
 26    The dataset configuration mainly consists of:
 27
 28    1) event tables: contain user event tables that can be used to construct a matrix of.
 29    2) matrix tables: contain available matrices associated with the dataset.
 30    3) (other) tables: contain the shared tables associated with the dataset.
 31
 32    For each of these three categories an abstract function is exposed in order to retrieve
 33    (table name, table configuration processor) tuples. The tables names are expected to be
 34    unique across all categories. The table configuration processors are allowed to return
 35    None on failure and will be excluded from the final configuration. Moreover, tables that
 36    do not contain any records are excluded as well.
 37    The base dataset processor handles the processing logic. It needs to produce at least
 38    one valid event table or one valid matrix configuration to be successful, concluding
 39    that remaining tables are optional.
 40
 41    Abstract methods:
 42
 43    get_event_configs
 44    get_matrix_configs
 45    get_table_configs
 46
 47    Public methods:
 48
 49    run
 50    """
 51
 52    def __init__(self, dataset_dir: str, dataset_name: str):
 53        """Construct the base DatasetProcessor.
 54
 55        Args:
 56            dataset_dir: path of the dataset directory.
 57            dataset_name: name of the dataset (processor).
 58        """
 59        self.dataset_dir = dataset_dir
 60        self.dataset_name = dataset_name
 61
 62    @abstractmethod
 63    def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 64        """Get event table configuration processors.
 65
 66        Returns:
 67            a list of tuples consisting of the event table name and the event table processor.
 68        """
 69        raise NotImplementedError()
 70
 71    @abstractmethod
 72    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
 73        """Get matrix configuration processors.
 74
 75        Returns:
 76            a list of tuples consisting of the matrix name and the matrix processor.
 77        """
 78        raise NotImplementedError()
 79
 80    @abstractmethod
 81    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 82        """Get table configuration processors.
 83
 84        Returns:
 85            a list of tuples consisting of the table name and the table processor.
 86        """
 87        raise NotImplementedError()
 88
 89    def run_event_table_processors(self) -> Dict[str, DatasetTableConfig]:
 90        """Run the dataset's event table processors.
 91
 92        Returns:
 93            a dictionary with valid event table name-configuration pairs.
 94        """
 95        dataset_events = {}
 96        for table_name, process_config in self.get_event_configs():
 97            config = process_config()
 98            if config is not None and config.num_records > 0:
 99                dataset_events[table_name] = config
100
101        return dataset_events
102
103    def run_matrix_table_processors(self) -> Dict[str, DatasetMatrixConfig]:
104        """Run the dataset's matrix processors.
105
106        Returns:
107            a dictionary with valid matrix name-configuration pairs.
108        """
109        dataset_matrices = {}
110        for matrix_name, process_config in self.get_matrix_configs():
111            config = process_config()
112            if config is not None and config.table.num_records > 0:
113                dataset_matrices[matrix_name] = config
114
115        return dataset_matrices
116
117    def run_table_processors(self) -> Dict[str, DatasetTableConfig]:
118        """Run the dataset's additional table processors.
119
120        Returns:
121            a dictionary with valid table name-configuration pairs.
122        """
123        dataset_tables = {}
124        for table_name, process_config in self.get_table_configs():
125            config = process_config()
126            if config is not None and config.num_records > 0:
127                dataset_tables[table_name] = config
128
129        return dataset_tables
130
131    def run(self) -> Optional[DatasetConfig]:
132        """Run the dataset configuration processor.
133
134        Returns:
135            the dataset configuration or None on failure.
136        """
137        dataset_events = self.run_event_table_processors()
138        dataset_matrices = self.run_matrix_table_processors()
139        if len(dataset_events) == 0 and len(dataset_matrices) == 0:
140            return None
141
142        return DatasetConfig(
143            self.dataset_name,
144            dataset_events,
145            dataset_matrices,
146            self.run_table_processors()
147        )
class DatasetProcessorBase:
 19class DatasetProcessorBase(metaclass=ABCMeta):
 20    """DataProcessor base class for all FairRecKit datasets.
 21
 22    Datasets are preprocessed so that they will be of a recognized standard format
 23    on the other side. A configuration file is produced in the resulting dataset
 24    directory that stores the metadata for achieving this. For further information
 25    it is advised to take a look at the Dataset(Config) class.
 26
 27    The dataset configuration mainly consists of:
 28
 29    1) event tables: contain user event tables that can be used to construct a matrix of.
 30    2) matrix tables: contain available matrices associated with the dataset.
 31    3) (other) tables: contain the shared tables associated with the dataset.
 32
 33    For each of these three categories an abstract function is exposed in order to retrieve
 34    (table name, table configuration processor) tuples. The tables names are expected to be
 35    unique across all categories. The table configuration processors are allowed to return
 36    None on failure and will be excluded from the final configuration. Moreover, tables that
 37    do not contain any records are excluded as well.
 38    The base dataset processor handles the processing logic. It needs to produce at least
 39    one valid event table or one valid matrix configuration to be successful, concluding
 40    that remaining tables are optional.
 41
 42    Abstract methods:
 43
 44    get_event_configs
 45    get_matrix_configs
 46    get_table_configs
 47
 48    Public methods:
 49
 50    run
 51    """
 52
 53    def __init__(self, dataset_dir: str, dataset_name: str):
 54        """Construct the base DatasetProcessor.
 55
 56        Args:
 57            dataset_dir: path of the dataset directory.
 58            dataset_name: name of the dataset (processor).
 59        """
 60        self.dataset_dir = dataset_dir
 61        self.dataset_name = dataset_name
 62
 63    @abstractmethod
 64    def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 65        """Get event table configuration processors.
 66
 67        Returns:
 68            a list of tuples consisting of the event table name and the event table processor.
 69        """
 70        raise NotImplementedError()
 71
 72    @abstractmethod
 73    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
 74        """Get matrix configuration processors.
 75
 76        Returns:
 77            a list of tuples consisting of the matrix name and the matrix processor.
 78        """
 79        raise NotImplementedError()
 80
 81    @abstractmethod
 82    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 83        """Get table configuration processors.
 84
 85        Returns:
 86            a list of tuples consisting of the table name and the table processor.
 87        """
 88        raise NotImplementedError()
 89
 90    def run_event_table_processors(self) -> Dict[str, DatasetTableConfig]:
 91        """Run the dataset's event table processors.
 92
 93        Returns:
 94            a dictionary with valid event table name-configuration pairs.
 95        """
 96        dataset_events = {}
 97        for table_name, process_config in self.get_event_configs():
 98            config = process_config()
 99            if config is not None and config.num_records > 0:
100                dataset_events[table_name] = config
101
102        return dataset_events
103
104    def run_matrix_table_processors(self) -> Dict[str, DatasetMatrixConfig]:
105        """Run the dataset's matrix processors.
106
107        Returns:
108            a dictionary with valid matrix name-configuration pairs.
109        """
110        dataset_matrices = {}
111        for matrix_name, process_config in self.get_matrix_configs():
112            config = process_config()
113            if config is not None and config.table.num_records > 0:
114                dataset_matrices[matrix_name] = config
115
116        return dataset_matrices
117
118    def run_table_processors(self) -> Dict[str, DatasetTableConfig]:
119        """Run the dataset's additional table processors.
120
121        Returns:
122            a dictionary with valid table name-configuration pairs.
123        """
124        dataset_tables = {}
125        for table_name, process_config in self.get_table_configs():
126            config = process_config()
127            if config is not None and config.num_records > 0:
128                dataset_tables[table_name] = config
129
130        return dataset_tables
131
132    def run(self) -> Optional[DatasetConfig]:
133        """Run the dataset configuration processor.
134
135        Returns:
136            the dataset configuration or None on failure.
137        """
138        dataset_events = self.run_event_table_processors()
139        dataset_matrices = self.run_matrix_table_processors()
140        if len(dataset_events) == 0 and len(dataset_matrices) == 0:
141            return None
142
143        return DatasetConfig(
144            self.dataset_name,
145            dataset_events,
146            dataset_matrices,
147            self.run_table_processors()
148        )

DataProcessor base class for all FairRecKit datasets.

Datasets are preprocessed so that they will be of a recognized standard format on the other side. A configuration file is produced in the resulting dataset directory that stores the metadata for achieving this. For further information it is advised to take a look at the Dataset(Config) class.

The dataset configuration mainly consists of:

1) event tables: contain the user events from which a matrix can be constructed. 2) matrix tables: contain the available matrices associated with the dataset. 3) (other) tables: contain the shared tables associated with the dataset.

For each of these three categories an abstract function is exposed in order to retrieve (table name, table configuration processor) tuples. The table names are expected to be unique across all categories. A table configuration processor is allowed to return None on failure, in which case the table is excluded from the final configuration. Moreover, tables that do not contain any records are excluded as well. The base dataset processor handles the processing logic. It needs to produce at least one valid event table or one valid matrix configuration to be successful, which means that the remaining tables are optional.

Abstract methods:

get_event_configs, get_matrix_configs, get_table_configs

Public methods:

run

DatasetProcessorBase(dataset_dir: str, dataset_name: str)
53    def __init__(self, dataset_dir: str, dataset_name: str):
54        """Construct the base DatasetProcessor.
55
56        Args:
57            dataset_dir: path of the dataset directory.
58            dataset_name: name of the dataset (processor).
59        """
60        self.dataset_dir = dataset_dir
61        self.dataset_name = dataset_name

Construct the base DatasetProcessor.

Args: dataset_dir: path of the dataset directory; dataset_name: name of the dataset (processor).

@abstractmethod
def get_event_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]]]]:
63    @abstractmethod
64    def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
65        """Get event table configuration processors.
66
67        Returns:
68            a list of tuples consisting of the event table name and the event table processor.
69        """
70        raise NotImplementedError()

Get event table configuration processors.

Returns: a list of tuples consisting of the event table name and the event table processor.

@abstractmethod
def get_matrix_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]]]]:
72    @abstractmethod
73    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
74        """Get matrix configuration processors.
75
76        Returns:
77            a list of tuples consisting of the matrix name and the matrix processor.
78        """
79        raise NotImplementedError()

Get matrix configuration processors.

Returns: a list of tuples consisting of the matrix name and the matrix processor.

@abstractmethod
def get_table_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]]]]:
81    @abstractmethod
82    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
83        """Get table configuration processors.
84
85        Returns:
86            a list of tuples consisting of the table name and the table processor.
87        """
88        raise NotImplementedError()

Get table configuration processors.

Returns: a list of tuples consisting of the table name and the table processor.

def run_event_table_processors( self) -> Dict[str, src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
 90    def run_event_table_processors(self) -> Dict[str, DatasetTableConfig]:
 91        """Run the dataset's event table processors.
 92
 93        Returns:
 94            a dictionary with valid event table name-configuration pairs.
 95        """
 96        dataset_events = {}
 97        for table_name, process_config in self.get_event_configs():
 98            config = process_config()
 99            if config is not None and config.num_records > 0:
100                dataset_events[table_name] = config
101
102        return dataset_events

Run the dataset's event table processors.

Returns: a dictionary with valid event table name-configuration pairs.

def run_matrix_table_processors( self) -> Dict[str, src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]:
104    def run_matrix_table_processors(self) -> Dict[str, DatasetMatrixConfig]:
105        """Run the dataset's matrix processors.
106
107        Returns:
108            a dictionary with valid matrix name-configuration pairs.
109        """
110        dataset_matrices = {}
111        for matrix_name, process_config in self.get_matrix_configs():
112            config = process_config()
113            if config is not None and config.table.num_records > 0:
114                dataset_matrices[matrix_name] = config
115
116        return dataset_matrices

Run the dataset's matrix processors.

Returns: a dictionary with valid matrix name-configuration pairs.

def run_table_processors( self) -> Dict[str, src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
118    def run_table_processors(self) -> Dict[str, DatasetTableConfig]:
119        """Run the dataset's additional table processors.
120
121        Returns:
122            a dictionary with valid table name-configuration pairs.
123        """
124        dataset_tables = {}
125        for table_name, process_config in self.get_table_configs():
126            config = process_config()
127            if config is not None and config.num_records > 0:
128                dataset_tables[table_name] = config
129
130        return dataset_tables

Run the dataset's additional table processors.

Returns: a dictionary with valid table name-configuration pairs.

def run( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetConfig]:
132    def run(self) -> Optional[DatasetConfig]:
133        """Run the dataset configuration processor.
134
135        Returns:
136            the dataset configuration or None on failure.
137        """
138        dataset_events = self.run_event_table_processors()
139        dataset_matrices = self.run_matrix_table_processors()
140        if len(dataset_events) == 0 and len(dataset_matrices) == 0:
141            return None
142
143        return DatasetConfig(
144            self.dataset_name,
145            dataset_events,
146            dataset_matrices,
147            self.run_table_processors()
148        )

Run the dataset configuration processor.

Returns: the dataset configuration or None on failure.