src.fairreckitlib.data.pipeline.data_config_parsing

This module contains a parser for the dataset configuration.

Functions:

parse_data_config: parse dataset matrices from the experiment configuration.
parse_data_matrix_config: parse dataset matrix configuration.

This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)

  1"""This module contains a parser for the dataset configuration.
  2
  3Functions:
  4
  5    parse_data_config: parse dataset matrices from the experiment configuration.
  6    parse_data_matrix_config: parse dataset matrix configuration.
  7
  8This program has been developed by students from the bachelor Computer Science at
  9Utrecht University within the Software Project course.
 10© Copyright Utrecht University (Department of Information and Computing Sciences)
 11"""
 12
 13from typing import Any, Dict, List, Optional, Tuple, Union
 14
 15from ...core.config.config_factories import GroupFactory
 16from ...core.events.event_dispatcher import EventDispatcher
 17from ...core.parsing.parse_assert import \
 18    assert_is_type, assert_is_container_not_empty, assert_is_key_in_dict
 19from ...core.parsing.parse_event import ON_PARSE, ParseEventArgs
 20from ..data_factory import KEY_DATA
 21from ..filter.filter_config_parsing import parse_data_subset_config
 22from ..filter.filter_constants import KEY_DATA_SUBSET
 23from ..ratings.convert_constants import KEY_RATING_CONVERTER
 24from ..ratings.convert_config_parsing import parse_data_convert_config
 25from ..set.dataset_registry import DataRegistry
 26from ..split.split_constants import KEY_SPLITTING
 27from ..split.split_config_parsing import parse_data_split_config
 28from .data_config import DataMatrixConfig
 29
 30
 31def parse_data_config(
 32        experiment_config: Dict[str, Any],
 33        data_registry: DataRegistry,
 34        data_factory: GroupFactory,
 35        event_dispatcher: EventDispatcher) -> Optional[List[DataMatrixConfig]]:
 36    """Parse all dataset configurations.
 37
 38    Args:
 39        experiment_config: the experiment's total configuration.
 40        data_registry: the data registry containing the available datasets.
 41        data_factory: factory with available data modifier factories.
 42        event_dispatcher: to dispatch the parse event on failure.
 43
 44    Returns:
 45        a list of parsed DatasetConfig's or None when empty.
 46    """
 47    # assert KEY_DATA is present
 48    if not assert_is_key_in_dict(
 49        KEY_DATA,
 50        experiment_config,
 51        event_dispatcher,
 52        'PARSE ERROR: missing experiment key \'' + KEY_DATA + '\' (required)'
 53    ): return None
 54
 55    data_matrices_config = experiment_config[KEY_DATA]
 56
 57    # assert data_matrices_config is a list
 58    if not assert_is_type(
 59        data_matrices_config,
 60        list,
 61        event_dispatcher,
 62        'PARSE ERROR: invalid experiment value for key \'' + KEY_DATA + '\''
 63    ): return None
 64
 65    # assert data_matrices_config has list entries
 66    if not assert_is_container_not_empty(
 67        data_matrices_config,
 68        event_dispatcher,
 69        'PARSE ERROR: experiment \'' + KEY_DATA + '\' is empty'
 70    ): return None
 71
 72    parsed_matrices = []
 73
 74    # parse datasets_config list entries
 75    for data_matrix_config in data_matrices_config:
 76        data_matrix, data_matrix_name = parse_data_matrix_config(
 77            data_matrix_config,
 78            data_registry,
 79            data_factory,
 80            event_dispatcher
 81        )
 82        # skip on failure
 83        if data_matrix is None:
 84            event_dispatcher.dispatch(ParseEventArgs(
 85                ON_PARSE,
 86                'PARSE WARNING: failed to parse data matrix \'' +
 87                str(data_matrix_name) + '\', skipping...'
 88            ))
 89            continue
 90
 91        parsed_matrices.append(data_matrix)
 92
 93    # final check to verify at least one data matrix got parsed
 94    if not assert_is_container_not_empty(
 95        parsed_matrices,
 96        event_dispatcher,
 97        'PARSE ERROR: missing experiment data matrices'
 98    ): return None
 99
100    return parsed_matrices
101
102
def parse_data_matrix_config(
        data_matrix_config: Any,
        data_registry: DataRegistry,
        data_factory: GroupFactory,
        event_dispatcher: EventDispatcher) -> Union[Tuple[DataMatrixConfig, str],Tuple[None, None]]:
    """Parse a single data matrix configuration entry.

    The entry must be a dictionary. The data subset is parsed first; when that
    fails no further parsing is attempted. Otherwise the rating converter and
    the splitting are parsed from the same entry and combined with the subset
    into a DataMatrixConfig.

    Args:
        data_matrix_config: the data matrix configuration.
        data_registry: the data registry containing the available datasets.
        data_factory: factory with available data modifier factories; the
            subset, rating converter and splitting factories are retrieved
            from it.
        event_dispatcher: to dispatch the parse event on failure.

    Returns:
        parsed_config: the parsed configuration or None on failure.
        dataset_name: the name as returned by the subset parser; None when the
            entry is not a dictionary.
    """
    # only dictionary entries can be parsed
    is_dict = assert_is_type(
        data_matrix_config,
        dict,
        event_dispatcher,
        'PARSE ERROR: invalid dataset matrix entry'
    )
    if not is_dict:
        return None, None

    # parse which dataset/matrix (and filter passes) the entry refers to
    subset, matrix_name = parse_data_subset_config(
        data_matrix_config,
        data_registry,
        data_factory.get_factory(KEY_DATA_SUBSET),
        event_dispatcher
    )
    if not subset:
        # the name is passed back as-is so the caller can report it
        return None, matrix_name

    dataset = data_registry.get_set(subset.dataset)

    # parse the rating converter of the data matrix
    rating_converter = parse_data_convert_config(
        data_matrix_config,
        dataset,
        subset.matrix,
        data_factory.get_factory(KEY_RATING_CONVERTER),
        event_dispatcher
    )

    # parse the splitting of the data matrix
    splitting = parse_data_split_config(
        data_matrix_config,
        dataset,
        subset.matrix,
        data_factory.get_factory(KEY_SPLITTING),
        event_dispatcher
    )

    return DataMatrixConfig(
        subset.dataset,
        subset.matrix,
        subset.filter_passes,
        rating_converter,
        splitting
    ), matrix_name
def parse_data_config( experiment_config: Dict[str, Any], data_registry: src.fairreckitlib.data.set.dataset_registry.DataRegistry, data_factory: src.fairreckitlib.core.config.config_factories.GroupFactory, event_dispatcher: src.fairreckitlib.core.events.event_dispatcher.EventDispatcher) -> Optional[List[src.fairreckitlib.data.pipeline.data_config.DataMatrixConfig]]:
 32def parse_data_config(
 33        experiment_config: Dict[str, Any],
 34        data_registry: DataRegistry,
 35        data_factory: GroupFactory,
 36        event_dispatcher: EventDispatcher) -> Optional[List[DataMatrixConfig]]:
 37    """Parse all dataset configurations.
 38
 39    Args:
 40        experiment_config: the experiment's total configuration.
 41        data_registry: the data registry containing the available datasets.
 42        data_factory: factory with available data modifier factories.
 43        event_dispatcher: to dispatch the parse event on failure.
 44
 45    Returns:
 46        a list of parsed DatasetConfig's or None when empty.
 47    """
 48    # assert KEY_DATA is present
 49    if not assert_is_key_in_dict(
 50        KEY_DATA,
 51        experiment_config,
 52        event_dispatcher,
 53        'PARSE ERROR: missing experiment key \'' + KEY_DATA + '\' (required)'
 54    ): return None
 55
 56    data_matrices_config = experiment_config[KEY_DATA]
 57
 58    # assert data_matrices_config is a list
 59    if not assert_is_type(
 60        data_matrices_config,
 61        list,
 62        event_dispatcher,
 63        'PARSE ERROR: invalid experiment value for key \'' + KEY_DATA + '\''
 64    ): return None
 65
 66    # assert data_matrices_config has list entries
 67    if not assert_is_container_not_empty(
 68        data_matrices_config,
 69        event_dispatcher,
 70        'PARSE ERROR: experiment \'' + KEY_DATA + '\' is empty'
 71    ): return None
 72
 73    parsed_matrices = []
 74
 75    # parse datasets_config list entries
 76    for data_matrix_config in data_matrices_config:
 77        data_matrix, data_matrix_name = parse_data_matrix_config(
 78            data_matrix_config,
 79            data_registry,
 80            data_factory,
 81            event_dispatcher
 82        )
 83        # skip on failure
 84        if data_matrix is None:
 85            event_dispatcher.dispatch(ParseEventArgs(
 86                ON_PARSE,
 87                'PARSE WARNING: failed to parse data matrix \'' +
 88                str(data_matrix_name) + '\', skipping...'
 89            ))
 90            continue
 91
 92        parsed_matrices.append(data_matrix)
 93
 94    # final check to verify at least one data matrix got parsed
 95    if not assert_is_container_not_empty(
 96        parsed_matrices,
 97        event_dispatcher,
 98        'PARSE ERROR: missing experiment data matrices'
 99    ): return None
100
101    return parsed_matrices

Parse all dataset configurations.

Args: experiment_config: the experiment's total configuration. data_registry: the data registry containing the available datasets. data_factory: factory with available data modifier factories. event_dispatcher: to dispatch the parse event on failure.

Returns: a list of the parsed DataMatrixConfig objects, or None when the configuration is invalid or no data matrix could be parsed.

def parse_data_matrix_config( data_matrix_config: Any, data_registry: src.fairreckitlib.data.set.dataset_registry.DataRegistry, data_factory: src.fairreckitlib.core.config.config_factories.GroupFactory, event_dispatcher: src.fairreckitlib.core.events.event_dispatcher.EventDispatcher) -> Union[Tuple[src.fairreckitlib.data.pipeline.data_config.DataMatrixConfig, str], Tuple[NoneType, NoneType]]:
def parse_data_matrix_config(
        data_matrix_config: Any,
        data_registry: DataRegistry,
        data_factory: GroupFactory,
        event_dispatcher: EventDispatcher) -> Union[Tuple[DataMatrixConfig, str],Tuple[None, None]]:
    """Parse a single data matrix configuration entry.

    The entry must be a dictionary. The data subset is parsed first; when that
    fails no further parsing is attempted. Otherwise the rating converter and
    the splitting are parsed from the same entry and combined with the subset
    into a DataMatrixConfig.

    Args:
        data_matrix_config: the data matrix configuration.
        data_registry: the data registry containing the available datasets.
        data_factory: factory with available data modifier factories; the
            subset, rating converter and splitting factories are retrieved
            from it.
        event_dispatcher: to dispatch the parse event on failure.

    Returns:
        parsed_config: the parsed configuration or None on failure.
        dataset_name: the name as returned by the subset parser; None when the
            entry is not a dictionary.
    """
    # only dictionary entries can be parsed
    is_dict = assert_is_type(
        data_matrix_config,
        dict,
        event_dispatcher,
        'PARSE ERROR: invalid dataset matrix entry'
    )
    if not is_dict:
        return None, None

    # parse which dataset/matrix (and filter passes) the entry refers to
    subset, matrix_name = parse_data_subset_config(
        data_matrix_config,
        data_registry,
        data_factory.get_factory(KEY_DATA_SUBSET),
        event_dispatcher
    )
    if not subset:
        # the name is passed back as-is so the caller can report it
        return None, matrix_name

    dataset = data_registry.get_set(subset.dataset)

    # parse the rating converter of the data matrix
    rating_converter = parse_data_convert_config(
        data_matrix_config,
        dataset,
        subset.matrix,
        data_factory.get_factory(KEY_RATING_CONVERTER),
        event_dispatcher
    )

    # parse the splitting of the data matrix
    splitting = parse_data_split_config(
        data_matrix_config,
        dataset,
        subset.matrix,
        data_factory.get_factory(KEY_SPLITTING),
        event_dispatcher
    )

    return DataMatrixConfig(
        subset.dataset,
        subset.matrix,
        subset.filter_passes,
        rating_converter,
        splitting
    ), matrix_name

Parse a data matrix configuration.

Args: data_matrix_config: the data matrix configuration. data_registry: the data registry containing the available datasets. data_factory: factory with available data modifier factories. event_dispatcher: to dispatch the parse event on failure.

Returns: parsed_config: the parsed configuration or None on failure. dataset_name: the name of the parsed dataset or None on failure.