src.fairreckitlib.data.set.dataset_config

This module contains the dataset configuration classes and creation functions.

Constants:

DATASET_RATINGS_EXPLICIT: dataset matrix with explicit ratings.
DATASET_RATINGS_IMPLICIT: dataset matrix with implicit ratings.

Classes:

DatasetFileConfig: the configuration of a dataset file.
DatasetTableConfig: the configuration of a dataset table.
DatasetIndexConfig: the configuration of a dataset matrix's user/item indices.
DatasetMatrixConfig: the configuration of a dataset matrix.
DatasetConfig: the configuration of a dataset.

Functions:

create_dataset_table_config: create configuration for a dataset table.

This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)

  1"""This module contains the dataset configuration classes and creation functions.
  2
  3Constants:
  4
  5    DATASET_RATINGS_EXPLICIT: dataset matrix with explicit ratings.
  6    DATASET_RATINGS_IMPLICIT: dataset matrix with implicit ratings.
  7
  8Classes:
  9
 10    DatasetFileConfig: the configuration of a dataset file.
 11    DatasetTableConfig: the configuration of a dataset table.
 12    DatasetIndexConfig: the configuration of a dataset matrix' user/item indices.
 13    DatasetMatrixConfig: the configuration of a dataset matrix.
 14    DatasetConfig: the configuration of a dataset.
 15
 16Functions:
 17
 18    create_dataset_table_config: create configuration for a dataset table.
 19
 20This program has been developed by students from the bachelor Computer Science at
 21Utrecht University within the Software Project course.
 22© Copyright Utrecht University (Department of Information and Computing Sciences)
 23"""
 24
 25from dataclasses import dataclass
 26import os
 27from typing import Any, Dict, List, Optional, Union
 28
 29import pandas as pd
 30
 31from ...core.core_constants import KEY_NAME
 32from ...core.config.config_yml import YmlConfig, format_yml_config_dict
 33from ...core.io.io_utility import load_array_from_hdf5, save_array_to_hdf5
 34from .dataset_constants import KEY_RATING_MIN, KEY_RATING_MAX, KEY_RATING_TYPE
 35from .dataset_constants import KEY_MATRIX, KEY_IDX_ITEM, KEY_IDX_USER
 36from .dataset_constants import KEY_DATASET, KEY_EVENTS, KEY_MATRICES, KEY_TABLES
 37from .dataset_constants import TABLE_KEY, TABLE_PRIMARY_KEY, TABLE_FOREIGN_KEYS, TABLE_COLUMNS
 38from .dataset_constants import TABLE_FILE, TABLE_COMPRESSION, TABLE_ENCODING
 39from .dataset_constants import TABLE_HEADER, TABLE_NUM_RECORDS, TABLE_SEP
 40
# Identifiers for the two possible rating scales of a dataset matrix;
# used as the value of RatingMatrixConfig.rating_type.
DATASET_RATINGS_EXPLICIT = 'explicit'
DATASET_RATINGS_IMPLICIT = 'implicit'
 43
 44
 45@dataclass
 46class FileOptionsConfig(YmlConfig):
 47    r"""File Options Configuration.
 48
 49    sep: the separator in the file or None for \t.
 50    compression: the (optional) compression of the file.
 51    encoding: the encoding of the file or None for 'utf-8'.
 52    header: is there a header on the first line of the file.
 53    """
 54
 55    sep: Optional[str]
 56    compression: Optional[str]
 57    encoding: Optional[str]
 58    header: bool
 59
 60    def to_yml_format(self):
 61        """Format file settings configuration to a yml compatible dictionary.
 62
 63        Returns:
 64            a dictionary containing the dataset file configuration.
 65        """
 66        yml_format = {}
 67
 68        if self.header:
 69            yml_format[TABLE_HEADER] = self.header
 70        if self.sep is not None:
 71            yml_format[TABLE_SEP] = self.sep
 72        if self.compression is not None:
 73            yml_format[TABLE_COMPRESSION] = self.compression
 74        if self.encoding is not None:
 75            yml_format[TABLE_ENCODING] = self.encoding
 76
 77        return yml_format
 78
 79
 80@dataclass
 81class DatasetFileConfig(YmlConfig):
 82    """Dataset File Configuration.
 83
 84    name: the file name.
 85    options: the file options.
 86    """
 87
 88    name: str
 89    options: FileOptionsConfig
 90
 91    def to_yml_format(self):
 92        """Format dataset file configuration to a yml compatible dictionary.
 93
 94        Returns:
 95            a dictionary containing the dataset file configuration.
 96        """
 97        yml_format = {KEY_NAME: self.name}
 98        yml_format.update(self.options.to_yml_format())
 99        return yml_format
100
101
@dataclass
class DatasetTableConfig(YmlConfig):
    """Dataset Table Configuration.

    The configuration expects the table to have the primary key as the first column(s) and in order
    in which they are specified. These are followed by the columns of relevant data and any foreign
    keys should be in order as the last column(s). However, it is also allowed that the foreign
    keys describe the primary key, but individually rather than the combination of.
    Moreover, both the values of the primary key and columns are expected to be splittable into two
    based on the DATASET_SPLIT_DELIMITER. The left split is the prefix (in case of duplicate names
    across tables) and the right split is the name of the key/column (e.g. user_id and item_id).

    primary_key: list of column names that form the primary key of the table.
    foreign_keys: (optional) list of column names that are foreign keys in other tables.
    columns: list of column names that contain the relevant table data.
    num_records: the number of records in the table.
    file: the dataset file configuration of the table.
    """

    primary_key: List[str]
    foreign_keys: Optional[List[str]]
    columns: List[str]
    num_records: int
    file: DatasetFileConfig

    def read_table(
            self,
            dataset_dir: str,
            *,
            columns: Optional[List[Union[str, int]]]=None,
            chunk_size: Optional[int]=None) -> pd.DataFrame:
        """Read the table from the specified directory.

        Args:
            dataset_dir: the directory to read the table from.
            columns: subset list of columns to load or None to load all.
                All elements must either be integer indices or
                strings that correspond to the 'names' argument.
            chunk_size: loads the table in chunks as an iterator or
                the entire table when None.

        Returns:
            the resulting table, or a chunk iterator when chunk_size is set.
        """
        # Reconstruct the full column layout: primary key first, then the data
        # columns, then any foreign keys that are not already part of the key.
        names = self.primary_key + self.columns
        if self.foreign_keys is not None:
            names += [key for key in self.foreign_keys if key not in self.primary_key]

        dataset_table = pd.read_table(
            os.path.join(dataset_dir, self.file.name),
            sep=self.file.options.sep if self.file.options.sep is not None else '\t',
            # Skip the first line when the file stores its own header;
            # the explicit 'names' list is used either way.
            header=0 if self.file.options.header else None,
            names=names,
            usecols=columns,
            encoding=self.file.options.encoding
            if self.file.options.encoding is not None else 'utf-8',
            compression=self.file.options.compression
            if self.file.options.compression is not None else 'infer',
            chunksize=chunk_size,
            iterator=bool(chunk_size)
        )

        return dataset_table

    def save_table(self, dataset_table: pd.DataFrame, dataset_dir: str) -> None:
        """Save the table in the specified directory.

        Args:
            dataset_table: the dataframe to save with this table configuration.
            dataset_dir: the directory to save the table to.
        """
        dataset_table.to_csv(
            os.path.join(dataset_dir, self.file.name),
            sep=self.file.options.sep if self.file.options.sep else '\t',
            header=self.file.options.header,
            # The key/column layout is positional, so the index is not stored.
            index=False,
            encoding=self.file.options.encoding
            if self.file.options.encoding else 'utf-8',
            compression=self.file.options.compression
            if self.file.options.compression else 'infer',
        )

    def to_yml_format(self) -> Dict[str, Any]:
        """Format dataset table configuration to a yml compatible dictionary.

        Returns:
            a dictionary containing the dataset table configuration.
        """
        yml_format = {
            TABLE_FILE: self.file.to_yml_format(),
            TABLE_PRIMARY_KEY: self.primary_key,
            TABLE_COLUMNS: self.columns,
            TABLE_NUM_RECORDS: self.num_records,
        }

        # Foreign keys are optional and therefore only added when present.
        if self.foreign_keys is not None:
            yml_format[TABLE_FOREIGN_KEYS] = self.foreign_keys

        return yml_format
201
202
@dataclass
class DatasetIndexConfig(YmlConfig):
    """Dataset Matrix' Index Configuration.

    file_name: (optional) file name that contains the user/item indirection array.
    key: the key that is associated with the user/item.
    num_records: the number of user/item records
    """

    file_name: Optional[str]
    key: str
    num_records: int

    def load_indices(self, dataset_dir: str) -> Optional[List[int]]:
        """Load the indices from the specified directory.

        This function raises a FileNotFoundError when the file is not
        found in the specified directory.

        Args:
            dataset_dir: the directory to load the indices from.

        Returns:
            the resulting indices or None when not available.
        """
        if self.file_name is None:
            return None

        index_path = os.path.join(dataset_dir, self.file_name)
        return load_array_from_hdf5(index_path, 'indices')

    def save_indices(self, dataset_dir: str, indices: List[int]) -> bool:
        """Save the indices to the specified directory.

        Args:
            dataset_dir: the directory to save the indices to.
            indices: the list of indices to save.

        Returns:
            true when the indices are saved or false when the configuration has no file name.
        """
        if self.file_name is None:
            return False

        index_path = os.path.join(dataset_dir, self.file_name)
        save_array_to_hdf5(index_path, indices, 'indices')
        return True

    def to_yml_format(self) -> Dict[str, Any]:
        """Format dataset index configuration to a yml compatible dictionary.

        Returns:
            a dictionary containing the dataset index configuration.
        """
        yml_format: Dict[str, Any] = {
            TABLE_KEY: self.key,
            TABLE_NUM_RECORDS: self.num_records
        }

        # The indirection array file is optional and only added when present.
        if self.file_name is not None:
            yml_format[TABLE_FILE] = self.file_name

        return yml_format
264
265
@dataclass
class RatingMatrixConfig(YmlConfig):
    """Rating Matrix Configuration.

    rating_min: the minimum rating in the matrix.
    rating_max: the maximum rating in the matrix.
    rating_type: the type of the rating in the matrix, either 'explicit' or 'implicit'.
    """

    rating_min: float
    rating_max: float
    rating_type: str

    def to_yml_format(self) -> Dict[str, Any]:
        """Format rating matrix configuration to a yml compatible dictionary.

        Returns:
            a dictionary containing the rating matrix configuration.
        """
        yml_format = {}
        yml_format[KEY_RATING_MIN] = self.rating_min
        yml_format[KEY_RATING_MAX] = self.rating_max
        yml_format[KEY_RATING_TYPE] = self.rating_type
        return yml_format
290
291
292
@dataclass
class DatasetMatrixConfig(YmlConfig):
    """Dataset Matrix Configuration.

    table: the table configuration of the matrix.
    ratings: the rating configuration of the matrix.
    user: the dataset index configuration for the users in the matrix.
    item: the dataset index configuration for the items in the matrix.
    """

    table: DatasetTableConfig
    ratings: RatingMatrixConfig
    user: DatasetIndexConfig
    item: DatasetIndexConfig

    def load_matrix(self, dataset_dir: str) -> pd.DataFrame:
        """Load the matrix from the specified directory.

        Args:
            dataset_dir: directory path to where the dataset matrix is stored.

        Returns:
            the resulting matrix with standardized column names.
        """
        dataframe = self.table.read_table(dataset_dir)

        # Map the dataset-specific column names onto the standardized
        # 'user'/'item'/'rating' (and optionally 'timestamp') names.
        rename_map = {
            self.user.key: 'user',
            self.item.key: 'item',
            self.table.columns[0]: 'rating'
        }
        if len(self.table.columns) == 2:
            rename_map[self.table.columns[1]] = 'timestamp'

        return dataframe.rename(columns=rename_map)

    def to_yml_format(self) -> Dict[str, Any]:
        """Format dataset matrix configuration to a yml compatible dictionary.

        Returns:
            a dictionary containing the dataset matrix configuration.
        """
        yml_format = self.ratings.to_yml_format()
        yml_format[KEY_IDX_ITEM] = self.item.to_yml_format()
        yml_format[KEY_IDX_USER] = self.user.to_yml_format()
        yml_format[KEY_MATRIX] = self.table.to_yml_format()
        return yml_format
342
343
@dataclass
class DatasetConfig(YmlConfig):
    """Dataset Configuration.

    dataset_name: the name of the dataset.
    events: dictionary containing the available user event tables.
    matrices: dictionary containing the available user-item matrices.
    tables: dictionary containing the (additionally) available tables.
    """

    dataset_name: str
    events: Dict[str, DatasetTableConfig]
    matrices: Dict[str, DatasetMatrixConfig]
    tables: Dict[str, DatasetTableConfig]

    def get_available_columns(self, matrix_name: str) -> Dict[str, List[str]]:
        """Get the available columns of the specified matrix.

        Only the table names and columns that have a one-to-one relation will be returned.
        This function does not raise errors and will return an empty dictionary when
        the specified matrix is not present in the dataset.

        Args:
            matrix_name: the name of the matrix to get the available columns of.

        Returns:
            a dictionary containing the table names as key and the available columns as value.
        """
        if matrix_name not in self.matrices:
            return {}

        matrix_config = self.matrices[matrix_name]
        result = {KEY_MATRIX: matrix_config.table.columns}

        user_key = [matrix_config.user.key]
        item_key = [matrix_config.item.key]
        # A table relates one-to-one to the matrix when its primary key is the
        # user key, the item key, or the combined (user, item) key.
        # Concatenate the key lists: the previous [user_key, item_key] produced
        # a list of lists that could never match a table's flat primary_key.
        user_item_key = user_key + item_key

        for table_name, table_config in self.tables.items():
            key = table_config.primary_key
            if key in (user_key, item_key, user_item_key):
                result[table_name] = table_config.columns

        return result

    def to_yml_format(self) -> Dict[str, Any]:
        """Format dataset configuration to a yml compatible dictionary.

        Returns:
            a dictionary containing the dataset configuration.
        """
        yml_format = {KEY_DATASET: self.dataset_name}

        # Only include sections that are non-empty to keep the yml minimal.
        if len(self.events) > 0:
            yml_format[KEY_EVENTS] = format_yml_config_dict(self.events)
        if len(self.matrices) > 0:
            yml_format[KEY_MATRICES] = format_yml_config_dict(self.matrices)
        if len(self.tables) > 0:
            yml_format[KEY_TABLES] = format_yml_config_dict(self.tables)

        return yml_format
405
406
def create_dataset_table_config(
        file_name: str,
        primary_key: List[str],
        columns: List[str],
        *,
        compression: Optional[str]=None,
        encoding: Optional[str]=None,
        foreign_keys: Optional[List[str]]=None,
        header: bool=False,
        num_records: int=0,
        sep: Optional[str]=None) -> DatasetTableConfig:
    """Create a dataset table configuration.

    Args:
        file_name: name of the dataset table file.
        primary_key: a list of strings that are combined into the primary key of the table.
        columns: a list of strings with other available columns in the table.
        compression: the (optional) compression of the file, 'bz2' is recommended.
        encoding: the encoding for reading/writing the table contents or None for 'utf-8'.
        foreign_keys: (optional) list of column names that are foreign keys in other tables.
        header: whether the table file contains a header on the first line.
        num_records: the number of records in the table.
        sep: the delimiter that is used in the table or None for a tab separator.

    Returns:
        the resulting dataset table configuration.
    """
    return DatasetTableConfig(
        primary_key,
        foreign_keys,
        columns,
        num_records,
        DatasetFileConfig(
            file_name,
            FileOptionsConfig(
                sep,
                compression,
                encoding,
                header
            )
        )
    )
@dataclass
class FileOptionsConfig(src.fairreckitlib.core.config.config_yml.YmlConfig):
46@dataclass
47class FileOptionsConfig(YmlConfig):
48    r"""File Options Configuration.
49
50    sep: the separator in the file or None for \t.
51    compression: the (optional) compression of the file.
52    encoding: the encoding of the file or None for 'utf-8'.
53    header: is there a header on the first line of the file.
54    """
55
56    sep: Optional[str]
57    compression: Optional[str]
58    encoding: Optional[str]
59    header: bool
60
61    def to_yml_format(self):
62        """Format file settings configuration to a yml compatible dictionary.
63
64        Returns:
65            a dictionary containing the dataset file configuration.
66        """
67        yml_format = {}
68
69        if self.header:
70            yml_format[TABLE_HEADER] = self.header
71        if self.sep is not None:
72            yml_format[TABLE_SEP] = self.sep
73        if self.compression is not None:
74            yml_format[TABLE_COMPRESSION] = self.compression
75        if self.encoding is not None:
76            yml_format[TABLE_ENCODING] = self.encoding
77
78        return yml_format

File Options Configuration.

sep: the separator in the file or None for \t. compression: the (optional) compression of the file. encoding: the encoding of the file or None for 'utf-8'. header: is there a header on the first line of the file.

FileOptionsConfig( sep: Optional[str], compression: Optional[str], encoding: Optional[str], header: bool)
def to_yml_format(self)
61    def to_yml_format(self):
62        """Format file settings configuration to a yml compatible dictionary.
63
64        Returns:
65            a dictionary containing the dataset file configuration.
66        """
67        yml_format = {}
68
69        if self.header:
70            yml_format[TABLE_HEADER] = self.header
71        if self.sep is not None:
72            yml_format[TABLE_SEP] = self.sep
73        if self.compression is not None:
74            yml_format[TABLE_COMPRESSION] = self.compression
75        if self.encoding is not None:
76            yml_format[TABLE_ENCODING] = self.encoding
77
78        return yml_format

Format file settings configuration to a yml compatible dictionary.

Returns: a dictionary containing the dataset file configuration.

@dataclass
class DatasetFileConfig(src.fairreckitlib.core.config.config_yml.YmlConfig):
 81@dataclass
 82class DatasetFileConfig(YmlConfig):
 83    """Dataset File Configuration.
 84
 85    name: the file name.
 86    options: the file options.
 87    """
 88
 89    name: str
 90    options: FileOptionsConfig
 91
 92    def to_yml_format(self):
 93        """Format dataset file configuration to a yml compatible dictionary.
 94
 95        Returns:
 96            a dictionary containing the dataset file configuration.
 97        """
 98        yml_format = {KEY_NAME: self.name}
 99        yml_format.update(self.options.to_yml_format())
100        return yml_format

Dataset File Configuration.

name: the file name. options: the file options.

DatasetFileConfig( name: str, options: src.fairreckitlib.data.set.dataset_config.FileOptionsConfig)
def to_yml_format(self)
 92    def to_yml_format(self):
 93        """Format dataset file configuration to a yml compatible dictionary.
 94
 95        Returns:
 96            a dictionary containing the dataset file configuration.
 97        """
 98        yml_format = {KEY_NAME: self.name}
 99        yml_format.update(self.options.to_yml_format())
100        return yml_format

Format dataset file configuration to a yml compatible dictionary.

Returns: a dictionary containing the dataset file configuration.

@dataclass
class DatasetTableConfig(src.fairreckitlib.core.config.config_yml.YmlConfig):
103@dataclass
104class DatasetTableConfig(YmlConfig):
105    """Dataset Table Configuration.
106
107    The configuration expects the table to have the primary key as the first column(s) and in order
108    in which they are specified. These are followed by the columns of relevant data and any foreign
109    keys should be in order as the last column(s). However, it is also allowed that the foreign
110    keys describe the primary key, but individually rather than the combination of.
111    Moreover, both the values of the primary key and columns are expected to be splittable into two
112    based on the DATASET_SPLIT_DELIMITER. The left split is the prefix (in case of duplicate names
113    across tables) and the right split is the name of the key/column (e.g. user_id and item_id).
114
115    primary_key: list of column names that form the primary key of the table.
116    foreign_keys: (optional) list of column names that are foreign keys in other tables.
117    columns: list of column names that contain the relevant table data.
118    num_records: the number of records in the table.
119    file: the dataset file configuration of the table.
120    """
121
122    primary_key: List[str]
123    foreign_keys: Optional[List[str]]
124    columns: List[str]
125    num_records: int
126    file: DatasetFileConfig
127
128    def read_table(
129            self,
130            dataset_dir: str,
131            *,
132            columns: List[Union[str, int]]=None,
133            chunk_size=None) -> pd.DataFrame:
134        """Read the table from the specified directory.
135
136        Args:
137            dataset_dir: the directory to read the table from.
138            columns: subset list of columns to load or None to load all.
139                All elements must either be integer indices or
140                strings that correspond to the 'names' argument.
141            chunk_size: loads the table in chunks as an iterator or
142                the entire table when None.
143
144        Returns:
145            the resulting table (iterator).
146        """
147        names = self.primary_key + self.columns
148        if self.foreign_keys is not None:
149            names += [key for key in self.foreign_keys if key not in self.primary_key]
150
151        dataset_table = pd.read_table(
152            os.path.join(dataset_dir, self.file.name),
153            sep=self.file.options.sep if self.file.options.sep is not None else '\t',
154            header=0 if self.file.options.header else None,
155            names=names,
156            usecols=columns,
157            encoding=self.file.options.encoding
158            if self.file.options.encoding is not None else 'utf-8',
159            compression=self.file.options.compression
160            if self.file.options.compression is not None else 'infer',
161            chunksize=chunk_size,
162            iterator=bool(chunk_size)
163        )
164
165        return dataset_table
166
167    def save_table(self, dataset_table: pd.DataFrame, dataset_dir: str) -> None:
168        """Save the table in the specified directory.
169
170        Args:
171            dataset_table: the dataframe to save with this table configuration.
172            dataset_dir: the directory to save the table to.
173        """
174        dataset_table.to_csv(
175            os.path.join(dataset_dir, self.file.name),
176            sep=self.file.options.sep if self.file.options.sep else '\t',
177            header=self.file.options.header,
178            index=False,
179            encoding=self.file.options.encoding
180            if self.file.options.encoding else 'utf-8',
181            compression=self.file.options.compression
182            if self.file.options.compression else 'infer',
183        )
184
185    def to_yml_format(self) -> Dict[str, Any]:
186        """Format dataset table configuration to a yml compatible dictionary.
187
188        Returns:
189            a dictionary containing the dataset table configuration.
190        """
191        yml_format = {
192            TABLE_FILE: self.file.to_yml_format(),
193            TABLE_PRIMARY_KEY: self.primary_key,
194            TABLE_COLUMNS: self.columns,
195            TABLE_NUM_RECORDS: self.num_records,
196        }
197
198        if self.foreign_keys is not None:
199            yml_format[TABLE_FOREIGN_KEYS] = self.foreign_keys
200
201        return yml_format

Dataset Table Configuration.

The configuration expects the table to have the primary key as the first column(s) and in order in which they are specified. These are followed by the columns of relevant data and any foreign keys should be in order as the last column(s). However, it is also allowed that the foreign keys describe the primary key, but individually rather than the combination of. Moreover, both the values of the primary key and columns are expected to be splittable into two based on the DATASET_SPLIT_DELIMITER. The left split is the prefix (in case of duplicate names across tables) and the right split is the name of the key/column (e.g. user_id and item_id).

primary_key: list of column names that form the primary key of the table. foreign_keys: (optional) list of column names that are foreign keys in other tables. columns: list of column names that contain the relevant table data. num_records: the number of records in the table. file: the dataset file configuration of the table.

DatasetTableConfig( primary_key: List[str], foreign_keys: Optional[List[str]], columns: List[str], num_records: int, file: src.fairreckitlib.data.set.dataset_config.DatasetFileConfig)
def read_table( self, dataset_dir: str, *, columns: List[Union[str, int]] = None, chunk_size=None) -> pandas.core.frame.DataFrame:
128    def read_table(
129            self,
130            dataset_dir: str,
131            *,
132            columns: List[Union[str, int]]=None,
133            chunk_size=None) -> pd.DataFrame:
134        """Read the table from the specified directory.
135
136        Args:
137            dataset_dir: the directory to read the table from.
138            columns: subset list of columns to load or None to load all.
139                All elements must either be integer indices or
140                strings that correspond to the 'names' argument.
141            chunk_size: loads the table in chunks as an iterator or
142                the entire table when None.
143
144        Returns:
145            the resulting table (iterator).
146        """
147        names = self.primary_key + self.columns
148        if self.foreign_keys is not None:
149            names += [key for key in self.foreign_keys if key not in self.primary_key]
150
151        dataset_table = pd.read_table(
152            os.path.join(dataset_dir, self.file.name),
153            sep=self.file.options.sep if self.file.options.sep is not None else '\t',
154            header=0 if self.file.options.header else None,
155            names=names,
156            usecols=columns,
157            encoding=self.file.options.encoding
158            if self.file.options.encoding is not None else 'utf-8',
159            compression=self.file.options.compression
160            if self.file.options.compression is not None else 'infer',
161            chunksize=chunk_size,
162            iterator=bool(chunk_size)
163        )
164
165        return dataset_table

Read the table from the specified directory.

Args: dataset_dir: the directory to read the table from. columns: subset list of columns to load or None to load all. All elements must either be integer indices or strings that correspond to the 'names' argument. chunk_size: loads the table in chunks as an iterator or the entire table when None.

Returns: the resulting table (iterator).

def save_table( self, dataset_table: pandas.core.frame.DataFrame, dataset_dir: str) -> None:
167    def save_table(self, dataset_table: pd.DataFrame, dataset_dir: str) -> None:
168        """Save the table in the specified directory.
169
170        Args:
171            dataset_table: the dataframe to save with this table configuration.
172            dataset_dir: the directory to save the table to.
173        """
174        dataset_table.to_csv(
175            os.path.join(dataset_dir, self.file.name),
176            sep=self.file.options.sep if self.file.options.sep else '\t',
177            header=self.file.options.header,
178            index=False,
179            encoding=self.file.options.encoding
180            if self.file.options.encoding else 'utf-8',
181            compression=self.file.options.compression
182            if self.file.options.compression else 'infer',
183        )

Save the table in the specified directory.

Args: dataset_table: the dataframe to save with this table configuration. dataset_dir: the directory to save the table to.

def to_yml_format(self) -> Dict[str, Any]:
185    def to_yml_format(self) -> Dict[str, Any]:
186        """Format dataset table configuration to a yml compatible dictionary.
187
188        Returns:
189            a dictionary containing the dataset table configuration.
190        """
191        yml_format = {
192            TABLE_FILE: self.file.to_yml_format(),
193            TABLE_PRIMARY_KEY: self.primary_key,
194            TABLE_COLUMNS: self.columns,
195            TABLE_NUM_RECORDS: self.num_records,
196        }
197
198        if self.foreign_keys is not None:
199            yml_format[TABLE_FOREIGN_KEYS] = self.foreign_keys
200
201        return yml_format

Format dataset table configuration to a yml compatible dictionary.

Returns: a dictionary containing the dataset table configuration.

@dataclass
class DatasetIndexConfig(YmlConfig):
    """Dataset Matrix' Index Configuration.

    file_name: (optional) file name that contains the user/item indirection array.
    key: the key that is associated with the user/item.
    num_records: the number of user/item records.
    """

    file_name: Optional[str]
    key: str
    num_records: int

    def load_indices(self, dataset_dir: str) -> Optional[List[int]]:
        """Load the indices from the specified directory.

        This function raises a FileNotFoundError when the file is not
        found in the specified directory.

        Args:
            dataset_dir: the directory to load the indices from.

        Returns:
            the resulting indices or None when not available.
        """
        if self.file_name is not None:
            file_path = os.path.join(dataset_dir, self.file_name)
            return load_array_from_hdf5(file_path, 'indices')

        # No indirection array file is configured for this index.
        return None

    def save_indices(self, dataset_dir: str, indices: List[int]) -> bool:
        """Save the indices to the specified directory.

        Args:
            dataset_dir: the directory to save the indices to.
            indices: the list of indices to save.

        Returns:
            true when the indices are saved or false when the configuration has no file name.
        """
        if self.file_name is None:
            return False

        file_path = os.path.join(dataset_dir, self.file_name)
        save_array_to_hdf5(file_path, indices, 'indices')
        return True

    def to_yml_format(self) -> Dict[str, Any]:
        """Format dataset index configuration to a yml compatible dictionary.

        Returns:
            a dictionary containing the dataset index configuration.
        """
        result: Dict[str, Any] = {
            TABLE_KEY: self.key,
            TABLE_NUM_RECORDS: self.num_records,
        }
        # Only include the file entry when an indirection array file exists.
        if self.file_name is not None:
            result[TABLE_FILE] = self.file_name

        return result

Dataset Matrix' Index Configuration.

file_name: (optional) file name that contains the user/item indirection array. key: the key that is associated with the user/item. num_records: the number of user/item records

DatasetIndexConfig(file_name: Optional[str], key: str, num_records: int)
def load_indices(self, dataset_dir: str) -> Optional[List[int]]:
    """Load the indices from the specified directory.

    This function raises a FileNotFoundError when the file is not
    found in the specified directory.

    Args:
        dataset_dir: the directory to load the indices from.

    Returns:
        the resulting indices or None when not available.
    """
    if self.file_name is not None:
        file_path = os.path.join(dataset_dir, self.file_name)
        return load_array_from_hdf5(file_path, 'indices')

    # No indirection array file is configured for this index.
    return None

Load the indices from the specified directory.

This function raises a FileNotFoundError when the file is not found in the specified directory.

Args: dataset_dir: the directory to load the indices from.

Returns: the resulting indices or None when not available.

def save_indices(self, dataset_dir: str, indices: List[int]) -> bool:
    """Save the indices to the specified directory.

    Args:
        dataset_dir: the directory to save the indices to.
        indices: the list of indices to save.

    Returns:
        true when the indices are saved or false when the configuration has no file name.
    """
    if self.file_name is None:
        # Nothing to do without a configured indirection array file.
        return False

    file_path = os.path.join(dataset_dir, self.file_name)
    save_array_to_hdf5(file_path, indices, 'indices')
    return True

Save the indices to the specified directory.

Args: dataset_dir: the directory to save the indices to. indices: the list of indices to save.

Returns: true when the indices are saved or false when the configuration has no file name.

def to_yml_format(self) -> Dict[str, Any]:
    """Format dataset index configuration to a yml compatible dictionary.

    Returns:
        a dictionary containing the dataset index configuration.
    """
    result: Dict[str, Any] = {
        TABLE_KEY: self.key,
        TABLE_NUM_RECORDS: self.num_records,
    }
    # Only include the file entry when an indirection array file exists.
    if self.file_name is not None:
        result[TABLE_FILE] = self.file_name

    return result

Format dataset index configuration to a yml compatible dictionary.

Returns: a dictionary containing the dataset index configuration.

@dataclass
class RatingMatrixConfig(YmlConfig):
    """Rating Matrix Configuration.

    rating_min: the minimum rating in the matrix.
    rating_max: the maximum rating in the matrix.
    rating_type: the type of the rating in the matrix, either 'explicit' or 'implicit'.
    """

    rating_min: float
    rating_max: float
    rating_type: str

    def to_yml_format(self) -> Dict[str, Any]:
        """Format rating matrix configuration to a yml compatible dictionary.

        Returns:
            a dictionary containing the rating matrix configuration.
        """
        return {
            KEY_RATING_MIN: self.rating_min,
            KEY_RATING_MAX: self.rating_max,
            KEY_RATING_TYPE: self.rating_type,
        }

Rating Matrix Configuration.

rating_min: the minimum rating in the matrix. rating_max: the maximum rating in the matrix. rating_type: the type of the rating in the matrix, either 'explicit' or 'implicit'.

RatingMatrixConfig(rating_min: float, rating_max: float, rating_type: str)
def to_yml_format(self) -> Dict[str, Any]:
    """Format rating matrix configuration to a yml compatible dictionary.

    Returns:
        a dictionary containing the rating matrix configuration.
    """
    yml_format: Dict[str, Any] = {}
    yml_format[KEY_RATING_MIN] = self.rating_min
    yml_format[KEY_RATING_MAX] = self.rating_max
    yml_format[KEY_RATING_TYPE] = self.rating_type
    return yml_format

Format rating matrix configuration to a yml compatible dictionary.

Returns: a dictionary containing the dataset matrix configuration.

@dataclass
class DatasetMatrixConfig(YmlConfig):
    """Dataset Matrix Configuration.

    table: the table configuration of the matrix.
    ratings: the rating configuration of the matrix.
    user: the dataset index configuration for the users in the matrix.
    item: the dataset index configuration for the items in the matrix.
    """

    table: DatasetTableConfig
    ratings: RatingMatrixConfig
    user: DatasetIndexConfig
    item: DatasetIndexConfig

    def load_matrix(self, dataset_dir: str) -> pd.DataFrame:
        """Load the matrix from the specified directory.

        Args:
            dataset_dir: directory path to where the dataset matrix is stored.

        Returns:
            the resulting matrix with standardised column names.
        """
        frame = self.table.read_table(dataset_dir)

        # Standardise the column names: the first table column is always the
        # rating and an (optional) second column holds the timestamp.
        renaming = {
            self.user.key: 'user',
            self.item.key: 'item',
            self.table.columns[0]: 'rating',
        }
        if len(self.table.columns) == 2:
            renaming[self.table.columns[1]] = 'timestamp'

        return frame.rename(columns=renaming)

    def to_yml_format(self) -> Dict[str, Any]:
        """Format dataset matrix configuration to a yml compatible dictionary.

        Returns:
            a dictionary containing the dataset matrix configuration.
        """
        yml_format = self.ratings.to_yml_format()
        yml_format[KEY_IDX_ITEM] = self.item.to_yml_format()
        yml_format[KEY_IDX_USER] = self.user.to_yml_format()
        yml_format[KEY_MATRIX] = self.table.to_yml_format()
        return yml_format

Dataset Matrix Configuration.

table: the table configuration of the matrix. ratings: the rating configuration of the matrix. user: the dataset index configuration for the users in the matrix. item: the dataset index configuration for the items in the matrix.

DatasetMatrixConfig( table: src.fairreckitlib.data.set.dataset_config.DatasetTableConfig, ratings: src.fairreckitlib.data.set.dataset_config.RatingMatrixConfig, user: src.fairreckitlib.data.set.dataset_config.DatasetIndexConfig, item: src.fairreckitlib.data.set.dataset_config.DatasetIndexConfig)
def load_matrix(self, dataset_dir: str) -> pd.DataFrame:
    """Load the matrix from the specified directory.

    Args:
        dataset_dir: directory path to where the dataset matrix is stored.

    Returns:
        the resulting matrix with standardised column names.
    """
    frame = self.table.read_table(dataset_dir)

    # Standardise the column names: the first table column is always the
    # rating and an (optional) second column holds the timestamp.
    table_columns = self.table.columns
    renaming = {self.user.key: 'user', self.item.key: 'item'}
    renaming[table_columns[0]] = 'rating'
    if len(table_columns) == 2:
        renaming[table_columns[1]] = 'timestamp'

    return frame.rename(columns=renaming)

Load the matrix from the specified directory.

Args: dataset_dir: directory path to where the dataset matrix is stored.

Returns: the resulting matrix (iterator).

def to_yml_format(self) -> Dict[str, Any]:
    """Format dataset matrix configuration to a yml compatible dictionary.

    Returns:
        a dictionary containing the dataset matrix configuration.
    """
    # Start from the rating configuration and add the index/matrix sections.
    yml_format = self.ratings.to_yml_format()
    yml_format[KEY_IDX_ITEM] = self.item.to_yml_format()
    yml_format[KEY_IDX_USER] = self.user.to_yml_format()
    yml_format[KEY_MATRIX] = self.table.to_yml_format()
    return yml_format

Format dataset matrix configuration to a yml compatible dictionary.

Returns: a dictionary containing the dataset matrix configuration.

@dataclass
class DatasetConfig(YmlConfig):
    """Dataset Configuration.

    dataset_name: the name of the dataset.
    events: dictionary containing the available user event tables.
    matrices: dictionary containing the available user-item matrices.
    tables: dictionary containing the (additionally) available tables.
    """

    dataset_name: str
    events: Dict[str, DatasetTableConfig]
    matrices: Dict[str, DatasetMatrixConfig]
    tables: Dict[str, DatasetTableConfig]

    def get_available_columns(self, matrix_name: str) -> Dict[str, List[str]]:
        """Get the available columns of the specified matrix.

        Only the table names and columns that have a one-to-one relation will be returned.
        This function does not raise errors and will return an empty dictionary when
        the specified matrix is not present in the dataset.

        Args:
            matrix_name: the name of the matrix to get the available columns of.

        Returns:
            a dictionary containing the table names as key and the available columns as value.
        """
        if matrix_name not in self.matrices:
            return {}

        matrix_config = self.matrices[matrix_name]
        result = {KEY_MATRIX: matrix_config.table.columns}

        user_key = [matrix_config.user.key]
        item_key = [matrix_config.item.key]
        # BUGFIX: the combined key must be the *flat* concatenation of the user
        # and item keys, matching DatasetTableConfig.primary_key (a flat list of
        # strings). The previous nested form [[user], [item]] could never match.
        user_item_key = user_key + item_key

        for table_name, table_config in self.tables.items():
            # A table relates one-to-one when its primary key is the user key,
            # the item key, or the combined (user, item) key.
            if table_config.primary_key in (user_key, item_key, user_item_key):
                result[table_name] = table_config.columns

        return result

    def to_yml_format(self) -> Dict[str, Any]:
        """Format dataset configuration to a yml compatible dictionary.

        Returns:
            a dictionary containing the dataset configuration.
        """
        yml_format: Dict[str, Any] = {KEY_DATASET: self.dataset_name}

        # Empty sections are omitted to keep the yml output compact.
        if len(self.events) > 0:
            yml_format[KEY_EVENTS] = format_yml_config_dict(self.events)
        if len(self.matrices) > 0:
            yml_format[KEY_MATRICES] = format_yml_config_dict(self.matrices)
        if len(self.tables) > 0:
            yml_format[KEY_TABLES] = format_yml_config_dict(self.tables)

        return yml_format

Dataset Configuration.

dataset_name: the name of the dataset. events: dictionary containing the available user event tables. matrices: dictionary containing the available user-item matrices. tables: dictionary containing the (additionally) available tables.

DatasetConfig( dataset_name: str, events: Dict[str, src.fairreckitlib.data.set.dataset_config.DatasetTableConfig], matrices: Dict[str, src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig], tables: Dict[str, src.fairreckitlib.data.set.dataset_config.DatasetTableConfig])
def get_available_columns(self, matrix_name: str) -> Dict[str, List[str]]:
    """Get the available columns of the specified matrix.

    Only the table names and columns that have a one-to-one relation will be returned.
    This function does not raise errors and will return an empty dictionary when
    the specified matrix is not present in the dataset.

    Args:
        matrix_name: the name of the matrix to get the available columns of.

    Returns:
        a dictionary containing the table names as key and the available columns as value.
    """
    if matrix_name not in self.matrices:
        return {}

    matrix_config = self.matrices[matrix_name]
    result = {KEY_MATRIX: matrix_config.table.columns}

    user_key = [matrix_config.user.key]
    item_key = [matrix_config.item.key]
    # BUGFIX: the combined key must be the *flat* concatenation of the user
    # and item keys, matching DatasetTableConfig.primary_key (a flat list of
    # strings). The previous nested form [[user], [item]] could never match.
    user_item_key = user_key + item_key

    for table_name, table_config in self.tables.items():
        # A table relates one-to-one when its primary key is the user key,
        # the item key, or the combined (user, item) key.
        if table_config.primary_key in (user_key, item_key, user_item_key):
            result[table_name] = table_config.columns

    return result

Get the available columns of the specified matrix.

Only the table names and columns that have a one-to-one relation will be returned. This function does not raise errors and will return an empty dictionary when the specified matrix is not present in the dataset.

Args: matrix_name: the name of the matrix to get the available columns of.

Returns: a dictionary containing the table names as key and the available columns as value.

def to_yml_format(self) -> Dict[str, Any]:
    """Format dataset configuration to a yml compatible dictionary.

    Returns:
        a dictionary containing the dataset configuration.
    """
    yml_format: Dict[str, Any] = {KEY_DATASET: self.dataset_name}

    # Empty sections are omitted to keep the yml output compact.
    if self.events:
        yml_format[KEY_EVENTS] = format_yml_config_dict(self.events)
    if self.matrices:
        yml_format[KEY_MATRICES] = format_yml_config_dict(self.matrices)
    if self.tables:
        yml_format[KEY_TABLES] = format_yml_config_dict(self.tables)

    return yml_format

Format dataset configuration to a yml compatible dictionary.

Returns: a dictionary containing the dataset configuration.

def create_dataset_table_config(
        file_name: str,
        primary_key: List[str],
        columns: List[str],
        *,
        compression: Optional[str]=None,
        encoding: Optional[str]=None,
        foreign_keys: Optional[List[str]]=None,
        header: bool=False,
        num_records: int=0,
        sep: Optional[str]=None) -> DatasetTableConfig:
    """Create a dataset table configuration.

    Args:
        file_name: name of the dataset table file.
        primary_key: a list of strings that combined form the primary key of the table.
        columns: a list of strings with other available columns in the table.
        compression: the (optional) compression of the file, 'bz2' is recommended.
        encoding: the encoding for reading/writing the table contents or None for 'utf-8'.
        foreign_keys: (optional) list of column names that are foreign keys in other tables.
        header: whether the table file contains a header on the first line.
        num_records: the number of records in the table.
        sep: the delimiter that is used in the table or None for a tab separator.

    Returns:
        the resulting data table configuration.
    """
    # Annotations fixed to Optional[...] to match the None defaults; the
    # runtime interface (names, order, defaults) is unchanged.
    file_options = FileOptionsConfig(
        sep,
        compression,
        encoding,
        header
    )
    return DatasetTableConfig(
        primary_key,
        foreign_keys,
        columns,
        num_records,
        DatasetFileConfig(file_name, file_options)
    )

Create a dataset table configuration.

Args: file_name: name of the dataset table file. primary_key: a list of strings that combined form the primary key of the table. columns: a list of strings with other available columns in the table. compression: the (optional) compression of the file, 'bz2' is recommended. encoding: the encoding for reading/writing the table contents or None for 'utf-8'. foreign_keys: (optional) list of column names that are foreign keys in other tables. header: whether the table file contains a header on the first line. num_records: the number of records in the table. sep: the delimiter that is used in the table or None for a tab separator.

Returns: the resulting data table configuration.