src.fairreckitlib.data.set.processor.dataset_processor_ml
This module contains the base processor for MovieLens datasets.
Classes:
DatasetProcessorML: the base class for MovieLens dataset processors.
This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)
"""This module contains the base processor for MovieLens datasets.

Classes:

    DatasetProcessorML: the base class for MovieLens dataset processors.

This program has been developed by students from the bachelor Computer Science at
Utrecht University within the Software Project course.
© Copyright Utrecht University (Department of Information and Computing Sciences)
"""

from abc import ABCMeta, abstractmethod
from typing import Callable, List, Optional, Tuple

import numpy as np

from ..dataset_config import DATASET_RATINGS_EXPLICIT, RatingMatrixConfig
from ..dataset_config import DatasetIndexConfig, DatasetMatrixConfig, DatasetTableConfig
from ..dataset_constants import TABLE_FILE_PREFIX
from .dataset_processor_base import DatasetProcessorBase


class DatasetProcessorML(DatasetProcessorBase, metaclass=ABCMeta):
    """Base class of the dataset processors for MovieLens datasets.

    Implements the processing of the user-movie-rating matrix on top of a
    table configuration that the concrete processor supplies.
    No event tables are reported for these datasets.

    Abstract methods:

    create_user_movie_matrix_config
    """

    @abstractmethod
    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
        """Build the table configuration that describes the user-movie matrix.

        Returns:
            the table configuration of the matrix.

        Raises:
            NotImplementedError: when the base implementation is invoked.
        """
        raise NotImplementedError()

    def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
        """Get the processors of the event table configurations.

        Returns:
            an empty list, no event table processors are registered here.
        """
        event_processors = []
        return event_processors

    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
        """Get the processors of the matrix configurations.

        Returns:
            a list with one (name, processor) pair for the user-movie-rating matrix.
        """
        matrix_name = 'user-movie-rating'
        return [(matrix_name, self.process_user_movie_matrix)]

    def process_user_movie_matrix(self) -> Optional[DatasetMatrixConfig]:
        """Process the user-movie-rating matrix.

        Reads the matrix as described by the table configuration, stores it again
        in the dataset directory under the processed file name and options, and
        describes the result in a matrix configuration.

        Returns:
            the matrix configuration or None when the matrix file is not found.
        """
        table_config = self.create_user_movie_matrix_config()

        # the primary key lists the user column first and the item column second
        user_column = table_config.primary_key[0]
        item_column = table_config.primary_key[1]
        rating_column = table_config.columns[0]

        try:
            matrix = table_config.read_table(self.dataset_dir)
            table_config.num_records = len(matrix)
        except FileNotFoundError:
            return None

        ratings = matrix[rating_column]
        if ratings.dtype == np.int64:
            # store 64-bit integer ratings as floats
            matrix[rating_column] = ratings.astype(float)

        # point the configuration at the processed matrix file
        matrix_file = table_config.file
        matrix_file.name = \
            TABLE_FILE_PREFIX + self.dataset_name + '_user-movie-rating_matrix.tsv.bz2'
        file_options = matrix_file.options
        file_options.sep = None
        file_options.compression = 'bz2'
        file_options.header = False

        # write the matrix using the updated configuration
        table_config.save_table(matrix, self.dataset_dir)

        # re-read the rating column, it may have been replaced by the float conversion
        stored_ratings = matrix[rating_column]
        rating_config = RatingMatrixConfig(
            float(stored_ratings.min()),
            float(stored_ratings.max()),
            DATASET_RATINGS_EXPLICIT
        )
        # NOTE(review): the first index argument is None, presumably there is no
        # separate index file for users or items; confirm against DatasetIndexConfig.
        user_index = DatasetIndexConfig(None, user_column, len(matrix[user_column].unique()))
        item_index = DatasetIndexConfig(None, item_column, len(matrix[item_column].unique()))

        return DatasetMatrixConfig(table_config, rating_config, user_index, item_index)
class DatasetProcessorML(DatasetProcessorBase, metaclass=ABCMeta):
    """Base class of the dataset processors for MovieLens datasets.

    Implements the processing of the user-movie-rating matrix on top of a
    table configuration that the concrete processor supplies.
    No event tables are reported for these datasets.

    Abstract methods:

    create_user_movie_matrix_config
    """

    @abstractmethod
    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
        """Build the table configuration that describes the user-movie matrix.

        Returns:
            the table configuration of the matrix.

        Raises:
            NotImplementedError: when the base implementation is invoked.
        """
        raise NotImplementedError()

    def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
        """Get the processors of the event table configurations.

        Returns:
            an empty list, no event table processors are registered here.
        """
        event_processors = []
        return event_processors

    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
        """Get the processors of the matrix configurations.

        Returns:
            a list with one (name, processor) pair for the user-movie-rating matrix.
        """
        matrix_name = 'user-movie-rating'
        return [(matrix_name, self.process_user_movie_matrix)]

    def process_user_movie_matrix(self) -> Optional[DatasetMatrixConfig]:
        """Process the user-movie-rating matrix.

        Reads the matrix as described by the table configuration, stores it again
        in the dataset directory under the processed file name and options, and
        describes the result in a matrix configuration.

        Returns:
            the matrix configuration or None when the matrix file is not found.
        """
        table_config = self.create_user_movie_matrix_config()

        # the primary key lists the user column first and the item column second
        user_column = table_config.primary_key[0]
        item_column = table_config.primary_key[1]
        rating_column = table_config.columns[0]

        try:
            matrix = table_config.read_table(self.dataset_dir)
            table_config.num_records = len(matrix)
        except FileNotFoundError:
            return None

        ratings = matrix[rating_column]
        if ratings.dtype == np.int64:
            # store 64-bit integer ratings as floats
            matrix[rating_column] = ratings.astype(float)

        # point the configuration at the processed matrix file
        matrix_file = table_config.file
        matrix_file.name = \
            TABLE_FILE_PREFIX + self.dataset_name + '_user-movie-rating_matrix.tsv.bz2'
        file_options = matrix_file.options
        file_options.sep = None
        file_options.compression = 'bz2'
        file_options.header = False

        # write the matrix using the updated configuration
        table_config.save_table(matrix, self.dataset_dir)

        # re-read the rating column, it may have been replaced by the float conversion
        stored_ratings = matrix[rating_column]
        rating_config = RatingMatrixConfig(
            float(stored_ratings.min()),
            float(stored_ratings.max()),
            DATASET_RATINGS_EXPLICIT
        )
        # NOTE(review): the first index argument is None, presumably there is no
        # separate index file for users or items; confirm against DatasetIndexConfig.
        user_index = DatasetIndexConfig(None, user_column, len(matrix[user_column].unique()))
        item_index = DatasetIndexConfig(None, item_column, len(matrix[item_column].unique()))

        return DatasetMatrixConfig(table_config, rating_config, user_index, item_index)
DataProcessor base class for MovieLens datasets.
Provides an abstraction for processing the user-movie-rating matrix. Moreover, it is assumed that the datasets do not have any event tables.
Abstract methods:
create_user_movie_matrix_config
@abstractmethod
def create_user_movie_matrix_config(self) -> DatasetTableConfig:
    """Build the table configuration that describes the user-movie matrix.

    Returns:
        the table configuration of the matrix.

    Raises:
        NotImplementedError: when the base implementation is invoked.
    """
    raise NotImplementedError()
Create the user-movie matrix configuration.
Returns: the table configuration of the matrix.
def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
    """Get the processors of the event table configurations.

    Returns:
        an empty list, no event table processors are registered here.
    """
    event_processors = []
    return event_processors
Get event table configuration processors.
Returns: an empty list.
def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
    """Get the processors of the matrix configurations.

    Returns:
        a list with one (name, processor) pair for the user-movie-rating matrix.
    """
    matrix_name = 'user-movie-rating'
    return [(matrix_name, self.process_user_movie_matrix)]
Get matrix configuration processors.
Returns: a list with the user-movie-rating matrix processor.
def process_user_movie_matrix(self) -> Optional[DatasetMatrixConfig]:
    """Process the user-movie-rating matrix.

    Reads the matrix as described by the table configuration, stores it again
    in the dataset directory under the processed file name and options, and
    describes the result in a matrix configuration.

    Returns:
        the matrix configuration or None when the matrix file is not found.
    """
    table_config = self.create_user_movie_matrix_config()

    # the primary key lists the user column first and the item column second
    user_column = table_config.primary_key[0]
    item_column = table_config.primary_key[1]
    rating_column = table_config.columns[0]

    try:
        matrix = table_config.read_table(self.dataset_dir)
        table_config.num_records = len(matrix)
    except FileNotFoundError:
        return None

    ratings = matrix[rating_column]
    if ratings.dtype == np.int64:
        # store 64-bit integer ratings as floats
        matrix[rating_column] = ratings.astype(float)

    # point the configuration at the processed matrix file
    matrix_file = table_config.file
    matrix_file.name = \
        TABLE_FILE_PREFIX + self.dataset_name + '_user-movie-rating_matrix.tsv.bz2'
    file_options = matrix_file.options
    file_options.sep = None
    file_options.compression = 'bz2'
    file_options.header = False

    # write the matrix using the updated configuration
    table_config.save_table(matrix, self.dataset_dir)

    # re-read the rating column, it may have been replaced by the float conversion
    stored_ratings = matrix[rating_column]
    rating_config = RatingMatrixConfig(
        float(stored_ratings.min()),
        float(stored_ratings.max()),
        DATASET_RATINGS_EXPLICIT
    )
    # NOTE(review): the first index argument is None, presumably there is no
    # separate index file for users or items; confirm against DatasetIndexConfig.
    user_index = DatasetIndexConfig(None, user_column, len(matrix[user_column].unique()))
    item_index = DatasetIndexConfig(None, item_column, len(matrix[item_column].unique()))

    return DatasetMatrixConfig(table_config, rating_config, user_index, item_index)
Process the user-movie-rating matrix.
Returns: the matrix configuration or None on failure.