src.fairreckitlib.data.set.processor.dataset_processor_ml100k
This module contains the class to process the MovieLens-100K dataset.
Classes:
DatasetProcessorML100K: data processor implementation for the ML-100K dataset.
This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)
"""This module contains the class to process the MovieLens-100K dataset.

Classes:

    DatasetProcessorML100K: data processor implementation for the ML-100K dataset.

This program has been developed by students from the bachelor Computer Science at
Utrecht University within the Software Project course.
© Copyright Utrecht University (Department of Information and Computing Sciences)
"""

from typing import Callable, List, Optional, Tuple

import numpy as np

from ..dataset_config import DatasetTableConfig
from ..dataset_config import create_dataset_table_config
from ..dataset_constants import TABLE_FILE_PREFIX
from .dataset_processor_ml import DatasetProcessorML

# names of the binary genre columns of 'u.item', in the order they are passed as columns
MOVIE_GENRES = [
    'Unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary',
    'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western'
]


class DatasetProcessorML100K(DatasetProcessorML):
    """DatasetProcessor for the MovieLens-100K dataset.

    The dataset can be downloaded from the link below.
    https://files.grouplens.org/datasets/movielens/ml-100k.zip

    The processor handles the following files:

    u.data (required)
    u.user (optional)
    u.item (optional)
    """

    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
        """Create the user-movie matrix configuration.

        Returns:
            the table configuration of the ML-100K matrix.
        """
        return create_dataset_table_config(
            'u.data',
            ['user_id', 'movie_id'],
            ['matrix_rating', 'matrix_timestamp'],
            foreign_keys=['user_id', 'movie_id']
        )

    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
        """Get table configuration processors.

        Returns:
            a list containing the user and movie table processors.
        """
        return [
            ('movie', self.process_movie_table),
            ('user', self.process_user_table)
        ]

    def process_movie_table(self) -> Optional[DatasetTableConfig]:
        """Process the movie table.

        Reads 'u.item', discards the column that is marked as empty and collapses
        the binary genre columns into a single 'movie_genres' column in which the
        names of a movie's genres are concatenated using pipes. The resulting table
        is stored in the dataset directory through the updated table configuration.

        Returns:
            the movie table configuration or None when 'u.item' is not found.
        """
        movie_columns = [
            'movie_title',
            'movie_release date',
            'empty',  # this column does not contain any data
            'movie_imdb url'
        ]

        # create original table definition
        movie_table_config = create_dataset_table_config(
            'u.item',
            ['movie_id'],
            movie_columns + MOVIE_GENRES,
            sep='|',
            encoding='ISO-8859-1'
        )

        try:
            # read the original table without binary genres
            movie_table = movie_table_config.read_table(
                self.dataset_dir,
                columns=movie_table_config.primary_key + movie_columns
            )
            # read the binary genres table
            genres_table = movie_table_config.read_table(
                self.dataset_dir,
                columns=MOVIE_GENRES
            )
        except FileNotFoundError:
            return None

        # leave out the empty column; the explicit copy makes sure that adding
        # the genre column below writes to an independent table
        movie_columns.remove('empty')
        movie_table = movie_table[movie_table_config.primary_key + movie_columns].copy()

        # replace 0 with NaN and 1 with the corresponding genre; the result is
        # assigned back because an inplace replace on a selected column does not
        # reach the table when pandas Copy-on-Write is active
        for column_name in genres_table:
            genres_table[column_name] = genres_table[column_name].replace(
                {1: column_name, 0: np.nan}
            )

        # collapse genres into one column and add it to the original table
        genre_column = 'movie_genres'
        movie_table[genre_column] = genres_table.apply(lambda x: x.str.cat(sep='|'), axis=1)

        # update movie table definition
        movie_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_movies.tsv.bz2'
        movie_table_config.file.options.compression = 'bz2'
        # NOTE(review): None presumably selects the default separator - confirm
        movie_table_config.file.options.sep = None
        movie_table_config.columns = movie_columns + [genre_column]
        movie_table_config.num_records = len(movie_table)

        # store the generated movie table
        movie_table_config.save_table(movie_table, self.dataset_dir)

        return movie_table_config

    def process_user_table(self) -> Optional[DatasetTableConfig]:
        """Process the user table.

        Changes the contents of the gender and occupation columns to be more
        user-friendly: 'M' and 'F' are written out as 'Male' and 'Female' and the
        occupations are capitalized. The resulting table is stored in the dataset
        directory through the updated table configuration.

        Returns:
            the user table configuration or None when 'u.user' is not found.
        """
        user_table_config = create_dataset_table_config(
            'u.user',
            ['user_id'],
            ['user_age', 'user_gender', 'user_occupation', 'user_zip code'],
            sep='|'
        )

        try:
            user_table = user_table_config.read_table(self.dataset_dir)
        except FileNotFoundError:
            return None

        user_table_config.num_records = len(user_table)

        # convert gender and occupation to more user-friendly names; the result is
        # assigned back because an inplace replace on a selected column does not
        # reach the table when pandas Copy-on-Write is active
        user_table['user_gender'] = user_table['user_gender'].replace(
            {'M': 'Male', 'F': 'Female'}
        )
        user_table['user_occupation'] = user_table['user_occupation'].str.capitalize()

        # update user table configuration
        user_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_users.tsv.bz2'
        user_table_config.file.options.compression = 'bz2'
        # NOTE(review): None presumably selects the default separator - confirm
        user_table_config.file.options.sep = None

        # store the generated user table
        user_table_config.save_table(user_table, self.dataset_dir)

        return user_table_config
class DatasetProcessorML100K(DatasetProcessorML):
    """DatasetProcessor for the MovieLens-100K dataset.

    The dataset can be downloaded from the link below.
    https://files.grouplens.org/datasets/movielens/ml-100k.zip

    The processor handles the following files:

    u.data (required)
    u.user (optional)
    u.item (optional)
    """

    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
        """Create the user-movie matrix configuration.

        Returns:
            the table configuration of the ML-100K matrix.
        """
        return create_dataset_table_config(
            'u.data',
            ['user_id', 'movie_id'],
            ['matrix_rating', 'matrix_timestamp'],
            foreign_keys=['user_id', 'movie_id']
        )

    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
        """Get table configuration processors.

        Returns:
            a list containing the user and movie table processors.
        """
        return [
            ('movie', self.process_movie_table),
            ('user', self.process_user_table)
        ]

    def process_movie_table(self) -> Optional[DatasetTableConfig]:
        """Process the movie table.

        Reads 'u.item', discards the column that is marked as empty and collapses
        the binary genre columns into a single 'movie_genres' column in which the
        names of a movie's genres are concatenated using pipes. The resulting table
        is stored in the dataset directory through the updated table configuration.

        Returns:
            the movie table configuration or None when 'u.item' is not found.
        """
        movie_columns = [
            'movie_title',
            'movie_release date',
            'empty',  # this column does not contain any data
            'movie_imdb url'
        ]

        # create original table definition
        movie_table_config = create_dataset_table_config(
            'u.item',
            ['movie_id'],
            movie_columns + MOVIE_GENRES,
            sep='|',
            encoding='ISO-8859-1'
        )

        try:
            # read the original table without binary genres
            movie_table = movie_table_config.read_table(
                self.dataset_dir,
                columns=movie_table_config.primary_key + movie_columns
            )
            # read the binary genres table
            genres_table = movie_table_config.read_table(
                self.dataset_dir,
                columns=MOVIE_GENRES
            )
        except FileNotFoundError:
            return None

        # leave out the empty column; the explicit copy makes sure that adding
        # the genre column below writes to an independent table
        movie_columns.remove('empty')
        movie_table = movie_table[movie_table_config.primary_key + movie_columns].copy()

        # replace 0 with NaN and 1 with the corresponding genre; the result is
        # assigned back because an inplace replace on a selected column does not
        # reach the table when pandas Copy-on-Write is active
        for column_name in genres_table:
            genres_table[column_name] = genres_table[column_name].replace(
                {1: column_name, 0: np.nan}
            )

        # collapse genres into one column and add it to the original table
        genre_column = 'movie_genres'
        movie_table[genre_column] = genres_table.apply(lambda x: x.str.cat(sep='|'), axis=1)

        # update movie table definition
        movie_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_movies.tsv.bz2'
        movie_table_config.file.options.compression = 'bz2'
        # NOTE(review): None presumably selects the default separator - confirm
        movie_table_config.file.options.sep = None
        movie_table_config.columns = movie_columns + [genre_column]
        movie_table_config.num_records = len(movie_table)

        # store the generated movie table
        movie_table_config.save_table(movie_table, self.dataset_dir)

        return movie_table_config

    def process_user_table(self) -> Optional[DatasetTableConfig]:
        """Process the user table.

        Changes the contents of the gender and occupation columns to be more
        user-friendly: 'M' and 'F' are written out as 'Male' and 'Female' and the
        occupations are capitalized. The resulting table is stored in the dataset
        directory through the updated table configuration.

        Returns:
            the user table configuration or None when 'u.user' is not found.
        """
        user_table_config = create_dataset_table_config(
            'u.user',
            ['user_id'],
            ['user_age', 'user_gender', 'user_occupation', 'user_zip code'],
            sep='|'
        )

        try:
            user_table = user_table_config.read_table(self.dataset_dir)
        except FileNotFoundError:
            return None

        user_table_config.num_records = len(user_table)

        # convert gender and occupation to more user-friendly names; the result is
        # assigned back because an inplace replace on a selected column does not
        # reach the table when pandas Copy-on-Write is active
        user_table['user_gender'] = user_table['user_gender'].replace(
            {'M': 'Male', 'F': 'Female'}
        )
        user_table['user_occupation'] = user_table['user_occupation'].str.capitalize()

        # update user table configuration
        user_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_users.tsv.bz2'
        user_table_config.file.options.compression = 'bz2'
        # NOTE(review): None presumably selects the default separator - confirm
        user_table_config.file.options.sep = None

        # store the generated user table
        user_table_config.save_table(user_table, self.dataset_dir)

        return user_table_config
DatasetProcessor for the MovieLens-100K dataset.
The dataset can be downloaded from the link below. https://files.grouplens.org/datasets/movielens/ml-100k.zip
The processor handles the following files:
u.data (required) u.user (optional) u.item (optional)
def create_user_movie_matrix_config(self) -> DatasetTableConfig:
    """Build the table configuration describing the ML-100K rating matrix.

    The matrix is located in the 'u.data' file, where the user and movie ids
    are passed both as the leading key list and as the foreign keys, and the
    rating and timestamp are passed as the remaining columns.

    Returns:
        the table configuration of the ML-100K matrix.
    """
    matrix_keys = ['user_id', 'movie_id']
    matrix_columns = ['matrix_rating', 'matrix_timestamp']

    # hand over a separate list for the foreign keys, so that the key list
    # and the foreign keys never share the same list object
    matrix_config = create_dataset_table_config(
        'u.data',
        matrix_keys,
        matrix_columns,
        foreign_keys=list(matrix_keys)
    )
    return matrix_config
Create the user-movie matrix configuration.
Returns: the table configuration of the ML-100K matrix.
def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
    """Retrieve the named processors that produce the table configurations.

    Returns:
        a list of (table name, processor) pairs, the movie table processor
            first and the user table processor second.
    """
    movie_entry = ('movie', self.process_movie_table)
    user_entry = ('user', self.process_user_table)
    return [movie_entry, user_entry]
Get table configuration processors.
Returns: a list containing the user and movie table processors.
def process_movie_table(self) -> Optional[DatasetTableConfig]:
    """Process the movie table.

    Reads 'u.item', discards the column that is marked as empty and collapses
    the binary genre columns into a single 'movie_genres' column in which the
    names of a movie's genres are concatenated using pipes. The resulting table
    is stored in the dataset directory through the updated table configuration.

    Returns:
        the movie table configuration or None when 'u.item' is not found.
    """
    movie_columns = [
        'movie_title',
        'movie_release date',
        'empty',  # this column does not contain any data
        'movie_imdb url'
    ]

    # create original table definition
    movie_table_config = create_dataset_table_config(
        'u.item',
        ['movie_id'],
        movie_columns + MOVIE_GENRES,
        sep='|',
        encoding='ISO-8859-1'
    )

    try:
        # read the original table without binary genres
        movie_table = movie_table_config.read_table(
            self.dataset_dir,
            columns=movie_table_config.primary_key + movie_columns
        )
        # read the binary genres table
        genres_table = movie_table_config.read_table(
            self.dataset_dir,
            columns=MOVIE_GENRES
        )
    except FileNotFoundError:
        return None

    # leave out the empty column; the explicit copy makes sure that adding
    # the genre column below writes to an independent table
    movie_columns.remove('empty')
    movie_table = movie_table[movie_table_config.primary_key + movie_columns].copy()

    # replace 0 with NaN and 1 with the corresponding genre; the result is
    # assigned back because an inplace replace on a selected column does not
    # reach the table when pandas Copy-on-Write is active
    for column_name in genres_table:
        genres_table[column_name] = genres_table[column_name].replace(
            {1: column_name, 0: np.nan}
        )

    # collapse genres into one column and add it to the original table
    genre_column = 'movie_genres'
    movie_table[genre_column] = genres_table.apply(lambda x: x.str.cat(sep='|'), axis=1)

    # update movie table definition
    movie_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_movies.tsv.bz2'
    movie_table_config.file.options.compression = 'bz2'
    # NOTE(review): None presumably selects the default separator - confirm
    movie_table_config.file.options.sep = None
    movie_table_config.columns = movie_columns + [genre_column]
    movie_table_config.num_records = len(movie_table)

    # store the generated movie table
    movie_table_config.save_table(movie_table, self.dataset_dir)

    return movie_table_config
Process the movie table.
Removes the empty column that follows the movie release date, as it does not contain any data. Simplifies the binary genre columns by concatenating the names of a movie's genres using pipes.
Returns: the movie table configuration or None on failure.
def process_user_table(self) -> Optional[DatasetTableConfig]:
    """Process the user table.

    Changes the contents of the gender and occupation columns to be more
    user-friendly: 'M' and 'F' are written out as 'Male' and 'Female' and the
    occupations are capitalized. The resulting table is stored in the dataset
    directory through the updated table configuration.

    Returns:
        the user table configuration or None when 'u.user' is not found.
    """
    user_table_config = create_dataset_table_config(
        'u.user',
        ['user_id'],
        ['user_age', 'user_gender', 'user_occupation', 'user_zip code'],
        sep='|'
    )

    try:
        user_table = user_table_config.read_table(self.dataset_dir)
    except FileNotFoundError:
        return None

    user_table_config.num_records = len(user_table)

    # convert gender and occupation to more user-friendly names; the result is
    # assigned back because an inplace replace on a selected column does not
    # reach the table when pandas Copy-on-Write is active
    user_table['user_gender'] = user_table['user_gender'].replace(
        {'M': 'Male', 'F': 'Female'}
    )
    user_table['user_occupation'] = user_table['user_occupation'].str.capitalize()

    # update user table configuration
    user_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_users.tsv.bz2'
    user_table_config.file.options.compression = 'bz2'
    # NOTE(review): None presumably selects the default separator - confirm
    user_table_config.file.options.sep = None

    # store the generated user table
    user_table_config.save_table(user_table, self.dataset_dir)

    return user_table_config
Process the user table.
Changes the contents of the gender and occupation columns to be more user-friendly.
Returns: the user table configuration or None on failure.