src.fairreckitlib.data.set.processor.dataset_processor_ml25m
This module contains the class to process the MovieLens-25M dataset.
Classes:
DatasetProcessorML25M: data processor implementation for the ML-25M dataset.
This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)
"""This module contains the class to process the MovieLens-25M dataset.

Classes:

    DatasetProcessorML25M: data processor implementation for the ML-25M dataset.

This program has been developed by students from the bachelor Computer Science at
Utrecht University within the Software Project course.
© Copyright Utrecht University (Department of Information and Computing Sciences)
"""

from typing import Callable, List, Optional, Tuple

import pandas as pd

from ..dataset_config import DatasetTableConfig
from ..dataset_config import create_dataset_table_config
from ..dataset_constants import TABLE_FILE_PREFIX
from .dataset_processor_ml import DatasetProcessorML


class DatasetProcessorML25M(DatasetProcessorML):
    """Data processor implementation for the MovieLens-25M dataset.

    The dataset can be downloaded from the link below.
    https://files.grouplens.org/datasets/movielens/ml-25m.zip

    The processor handles the following files:

    genome-scores.csv (optional)
    genome-tags.csv (optional)
    links.csv (optional)
    movies.csv (optional)
    ratings.csv (required)
    tags.csv (optional)
    """

    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
        """Build the table configuration that describes the ratings matrix file.

        The configuration is created for 'ratings.csv', a comma separated file with a
        header. The user and movie id columns are passed first, followed by the rating
        and timestamp columns; the id columns are also passed as foreign keys.

        Returns:
            the table configuration of the ML-25M user-movie matrix.
        """
        matrix_keys = ['user_id', 'movie_id']
        matrix_columns = ['matrix_rating', 'matrix_timestamp']
        matrix_config = create_dataset_table_config(
            'ratings.csv',
            matrix_keys,
            matrix_columns,
            # pass a separate list object so the two arguments are never aliased
            foreign_keys=list(matrix_keys),
            header=True,
            sep=','
        )
        return matrix_config

    def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
        """Collect the processors that produce event table configurations.

        Returns:
            a list with a single (name, processor) pair for the tag event table.
        """
        tag_event = ('tag', self.process_tag_table)
        return [tag_event]

    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
        """Collect the processors that produce (non-event) table configurations.

        Returns:
            a list of (name, processor) pairs for the genome score, genome tag and movie tables.
        """
        table_names = ('genome score', 'genome tag', 'movie')
        table_processors = (
            self.process_genome_score_table,
            self.process_genome_tag_table,
            self.process_movie_table,
        )
        return list(zip(table_names, table_processors))

    def process_genome_score_table(self) -> Optional[DatasetTableConfig]:
        """Process the genome score table stored in 'genome-scores.csv'.

        The table is read from the dataset directory to count its records.

        Returns:
            the genome score table configuration with the number of records filled in,
            or None when reading the table raises a FileNotFoundError.
        """
        score_config = create_dataset_table_config(
            'genome-scores.csv',
            ['movie_id', 'tag_id'],
            ['movie-tag_relevance'],
            foreign_keys=['movie_id', 'tag_id'],
            header=True,
            sep=','
        )

        try:
            score_table = score_config.read_table(self.dataset_dir)
            score_config.num_records = len(score_table)
        except FileNotFoundError:
            # the file is optional, report the absence to the caller
            return None

        return score_config

    def process_genome_tag_table(self) -> Optional[DatasetTableConfig]:
        """Process the genome tag table stored in 'genome-tags.csv'.

        The table is read from the dataset directory to count its records.

        Returns:
            the genome tag table configuration with the number of records filled in,
            or None when reading the table raises a FileNotFoundError.
        """
        tag_config = create_dataset_table_config(
            'genome-tags.csv',
            ['tag_id'],
            ['tag_name'],
            header=True,
            sep=','
        )

        try:
            tag_table = tag_config.read_table(self.dataset_dir)
            tag_config.num_records = len(tag_table)
        except FileNotFoundError:
            # the file is optional, report the absence to the caller
            return None

        return tag_config

    def process_movie_table(self) -> Optional[DatasetTableConfig]:
        """Process the movie table, extended with the link table when available.

        When both 'movies.csv' and 'links.csv' can be read, the link table is left-joined
        onto the movie table on 'movie_id'. The joined table is saved through the movie
        table configuration, which is updated first to describe the new file.

        Returns:
            the movie table configuration when 'movies.csv' can be read, the link table
            configuration when only 'links.csv' can be read, or None when neither can.
        """
        links_config = create_dataset_table_config(
            'links.csv',
            ['movie_id'],
            ['movie_imdbID', 'movie_tmdbID'],
            header=True,
            sep=','
        )

        try:
            links = links_config.read_table(self.dataset_dir)
            links_config.num_records = len(links)
            # missing tmdb ids become -1 so that the column can be cast to int
            tmdb_ids = links['movie_tmdbID'].fillna(-1.0)
            links['movie_tmdbID'] = tmdb_ids.astype(int)
        except FileNotFoundError:
            links = None

        movies_config = create_dataset_table_config(
            'movies.csv',
            ['movie_id'],
            ['movie_title', 'movie_genres'],
            header=True,
            sep=','
        )

        try:
            movies = movies_config.read_table(self.dataset_dir)
            movies_config.num_records = len(movies)
        except FileNotFoundError:
            # without movies only the link table (if any) is left to describe
            if links is None:
                return None
            return links_config

        if links is None:
            # nothing to join, the original movie file is used as is
            return movies_config

        # extend every movie row with its link columns
        movies = pd.merge(movies, links, how='left', on='movie_id')

        # let the movie configuration describe the file that is stored below
        stored_file = movies_config.file
        stored_file.name = TABLE_FILE_PREFIX + self.dataset_name + '_movies.tsv.bz2'
        stored_options = stored_file.options
        stored_options.sep = None
        stored_options.compression = 'bz2'
        stored_options.header = False
        movies_config.columns += links_config.columns

        # store the extended movie table
        movies_config.save_table(movies, self.dataset_dir)

        return movies_config

    def process_tag_table(self) -> Optional[DatasetTableConfig]:
        """Process the tag (event) table stored in 'tags.csv'.

        The table is read from the dataset directory to count its records.

        Returns:
            the tag table configuration with the number of records filled in,
            or None when reading the table raises a FileNotFoundError.
        """
        event_config = create_dataset_table_config(
            'tags.csv',
            ['user_id', 'movie_id'],
            ['tag_name', 'tag_timestamp'],
            foreign_keys=['user_id', 'movie_id'],
            header=True,
            sep=','
        )

        try:
            event_table = event_config.read_table(self.dataset_dir)
            event_config.num_records = len(event_table)
        except FileNotFoundError:
            # the file is optional, report the absence to the caller
            return None

        return event_config
class DatasetProcessorML25M(DatasetProcessorML):
    """Data processor implementation for the MovieLens-25M dataset.

    The dataset can be downloaded from the link below.
    https://files.grouplens.org/datasets/movielens/ml-25m.zip

    The processor handles the following files:

    genome-scores.csv (optional)
    genome-tags.csv (optional)
    links.csv (optional)
    movies.csv (optional)
    ratings.csv (required)
    tags.csv (optional)
    """

    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
        """Build the table configuration that describes the ratings matrix file.

        The configuration is created for 'ratings.csv', a comma separated file with a
        header. The user and movie id columns are passed first, followed by the rating
        and timestamp columns; the id columns are also passed as foreign keys.

        Returns:
            the table configuration of the ML-25M user-movie matrix.
        """
        matrix_keys = ['user_id', 'movie_id']
        matrix_columns = ['matrix_rating', 'matrix_timestamp']
        matrix_config = create_dataset_table_config(
            'ratings.csv',
            matrix_keys,
            matrix_columns,
            # pass a separate list object so the two arguments are never aliased
            foreign_keys=list(matrix_keys),
            header=True,
            sep=','
        )
        return matrix_config

    def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
        """Collect the processors that produce event table configurations.

        Returns:
            a list with a single (name, processor) pair for the tag event table.
        """
        tag_event = ('tag', self.process_tag_table)
        return [tag_event]

    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
        """Collect the processors that produce (non-event) table configurations.

        Returns:
            a list of (name, processor) pairs for the genome score, genome tag and movie tables.
        """
        table_names = ('genome score', 'genome tag', 'movie')
        table_processors = (
            self.process_genome_score_table,
            self.process_genome_tag_table,
            self.process_movie_table,
        )
        return list(zip(table_names, table_processors))

    def process_genome_score_table(self) -> Optional[DatasetTableConfig]:
        """Process the genome score table stored in 'genome-scores.csv'.

        The table is read from the dataset directory to count its records.

        Returns:
            the genome score table configuration with the number of records filled in,
            or None when reading the table raises a FileNotFoundError.
        """
        score_config = create_dataset_table_config(
            'genome-scores.csv',
            ['movie_id', 'tag_id'],
            ['movie-tag_relevance'],
            foreign_keys=['movie_id', 'tag_id'],
            header=True,
            sep=','
        )

        try:
            score_table = score_config.read_table(self.dataset_dir)
            score_config.num_records = len(score_table)
        except FileNotFoundError:
            # the file is optional, report the absence to the caller
            return None

        return score_config

    def process_genome_tag_table(self) -> Optional[DatasetTableConfig]:
        """Process the genome tag table stored in 'genome-tags.csv'.

        The table is read from the dataset directory to count its records.

        Returns:
            the genome tag table configuration with the number of records filled in,
            or None when reading the table raises a FileNotFoundError.
        """
        tag_config = create_dataset_table_config(
            'genome-tags.csv',
            ['tag_id'],
            ['tag_name'],
            header=True,
            sep=','
        )

        try:
            tag_table = tag_config.read_table(self.dataset_dir)
            tag_config.num_records = len(tag_table)
        except FileNotFoundError:
            # the file is optional, report the absence to the caller
            return None

        return tag_config

    def process_movie_table(self) -> Optional[DatasetTableConfig]:
        """Process the movie table, extended with the link table when available.

        When both 'movies.csv' and 'links.csv' can be read, the link table is left-joined
        onto the movie table on 'movie_id'. The joined table is saved through the movie
        table configuration, which is updated first to describe the new file.

        Returns:
            the movie table configuration when 'movies.csv' can be read, the link table
            configuration when only 'links.csv' can be read, or None when neither can.
        """
        links_config = create_dataset_table_config(
            'links.csv',
            ['movie_id'],
            ['movie_imdbID', 'movie_tmdbID'],
            header=True,
            sep=','
        )

        try:
            links = links_config.read_table(self.dataset_dir)
            links_config.num_records = len(links)
            # missing tmdb ids become -1 so that the column can be cast to int
            tmdb_ids = links['movie_tmdbID'].fillna(-1.0)
            links['movie_tmdbID'] = tmdb_ids.astype(int)
        except FileNotFoundError:
            links = None

        movies_config = create_dataset_table_config(
            'movies.csv',
            ['movie_id'],
            ['movie_title', 'movie_genres'],
            header=True,
            sep=','
        )

        try:
            movies = movies_config.read_table(self.dataset_dir)
            movies_config.num_records = len(movies)
        except FileNotFoundError:
            # without movies only the link table (if any) is left to describe
            if links is None:
                return None
            return links_config

        if links is None:
            # nothing to join, the original movie file is used as is
            return movies_config

        # extend every movie row with its link columns
        movies = pd.merge(movies, links, how='left', on='movie_id')

        # let the movie configuration describe the file that is stored below
        stored_file = movies_config.file
        stored_file.name = TABLE_FILE_PREFIX + self.dataset_name + '_movies.tsv.bz2'
        stored_options = stored_file.options
        stored_options.sep = None
        stored_options.compression = 'bz2'
        stored_options.header = False
        movies_config.columns += links_config.columns

        # store the extended movie table
        movies_config.save_table(movies, self.dataset_dir)

        return movies_config

    def process_tag_table(self) -> Optional[DatasetTableConfig]:
        """Process the tag (event) table stored in 'tags.csv'.

        The table is read from the dataset directory to count its records.

        Returns:
            the tag table configuration with the number of records filled in,
            or None when reading the table raises a FileNotFoundError.
        """
        event_config = create_dataset_table_config(
            'tags.csv',
            ['user_id', 'movie_id'],
            ['tag_name', 'tag_timestamp'],
            foreign_keys=['user_id', 'movie_id'],
            header=True,
            sep=','
        )

        try:
            event_table = event_config.read_table(self.dataset_dir)
            event_config.num_records = len(event_table)
        except FileNotFoundError:
            # the file is optional, report the absence to the caller
            return None

        return event_config
DatasetProcessor for the MovieLens-25M dataset.
The dataset can be downloaded from the link below. https://files.grouplens.org/datasets/movielens/ml-25m.zip
The processor handles the following files:
genome-scores.csv (optional) genome-tags.csv (optional) links.csv (optional) movies.csv (optional) ratings.csv (required) tags.csv (optional)
def create_user_movie_matrix_config(self) -> DatasetTableConfig:
    """Build the table configuration that describes the ratings matrix file.

    The configuration is created for 'ratings.csv', a comma separated file with a
    header. The user and movie id columns are passed first, followed by the rating
    and timestamp columns; the id columns are also passed as foreign keys.

    Returns:
        the table configuration of the ML-25M user-movie matrix.
    """
    matrix_keys = ['user_id', 'movie_id']
    matrix_columns = ['matrix_rating', 'matrix_timestamp']
    matrix_config = create_dataset_table_config(
        'ratings.csv',
        matrix_keys,
        matrix_columns,
        # pass a separate list object so the two arguments are never aliased
        foreign_keys=list(matrix_keys),
        header=True,
        sep=','
    )
    return matrix_config
Create the user-movie matrix configuration.
Returns: the table configuration of the ML-25M matrix.
def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
    """Collect the processors that produce event table configurations.

    Returns:
        a list with a single (name, processor) pair for the tag event table.
    """
    tag_event = ('tag', self.process_tag_table)
    return [tag_event]
Get event table configuration processors.
Returns: a list containing the tag event table processor.
def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
    """Collect the processors that produce (non-event) table configurations.

    Returns:
        a list of (name, processor) pairs for the genome score, genome tag and movie tables.
    """
    table_names = ('genome score', 'genome tag', 'movie')
    table_processors = (
        self.process_genome_score_table,
        self.process_genome_tag_table,
        self.process_movie_table,
    )
    return list(zip(table_names, table_processors))
Get table configuration processors.
Returns: a list containing the genome score, genome tag and movie table processors.
def process_genome_score_table(self) -> Optional[DatasetTableConfig]:
    """Process the genome score table stored in 'genome-scores.csv'.

    The table is read from the dataset directory to count its records.

    Returns:
        the genome score table configuration with the number of records filled in,
        or None when reading the table raises a FileNotFoundError.
    """
    score_config = create_dataset_table_config(
        'genome-scores.csv',
        ['movie_id', 'tag_id'],
        ['movie-tag_relevance'],
        foreign_keys=['movie_id', 'tag_id'],
        header=True,
        sep=','
    )

    try:
        score_table = score_config.read_table(self.dataset_dir)
        score_config.num_records = len(score_table)
    except FileNotFoundError:
        # the file is optional, report the absence to the caller
        return None

    return score_config
Process the genome score table.
Returns: the genome score table configuration or None on failure.
def process_genome_tag_table(self) -> Optional[DatasetTableConfig]:
    """Process the genome tag table stored in 'genome-tags.csv'.

    The table is read from the dataset directory to count its records.

    Returns:
        the genome tag table configuration with the number of records filled in,
        or None when reading the table raises a FileNotFoundError.
    """
    tag_config = create_dataset_table_config(
        'genome-tags.csv',
        ['tag_id'],
        ['tag_name'],
        header=True,
        sep=','
    )

    try:
        tag_table = tag_config.read_table(self.dataset_dir)
        tag_config.num_records = len(tag_table)
    except FileNotFoundError:
        # the file is optional, report the absence to the caller
        return None

    return tag_config
Process the genome tag table.
Returns: the genome tag table configuration or None on failure.
def process_movie_table(self) -> Optional[DatasetTableConfig]:
    """Process the movie table, extended with the link table when available.

    When both 'movies.csv' and 'links.csv' can be read, the link table is left-joined
    onto the movie table on 'movie_id'. The joined table is saved through the movie
    table configuration, which is updated first to describe the new file.

    Returns:
        the movie table configuration when 'movies.csv' can be read, the link table
        configuration when only 'links.csv' can be read, or None when neither can.
    """
    links_config = create_dataset_table_config(
        'links.csv',
        ['movie_id'],
        ['movie_imdbID', 'movie_tmdbID'],
        header=True,
        sep=','
    )

    try:
        links = links_config.read_table(self.dataset_dir)
        links_config.num_records = len(links)
        # missing tmdb ids become -1 so that the column can be cast to int
        tmdb_ids = links['movie_tmdbID'].fillna(-1.0)
        links['movie_tmdbID'] = tmdb_ids.astype(int)
    except FileNotFoundError:
        links = None

    movies_config = create_dataset_table_config(
        'movies.csv',
        ['movie_id'],
        ['movie_title', 'movie_genres'],
        header=True,
        sep=','
    )

    try:
        movies = movies_config.read_table(self.dataset_dir)
        movies_config.num_records = len(movies)
    except FileNotFoundError:
        # without movies only the link table (if any) is left to describe
        if links is None:
            return None
        return links_config

    if links is None:
        # nothing to join, the original movie file is used as is
        return movies_config

    # extend every movie row with its link columns
    movies = pd.merge(movies, links, how='left', on='movie_id')

    # let the movie configuration describe the file that is stored below
    stored_file = movies_config.file
    stored_file.name = TABLE_FILE_PREFIX + self.dataset_name + '_movies.tsv.bz2'
    stored_options = stored_file.options
    stored_options.sep = None
    stored_options.compression = 'bz2'
    stored_options.header = False
    movies_config.columns += links_config.columns

    # store the extended movie table
    movies_config.save_table(movies, self.dataset_dir)

    return movies_config
Process the movie table.
The movie and link tables are joined together for simplification.
Returns: the movie table configuration or None on failure.
def process_tag_table(self) -> Optional[DatasetTableConfig]:
    """Process the tag (event) table stored in 'tags.csv'.

    The table is read from the dataset directory to count its records.

    Returns:
        the tag table configuration with the number of records filled in,
        or None when reading the table raises a FileNotFoundError.
    """
    event_config = create_dataset_table_config(
        'tags.csv',
        ['user_id', 'movie_id'],
        ['tag_name', 'tag_timestamp'],
        foreign_keys=['user_id', 'movie_id'],
        header=True,
        sep=','
    )

    try:
        event_table = event_config.read_table(self.dataset_dir)
        event_config.num_records = len(event_table)
    except FileNotFoundError:
        # the file is optional, report the absence to the caller
        return None

    return event_config
Process the tag (event) table.
Returns: the tag table configuration or None on failure.