src.fairreckitlib.data.set.processor.dataset_processor_lfm1b
This module contains the class to process the LastFM-1B dataset.
Classes:
DatasetProcessorLFM1B: data processor implementation for the LFM-1B dataset.
This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)
"""This module contains the class to process the LastFM-1B dataset.

Classes:

    DatasetProcessorLFM1B: data processor implementation for the LFM-1B dataset.

This program has been developed by students from the bachelor Computer Science at
Utrecht University within the Software Project course.
© Copyright Utrecht University (Department of Information and Computing Sciences)
"""

import os
from typing import Callable, List, Optional, Tuple

import h5py
import numpy as np
import pandas as pd
from scipy import sparse

from ..dataset_config import DATASET_RATINGS_IMPLICIT, RatingMatrixConfig
from ..dataset_config import \
    DatasetIndexConfig, DatasetMatrixConfig, DatasetTableConfig, create_dataset_table_config
from ..dataset_constants import TABLE_FILE_PREFIX
from .dataset_processor_lfm import DatasetProcessorLFM

# allmusic genre names; the position in this list is used as the genre id
# when mapping the ids of the artist genre file to names.
ALL_MUSIC_GENRES = [
    'rnb', 'rap', 'electronic', 'rock', 'new age', 'classical', 'reggae', 'blues', 'country',
    'world', 'folk', 'easy listening', 'jazz', 'vocal', 'children\'s', 'punk', 'alternative',
    'spoken word', 'pop', 'heavy metal'
]


class DatasetProcessorLFM1B(DatasetProcessorLFM):
    """DatasetProcessor for the LastFM-1B dataset.

    The dataset and UGP (user genre profile) can be downloaded from the website below.
    http://www.cp.jku.at/datasets/LFM-1b/

    The enriched artist gender information can be retrieved from:
    https://zenodo.org/record/3748787#.YowEBqhByUk

    The processor handles the following files:

    genres_allmusic.txt (optional)
    LFM-1b_albums.txt (optional)
    LFM-1b_artist_genres_allmusic.txt (optional)
    LFM-1b_artists.txt (optional)
    LFM-1b_LEs.mat (required)
    LFM-1b_LEs.txt (required)
    LFM-1b_tracks.txt (optional)
    LFM-1b_UGP_noPC_allmusic.txt (optional)
    LFM-1b_UGP_weightedPC_allmusic.txt (optional)
    LFM-1b_users.txt (optional)
    LFM-1b_users_additional.txt (optional)
    lfm-gender.json (optional)
    """

    def create_listening_events_config(self) -> Optional[DatasetTableConfig]:
        """Create the listening event table configuration.

        Returns:
            the configuration of the listening event table.
        """
        return create_dataset_table_config(
            'LFM-1b_LEs.txt',
            ['user_id', 'artist_id', 'album_id', 'track_id'],
            ['timestamp']
        )

    def create_user_table_config(self) -> DatasetTableConfig:
        """Create the user table configuration.

        Returns:
            the configuration of the user table.
        """
        return create_dataset_table_config(
            'LFM-1b_users.txt',
            ['user_id'],
            ['user_country', 'user_age', 'user_gender', 'user_plays', 'user_registered'],
            header=True
        )

    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
        """Get matrix configuration processors.

        Returns:
            a list containing the user-artist-count matrix processor.
        """
        return [('user-artist-count', self.process_user_artist_matrix)]

    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
        """Get table configuration processors.

        Returns:
            the table processors of the base LFM processor, followed by the album,
            allmusic genre, artist, track, user additional and both user allmusic
            genre table processors.
        """
        return DatasetProcessorLFM.get_table_configs(self) + [
            ('album', self.process_album_table),
            ('allmusic genre', self.process_genres_allmusic),
            ('artist', self.process_artist_table),
            ('track', self.process_track_table),
            ('user additional', self.process_user_additional_table),
            ('user allmusic noPC', self.process_user_genre_allmusic_no_pc),
            ('user allmusic weightedPC', self.process_user_genre_allmusic_weighted_pc),
        ]

    def load_artist_gender_json(self) -> Optional[pd.DataFrame]:
        """Load the artist gender json file.

        Reads 'lfm-gender.json' from the dataset directory.

        Returns:
            the loaded artist id/gender table or None when the file is not found.
        """
        try:
            gender_table = pd.read_json(
                os.path.join(self.dataset_dir, 'lfm-gender.json'),
                orient='index'
            )
        except FileNotFoundError:
            return None

        # the json keys end up in the index, move them to a regular column
        gender_table.reset_index(inplace=True)
        gender_table.rename(columns={'index': 'artist_id', 0: 'artist_gender'}, inplace=True)
        return gender_table

    def load_artist_genres_allmusic(self) -> Optional[pd.DataFrame]:
        """Load the artist allmusic genres file.

        Reads 'LFM-1b_artist_genres_allmusic.txt' from the dataset directory and
        joins the genre names of each artist into one '|' separated string.

        Returns:
            the loaded artist name/genre table or None when the file is not found.
        """
        try:
            genres = pd.read_csv(
                os.path.join(self.dataset_dir, 'LFM-1b_artist_genres_allmusic.txt'),
                sep='\t',
                names=['artist_name'] + [str(i) for i in range(0, len(ALL_MUSIC_GENRES))]
            )
        except FileNotFoundError:
            return None

        # remove duplicate rows where artist name is the same
        genres.drop_duplicates(subset='artist_name', inplace=True)
        # extract and drop artist name column
        artist_genres = pd.DataFrame(genres['artist_name'])
        genres.drop('artist_name', inplace=True, axis=1)

        # map allmusic genre id to genre name, missing values are left untouched
        for col in genres:
            genres[col] = genres[col].map(lambda i: ALL_MUSIC_GENRES[int(i)], na_action='ignore')

        # add genres column
        artist_genres['artist_genres'] = genres.apply(lambda x: x.str.cat(sep='|'), axis=1)

        return artist_genres

    def process_album_table(self) -> Optional[DatasetTableConfig]:
        """Process the album table.

        Returns:
            the album table configuration or None when the file is not found.
        """
        album_table_config = create_dataset_table_config(
            'LFM-1b_albums.txt',
            ['album_id'],
            ['album_name'],
            foreign_keys=['artist_id']
        )

        try:
            num_records = len(album_table_config.read_table(self.dataset_dir))
        except FileNotFoundError:
            return None

        album_table_config.num_records = num_records
        return album_table_config

    def process_artist_table(self) -> Optional[DatasetTableConfig]:
        """Process the artist table.

        Extends the table with artist gender and genres information when available.

        Returns:
            the artist table configuration or None on failure.
        """
        artist_table_config = create_dataset_table_config(
            'LFM-1b_artists.txt',
            ['artist_id'],
            ['artist_name']
        )

        try:
            artist_table = artist_table_config.read_table(self.dataset_dir)
        except FileNotFoundError:
            # continue without artist names, the gender table may still be available
            artist_table = pd.DataFrame()
            artist_table_config.columns.pop()

        # add artist gender when available
        gender_table = self.load_artist_gender_json()
        if gender_table is not None:
            # replace artist table when missing
            if len(artist_table) == 0:
                artist_table = gender_table
            # merge artist table with gender
            else:
                artist_table = pd.merge(artist_table, gender_table, how='left', on='artist_id')
            artist_table_config.columns += ['artist_gender']

        # no need to continue if the previous failed
        if len(artist_table) == 0:
            return None

        # genres are matched on artist name, so they can only be added with names present
        if 'artist_name' in artist_table_config.columns:
            # attempt to load artist name / genre table
            artist_genres = self.load_artist_genres_allmusic()
            if artist_genres is not None:
                # merge artist table with genres
                artist_table = pd.merge(artist_table, artist_genres, how='left', on='artist_name')
                artist_table_config.columns += ['artist_genres']

        artist_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_artists.tsv.bz2'
        artist_table_config.file.options.compression = 'bz2'
        artist_table_config.num_records = len(artist_table)

        # store generated artist table
        artist_table_config.save_table(artist_table, self.dataset_dir)

        return artist_table_config

    def process_genres_allmusic(self) -> Optional[DatasetTableConfig]:
        """Process the allmusic genres table.

        Returns:
            the allmusic genres table configuration or None when the file is not found.
        """
        genres_allmusic_table_config = create_dataset_table_config(
            'genres_allmusic.txt',
            [], # row number is the primary key
            ['allmusic_genre']
        )
        try:
            genres_allmusic_table = genres_allmusic_table_config.read_table(self.dataset_dir)
        except FileNotFoundError:
            return None

        # reset index and rename to primary key; pandas labels the column that
        # reset_index creates for an unnamed index 'index'
        genres_allmusic_table.reset_index(inplace=True)
        genres_allmusic_table.rename(columns={'index': 'allmusic_id'}, inplace=True)

        genres_allmusic_table_config.primary_key = ['allmusic_id']
        genres_allmusic_table_config.file.name = \
            TABLE_FILE_PREFIX + self.dataset_name + '_genres_allmusic.tsv.bz2'
        genres_allmusic_table_config.file.options.compression = 'bz2'
        genres_allmusic_table_config.num_records = len(genres_allmusic_table)

        # store generated allmusic genre table
        genres_allmusic_table_config.save_table(genres_allmusic_table, self.dataset_dir)

        return genres_allmusic_table_config

    def process_track_table(self) -> Optional[DatasetTableConfig]:
        """Process the track table.

        Returns:
            the track table configuration or None when the file is not found.
        """
        track_table_config = create_dataset_table_config(
            'LFM-1b_tracks.txt',
            ['track_id'],
            ['track_name'],
            foreign_keys=['artist_id']
        )

        try:
            num_records = len(track_table_config.read_table(self.dataset_dir))
        except FileNotFoundError:
            return None

        track_table_config.num_records = num_records
        return track_table_config

    def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]:
        """Process the user-artist-count matrix.

        The user-item matrix is stored in a matlab file in CSR compatible format,
        together with the user and item indices. The matrix is converted
        to a dataframe and the indices for the indirection arrays are flattened.

        Returns:
            the matrix configuration or None when the matlab file is not found.
        """
        try:
            mat_file = os.path.join(self.dataset_dir, 'LFM-1b_LEs.mat')
            # load matrix as described in the paper
            csr_matrix, idx_users, idx_artists = _load_lfm_1b_mat(mat_file)
        except FileNotFoundError:
            return None

        matrix_name = 'user-artist-count'

        # create and save user indirection array (first element of every entry)
        user_list = [entry[0] for entry in idx_users]
        user_index_config = DatasetIndexConfig(
            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_user_indices.hdf5',
            'user_id',
            len(user_list)
        )
        user_index_config.save_indices(self.dataset_dir, user_list)

        # create and save artist indirection array (first element of every entry)
        artist_list = [entry[0] for entry in idx_artists]
        artist_index_config = DatasetIndexConfig(
            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_item_indices.hdf5',
            'artist_id',
            len(artist_list)
        )
        artist_index_config.save_indices(self.dataset_dir, artist_list)

        # convert csr to dataframe
        coo_matrix = pd.DataFrame.sparse.from_spmatrix(csr_matrix).sparse.to_coo()
        user_artist_matrix = pd.DataFrame()
        user_artist_matrix['user_id'] = coo_matrix.row
        user_artist_matrix['artist_id'] = coo_matrix.col
        user_artist_matrix['matrix_count'] = coo_matrix.data

        # create matrix table configuration
        user_artist_table_config = create_dataset_table_config(
            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_matrix.tsv.bz2',
            ['user_id', 'artist_id'],
            ['matrix_count'],
            compression='bz2',
            foreign_keys=['user_id', 'artist_id'],
            num_records=len(user_artist_matrix)
        )

        # store the resulting matrix
        user_artist_table_config.save_table(user_artist_matrix, self.dataset_dir)

        return DatasetMatrixConfig(
            user_artist_table_config,
            RatingMatrixConfig(
                user_artist_matrix['matrix_count'].min(),
                user_artist_matrix['matrix_count'].max(),
                DATASET_RATINGS_IMPLICIT
            ),
            user_index_config,
            artist_index_config
        )

    def process_user_additional_table(self) -> Optional[DatasetTableConfig]:
        """Process the user additional table.

        Returns:
            the user additional table configuration or None when the file is not found.
        """
        columns = [
            'user_novelty artist avg month',
            'user_novelty artist avg 6months',
            'user_novelty artist avg year',
            'user_mainstreaminess avg month',
            'user_mainstreaminess avg 6months',
            'user_mainstreaminess avg year',
            'user_mainstreaminess global',
            'user_count LEs',
            'user_count distinct tracks',
            'user_count distinct artists',
            'user_count LEs per week'
        ]

        # one column per weekday (1-7) and one per hour of the day (0-23)
        columns += ['user_relative LE per weekday' + str(i) for i in range(1, 8)]
        columns += ['user_relative LE per hour' + str(i) for i in range(0, 24)]

        user_additional_table_config = create_dataset_table_config(
            'LFM-1b_users_additional.txt',
            ['user_id'],
            columns,
            header=True
        )

        try:
            num_records = len(user_additional_table_config.read_table(self.dataset_dir))
        except FileNotFoundError:
            return None

        user_additional_table_config.num_records = num_records
        return user_additional_table_config

    def process_user_genre_allmusic_no_pc(self) -> Optional[DatasetTableConfig]:
        """Process the user allmusic genre table.

        Reads 'LFM-1b_UGP_noPC_allmusic.txt', which has one 'noPC_' prefixed column
        for every allmusic genre.

        Returns:
            the user allmusic genre table configuration or None when the file is not found.
        """
        columns = ['noPC_' + genre_name for genre_name in ALL_MUSIC_GENRES]

        user_genre_allmusic_no_pc_config = create_dataset_table_config(
            'LFM-1b_UGP_noPC_allmusic.txt',
            ['user_id'],
            columns,
            header=True
        )
        try:
            num_records = len(user_genre_allmusic_no_pc_config.read_table(self.dataset_dir))
        except FileNotFoundError:
            return None

        user_genre_allmusic_no_pc_config.num_records = num_records
        return user_genre_allmusic_no_pc_config

    def process_user_genre_allmusic_weighted_pc(self) -> Optional[DatasetTableConfig]:
        """Process the user allmusic genre table with weighted play count.

        Reads 'LFM-1b_UGP_weightedPC_allmusic.txt', which has one 'weightedPC_'
        prefixed column for every allmusic genre.

        Returns:
            the user allmusic genre table configuration or None when the file is not found.
        """
        columns = ['weightedPC_' + genre_name for genre_name in ALL_MUSIC_GENRES]

        # the weighted profile has its own file, it must not reuse the noPC file
        user_genre_allmusic_weighted_pc_config = create_dataset_table_config(
            'LFM-1b_UGP_weightedPC_allmusic.txt',
            ['user_id'],
            columns,
            header=True
        )
        try:
            num_records = len(user_genre_allmusic_weighted_pc_config.read_table(self.dataset_dir))
        except FileNotFoundError:
            return None

        user_genre_allmusic_weighted_pc_config.num_records = num_records
        return user_genre_allmusic_weighted_pc_config


def _load_lfm_1b_mat(file_path: str) -> Tuple[sparse.csr_matrix, np.ndarray, np.ndarray]:
    """Load the LFM-1B dataset from the matlab file.

    Args:
        file_path: the path to the matlab file.

    Returns:
        the matrix and user / artist indirection arrays.
    """
    with h5py.File(file_path, 'r') as mat_file:
        # NOTE(review): scipy returns the transpose of a CSR matrix in CSC format,
        # so the returned matrix is presumably not a csr_matrix instance - confirm.
        csr_matrix = sparse.csr_matrix((
            mat_file['/LEs/']["data"],
            mat_file['/LEs/']["ir"],
            mat_file['/LEs/']["jc"]
        )).transpose()
        idx_users = np.array(mat_file.get('idx_users')).astype(np.int64)
        idx_artists = np.array(mat_file.get('idx_artists')).astype(np.int64)
        return csr_matrix, idx_users, idx_artists
class DatasetProcessorLFM1B(DatasetProcessorLFM):
    """DatasetProcessor for the LastFM-1B dataset.

    The dataset and UGP (user genre profile) can be downloaded from the website below.
    http://www.cp.jku.at/datasets/LFM-1b/

    The enriched artist gender information can be retrieved from:
    https://zenodo.org/record/3748787#.YowEBqhByUk

    The processor handles the following files:

    genres_allmusic.txt (optional)
    LFM-1b_albums.txt (optional)
    LFM-1b_artist_genres_allmusic.txt (optional)
    LFM-1b_artists.txt (optional)
    LFM-1b_LEs.mat (required)
    LFM-1b_LEs.txt (required)
    LFM-1b_tracks.txt (optional)
    LFM-1b_UGP_noPC_allmusic.txt (optional)
    LFM-1b_UGP_weightedPC_allmusic.txt (optional)
    LFM-1b_users.txt (optional)
    LFM-1b_users_additional.txt (optional)
    lfm-gender.json (optional)
    """

    def create_listening_events_config(self) -> Optional[DatasetTableConfig]:
        """Create the listening event table configuration.

        Returns:
            the configuration of the listening event table.
        """
        return create_dataset_table_config(
            'LFM-1b_LEs.txt',
            ['user_id', 'artist_id', 'album_id', 'track_id'],
            ['timestamp']
        )

    def create_user_table_config(self) -> DatasetTableConfig:
        """Create the user table configuration.

        Returns:
            the configuration of the user table.
        """
        return create_dataset_table_config(
            'LFM-1b_users.txt',
            ['user_id'],
            ['user_country', 'user_age', 'user_gender', 'user_plays', 'user_registered'],
            header=True
        )

    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
        """Get matrix configuration processors.

        Returns:
            a list containing the user-artist-count matrix processor.
        """
        return [('user-artist-count', self.process_user_artist_matrix)]

    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
        """Get table configuration processors.

        Returns:
            the table processors of the base LFM processor, followed by the album,
            allmusic genre, artist, track, user additional and both user allmusic
            genre table processors.
        """
        return DatasetProcessorLFM.get_table_configs(self) + [
            ('album', self.process_album_table),
            ('allmusic genre', self.process_genres_allmusic),
            ('artist', self.process_artist_table),
            ('track', self.process_track_table),
            ('user additional', self.process_user_additional_table),
            ('user allmusic noPC', self.process_user_genre_allmusic_no_pc),
            ('user allmusic weightedPC', self.process_user_genre_allmusic_weighted_pc),
        ]

    def load_artist_gender_json(self) -> Optional[pd.DataFrame]:
        """Load the artist gender json file.

        Reads 'lfm-gender.json' from the dataset directory.

        Returns:
            the loaded artist id/gender table or None when the file is not found.
        """
        try:
            gender_table = pd.read_json(
                os.path.join(self.dataset_dir, 'lfm-gender.json'),
                orient='index'
            )
        except FileNotFoundError:
            return None

        # the json keys end up in the index, move them to a regular column
        gender_table.reset_index(inplace=True)
        gender_table.rename(columns={'index': 'artist_id', 0: 'artist_gender'}, inplace=True)
        return gender_table

    def load_artist_genres_allmusic(self) -> Optional[pd.DataFrame]:
        """Load the artist allmusic genres file.

        Reads 'LFM-1b_artist_genres_allmusic.txt' from the dataset directory and
        joins the genre names of each artist into one '|' separated string.

        Returns:
            the loaded artist name/genre table or None when the file is not found.
        """
        try:
            genres = pd.read_csv(
                os.path.join(self.dataset_dir, 'LFM-1b_artist_genres_allmusic.txt'),
                sep='\t',
                names=['artist_name'] + [str(i) for i in range(0, len(ALL_MUSIC_GENRES))]
            )
        except FileNotFoundError:
            return None

        # remove duplicate rows where artist name is the same
        genres.drop_duplicates(subset='artist_name', inplace=True)
        # extract and drop artist name column
        artist_genres = pd.DataFrame(genres['artist_name'])
        genres.drop('artist_name', inplace=True, axis=1)

        # map allmusic genre id to genre name, missing values are left untouched
        for col in genres:
            genres[col] = genres[col].map(lambda i: ALL_MUSIC_GENRES[int(i)], na_action='ignore')

        # add genres column
        artist_genres['artist_genres'] = genres.apply(lambda x: x.str.cat(sep='|'), axis=1)

        return artist_genres

    def process_album_table(self) -> Optional[DatasetTableConfig]:
        """Process the album table.

        Returns:
            the album table configuration or None when the file is not found.
        """
        album_table_config = create_dataset_table_config(
            'LFM-1b_albums.txt',
            ['album_id'],
            ['album_name'],
            foreign_keys=['artist_id']
        )

        try:
            num_records = len(album_table_config.read_table(self.dataset_dir))
        except FileNotFoundError:
            return None

        album_table_config.num_records = num_records
        return album_table_config

    def process_artist_table(self) -> Optional[DatasetTableConfig]:
        """Process the artist table.

        Extends the table with artist gender and genres information when available.

        Returns:
            the artist table configuration or None on failure.
        """
        artist_table_config = create_dataset_table_config(
            'LFM-1b_artists.txt',
            ['artist_id'],
            ['artist_name']
        )

        try:
            artist_table = artist_table_config.read_table(self.dataset_dir)
        except FileNotFoundError:
            # continue without artist names, the gender table may still be available
            artist_table = pd.DataFrame()
            artist_table_config.columns.pop()

        # add artist gender when available
        gender_table = self.load_artist_gender_json()
        if gender_table is not None:
            # replace artist table when missing
            if len(artist_table) == 0:
                artist_table = gender_table
            # merge artist table with gender
            else:
                artist_table = pd.merge(artist_table, gender_table, how='left', on='artist_id')
            artist_table_config.columns += ['artist_gender']

        # no need to continue if the previous failed
        if len(artist_table) == 0:
            return None

        # genres are matched on artist name, so they can only be added with names present
        if 'artist_name' in artist_table_config.columns:
            # attempt to load artist name / genre table
            artist_genres = self.load_artist_genres_allmusic()
            if artist_genres is not None:
                # merge artist table with genres
                artist_table = pd.merge(artist_table, artist_genres, how='left', on='artist_name')
                artist_table_config.columns += ['artist_genres']

        artist_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_artists.tsv.bz2'
        artist_table_config.file.options.compression = 'bz2'
        artist_table_config.num_records = len(artist_table)

        # store generated artist table
        artist_table_config.save_table(artist_table, self.dataset_dir)

        return artist_table_config

    def process_genres_allmusic(self) -> Optional[DatasetTableConfig]:
        """Process the allmusic genres table.

        Returns:
            the allmusic genres table configuration or None when the file is not found.
        """
        genres_allmusic_table_config = create_dataset_table_config(
            'genres_allmusic.txt',
            [], # row number is the primary key
            ['allmusic_genre']
        )
        try:
            genres_allmusic_table = genres_allmusic_table_config.read_table(self.dataset_dir)
        except FileNotFoundError:
            return None

        # reset index and rename to primary key; pandas labels the column that
        # reset_index creates for an unnamed index 'index'
        genres_allmusic_table.reset_index(inplace=True)
        genres_allmusic_table.rename(columns={'index': 'allmusic_id'}, inplace=True)

        genres_allmusic_table_config.primary_key = ['allmusic_id']
        genres_allmusic_table_config.file.name = \
            TABLE_FILE_PREFIX + self.dataset_name + '_genres_allmusic.tsv.bz2'
        genres_allmusic_table_config.file.options.compression = 'bz2'
        genres_allmusic_table_config.num_records = len(genres_allmusic_table)

        # store generated allmusic genre table
        genres_allmusic_table_config.save_table(genres_allmusic_table, self.dataset_dir)

        return genres_allmusic_table_config

    def process_track_table(self) -> Optional[DatasetTableConfig]:
        """Process the track table.

        Returns:
            the track table configuration or None when the file is not found.
        """
        track_table_config = create_dataset_table_config(
            'LFM-1b_tracks.txt',
            ['track_id'],
            ['track_name'],
            foreign_keys=['artist_id']
        )

        try:
            num_records = len(track_table_config.read_table(self.dataset_dir))
        except FileNotFoundError:
            return None

        track_table_config.num_records = num_records
        return track_table_config

    def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]:
        """Process the user-artist-count matrix.

        The user-item matrix is stored in a matlab file in CSR compatible format,
        together with the user and item indices. The matrix is converted
        to a dataframe and the indices for the indirection arrays are flattened.

        Returns:
            the matrix configuration or None when the matlab file is not found.
        """
        try:
            mat_file = os.path.join(self.dataset_dir, 'LFM-1b_LEs.mat')
            # load matrix as described in the paper
            csr_matrix, idx_users, idx_artists = _load_lfm_1b_mat(mat_file)
        except FileNotFoundError:
            return None

        matrix_name = 'user-artist-count'

        # create and save user indirection array (first element of every entry)
        user_list = [entry[0] for entry in idx_users]
        user_index_config = DatasetIndexConfig(
            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_user_indices.hdf5',
            'user_id',
            len(user_list)
        )
        user_index_config.save_indices(self.dataset_dir, user_list)

        # create and save artist indirection array (first element of every entry)
        artist_list = [entry[0] for entry in idx_artists]
        artist_index_config = DatasetIndexConfig(
            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_item_indices.hdf5',
            'artist_id',
            len(artist_list)
        )
        artist_index_config.save_indices(self.dataset_dir, artist_list)

        # convert csr to dataframe
        coo_matrix = pd.DataFrame.sparse.from_spmatrix(csr_matrix).sparse.to_coo()
        user_artist_matrix = pd.DataFrame()
        user_artist_matrix['user_id'] = coo_matrix.row
        user_artist_matrix['artist_id'] = coo_matrix.col
        user_artist_matrix['matrix_count'] = coo_matrix.data

        # create matrix table configuration
        user_artist_table_config = create_dataset_table_config(
            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_matrix.tsv.bz2',
            ['user_id', 'artist_id'],
            ['matrix_count'],
            compression='bz2',
            foreign_keys=['user_id', 'artist_id'],
            num_records=len(user_artist_matrix)
        )

        # store the resulting matrix
        user_artist_table_config.save_table(user_artist_matrix, self.dataset_dir)

        return DatasetMatrixConfig(
            user_artist_table_config,
            RatingMatrixConfig(
                user_artist_matrix['matrix_count'].min(),
                user_artist_matrix['matrix_count'].max(),
                DATASET_RATINGS_IMPLICIT
            ),
            user_index_config,
            artist_index_config
        )

    def process_user_additional_table(self) -> Optional[DatasetTableConfig]:
        """Process the user additional table.

        Returns:
            the user additional table configuration or None when the file is not found.
        """
        columns = [
            'user_novelty artist avg month',
            'user_novelty artist avg 6months',
            'user_novelty artist avg year',
            'user_mainstreaminess avg month',
            'user_mainstreaminess avg 6months',
            'user_mainstreaminess avg year',
            'user_mainstreaminess global',
            'user_count LEs',
            'user_count distinct tracks',
            'user_count distinct artists',
            'user_count LEs per week'
        ]

        # one column per weekday (1-7) and one per hour of the day (0-23)
        columns += ['user_relative LE per weekday' + str(i) for i in range(1, 8)]
        columns += ['user_relative LE per hour' + str(i) for i in range(0, 24)]

        user_additional_table_config = create_dataset_table_config(
            'LFM-1b_users_additional.txt',
            ['user_id'],
            columns,
            header=True
        )

        try:
            num_records = len(user_additional_table_config.read_table(self.dataset_dir))
        except FileNotFoundError:
            return None

        user_additional_table_config.num_records = num_records
        return user_additional_table_config

    def process_user_genre_allmusic_no_pc(self) -> Optional[DatasetTableConfig]:
        """Process the user allmusic genre table.

        Reads 'LFM-1b_UGP_noPC_allmusic.txt', which has one 'noPC_' prefixed column
        for every allmusic genre.

        Returns:
            the user allmusic genre table configuration or None when the file is not found.
        """
        columns = ['noPC_' + genre_name for genre_name in ALL_MUSIC_GENRES]

        user_genre_allmusic_no_pc_config = create_dataset_table_config(
            'LFM-1b_UGP_noPC_allmusic.txt',
            ['user_id'],
            columns,
            header=True
        )
        try:
            num_records = len(user_genre_allmusic_no_pc_config.read_table(self.dataset_dir))
        except FileNotFoundError:
            return None

        user_genre_allmusic_no_pc_config.num_records = num_records
        return user_genre_allmusic_no_pc_config

    def process_user_genre_allmusic_weighted_pc(self) -> Optional[DatasetTableConfig]:
        """Process the user allmusic genre table with weighted play count.

        Reads 'LFM-1b_UGP_weightedPC_allmusic.txt', which has one 'weightedPC_'
        prefixed column for every allmusic genre.

        Returns:
            the user allmusic genre table configuration or None when the file is not found.
        """
        columns = ['weightedPC_' + genre_name for genre_name in ALL_MUSIC_GENRES]

        # the weighted profile has its own file, it must not reuse the noPC file
        user_genre_allmusic_weighted_pc_config = create_dataset_table_config(
            'LFM-1b_UGP_weightedPC_allmusic.txt',
            ['user_id'],
            columns,
            header=True
        )
        try:
            num_records = len(user_genre_allmusic_weighted_pc_config.read_table(self.dataset_dir))
        except FileNotFoundError:
            return None

        user_genre_allmusic_weighted_pc_config.num_records = num_records
        return user_genre_allmusic_weighted_pc_config
DatasetProcessor for the LastFM-1B dataset.
The dataset and UGP (user genre profile) can be downloaded from the website below. http://www.cp.jku.at/datasets/LFM-1b/
The enriched artist gender information can be retrieved from: https://zenodo.org/record/3748787#.YowEBqhByUk
The processor handles the following files:
LFM-1b_albums.txt (optional) LFM-1b_artist_genres_allmusic.txt (optional) LFM-1b_artists.txt (optional) LFM-1b_LEs.mat (required) LFM-1b_LEs.txt (required) LFM-1b_tracks.txt (optional) LFM-1b_UGP_noPC_allmusic.txt (optional) LFM-1b_UGP_weightedPC_allmusic.txt (optional) LFM-1b_users.txt (optional) LFM-1b_users_additional.txt (optional) lfm-gender.json (optional)
def create_listening_events_config(self) -> Optional[DatasetTableConfig]:
    """Build the table configuration for the listening events file.

    Returns:
        the configuration of the 'LFM-1b_LEs.txt' table, keyed on user, artist,
        album and track id with a single timestamp column.
    """
    primary_key = ['user_id', 'artist_id', 'album_id', 'track_id']
    event_columns = ['timestamp']
    return create_dataset_table_config('LFM-1b_LEs.txt', primary_key, event_columns)
Create the listening event table configuration.
Returns: the configuration of the listening event table.
def create_user_table_config(self) -> DatasetTableConfig:
    """Build the table configuration for the user file.

    Returns:
        the configuration of the 'LFM-1b_users.txt' table, keyed on user id
        and created with header=True.
    """
    user_columns = [
        'user_country',
        'user_age',
        'user_gender',
        'user_plays',
        'user_registered',
    ]
    return create_dataset_table_config('LFM-1b_users.txt', ['user_id'], user_columns, header=True)
Create the user table configuration.
Returns: the configuration of the user table.
def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
    """Get the available matrix processors paired with their names.

    Returns:
        a list with one (name, processor) tuple for the user-artist-count matrix.
    """
    user_artist_entry = ('user-artist-count', self.process_user_artist_matrix)
    return [user_artist_entry]
Get matrix configuration processors.
Returns: a list containing the user-artist-count matrix processor.
def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
    """Get the available table processors paired with their names.

    Returns:
        the (name, processor) tuples of the base LFM processor, followed by the
        album, allmusic genre, artist, track, user additional and both user
        allmusic genre table processors.
    """
    base_configs = DatasetProcessorLFM.get_table_configs(self)
    lfm_1b_configs = [
        ('album', self.process_album_table),
        ('allmusic genre', self.process_genres_allmusic),
        ('artist', self.process_artist_table),
        ('track', self.process_track_table),
        ('user additional', self.process_user_additional_table),
        ('user allmusic noPC', self.process_user_genre_allmusic_no_pc),
        ('user allmusic weightedPC', self.process_user_genre_allmusic_weighted_pc),
    ]
    return base_configs + lfm_1b_configs
Get table configuration processors.
Returns: a list containing the album, allmusic genre, artist, track and user table processors.
def load_artist_gender_json(self) -> Optional[pd.DataFrame]:
    """Read the artist gender json file from the dataset directory.

    Returns:
        a table with the 'artist_id' and 'artist_gender' columns, or None when
        'lfm-gender.json' is not found.
    """
    json_path = os.path.join(self.dataset_dir, 'lfm-gender.json')
    try:
        raw_table = pd.read_json(json_path, orient='index')
    except FileNotFoundError:
        return None

    # the json keys form the index: turn them into a column and label both columns
    column_names = {'index': 'artist_id', 0: 'artist_gender'}
    return raw_table.reset_index().rename(columns=column_names)
Load the artist gender json file.
Returns: the loaded artist id/gender table or None on failure.
def load_artist_genres_allmusic(self) -> Optional[pd.DataFrame]:
    """Read 'LFM-1b_artist_genres_allmusic.txt' into an artist name/genres table.

    Every row of the tab-separated file holds an artist name followed by
    allmusic genre ids. The ids are translated to the names in ALL_MUSIC_GENRES
    and joined into one '|' separated string per artist.

    Returns:
        a table with 'artist_name' and 'artist_genres' columns or None when
            the file is not found.
    """
    # one column for the name plus one for every genre id that may follow it
    genre_slots = [str(slot) for slot in range(len(ALL_MUSIC_GENRES))]
    file_path = os.path.join(self.dataset_dir, 'LFM-1b_artist_genres_allmusic.txt')

    try:
        raw_table = pd.read_csv(file_path, sep='\t', names=['artist_name'] + genre_slots)
    except FileNotFoundError:
        return None

    # keep only the first row of every artist name
    raw_table = raw_table.drop_duplicates(subset='artist_name')

    # separate the artist names from the genre id columns
    artist_genres = raw_table[['artist_name']].copy()
    genre_ids = raw_table.drop('artist_name', axis=1)

    def to_genre_name(genre_id):
        return ALL_MUSIC_GENRES[int(genre_id)]

    # translate every present genre id to its name, missing values are left alone
    for slot in genre_ids.columns:
        genre_ids[slot] = genre_ids[slot].map(to_genre_name, na_action='ignore')

    # concatenate the genre names of each row into a single column
    artist_genres['artist_genres'] = genre_ids.apply(
        lambda row: row.str.cat(sep='|'),
        axis=1
    )

    return artist_genres
Load the artist allmusic genres file.
Returns: the loaded artist name/genre table or None on failure.
def process_album_table(self) -> Optional[DatasetTableConfig]:
    """Create the album table configuration and count the album records.

    Returns:
        the configuration of 'LFM-1b_albums.txt' with the number of records
            filled in, or None when the file is not found.
    """
    config = create_dataset_table_config(
        'LFM-1b_albums.txt',
        ['album_id'],
        ['album_name'],
        foreign_keys=['artist_id']
    )

    try:
        album_table = config.read_table(self.dataset_dir)
    except FileNotFoundError:
        return None

    config.num_records = len(album_table)
    return config
Process the album table.
Returns: the album table configuration or None on failure.
def process_artist_table(self) -> Optional[DatasetTableConfig]:
    """Generate the artist table from the artist, gender and genre files.

    The base table comes from 'LFM-1b_artists.txt'. The gender table is
    merged in on 'artist_id' when it can be loaded, or takes the place of the
    artist table when that one is missing. Genres are merged in on
    'artist_name', which is only attempted when the artist names are present.
    The result is stored as a bz2 compressed table in the dataset directory.

    Returns:
        the artist table configuration or None when neither the artist nor
            the gender table produced any records.
    """
    config = create_dataset_table_config(
        'LFM-1b_artists.txt',
        ['artist_id'],
        ['artist_name']
    )

    try:
        artists = config.read_table(self.dataset_dir)
    except FileNotFoundError:
        # without the artist file there is no 'artist_name' column to describe
        artists = pd.DataFrame()
        config.columns.pop()

    genders = self.load_artist_gender_json()
    if genders is not None:
        if len(artists) == 0:
            # the gender table is all there is, use it as the artist table
            artists = genders
        else:
            artists = pd.merge(artists, genders, how='left', on='artist_id')
        config.columns += ['artist_gender']

    # neither source delivered any artists
    if len(artists) == 0:
        return None

    # genres are keyed on the artist name, so that column has to be available
    if 'artist_name' in config.columns:
        genres = self.load_artist_genres_allmusic()
        if genres is not None:
            artists = pd.merge(artists, genres, how='left', on='artist_name')
            config.columns += ['artist_genres']

    # describe the generated file instead of the original artist file
    config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_artists.tsv.bz2'
    config.file.options.compression = 'bz2'
    config.num_records = len(artists)

    config.save_table(artists, self.dataset_dir)

    return config
Process the artist table.
Extends the table with artist gender and genres information when available.
Returns: the artist table configuration or None on failure.
def process_genres_allmusic(self) -> Optional[DatasetTableConfig]:
    """Process the allmusic genres table.

    Reads 'genres_allmusic.txt', where the row number acts as the genre id,
    turns that row number into an explicit 'allmusic_id' primary key column
    and stores the result as a bz2 compressed table in the dataset directory.

    Returns:
        the allmusic genres table configuration or None when the file is not found.
    """
    genres_allmusic_table_config = create_dataset_table_config(
        'genres_allmusic.txt',
        [], # row number is the primary key
        ['allmusic_genre']
    )
    try:
        genres_allmusic_table = genres_allmusic_table_config.read_table(self.dataset_dir)
    except FileNotFoundError:
        return None

    # move the row number into a column and name it after the primary key;
    # reset_index labels the new column 'index' (renaming label 0 matched nothing)
    genres_allmusic_table.reset_index(inplace=True)
    genres_allmusic_table.rename(columns={'index': 'allmusic_id'}, inplace=True)

    # describe the generated file instead of the original genres file
    genres_allmusic_table_config.primary_key = ['allmusic_id']
    genres_allmusic_table_config.file.name = \
        TABLE_FILE_PREFIX + self.dataset_name + '_genres_allmusic.tsv.bz2'
    genres_allmusic_table_config.file.options.compression = 'bz2'
    genres_allmusic_table_config.num_records = len(genres_allmusic_table)

    # store generated allmusic genre table
    genres_allmusic_table_config.save_table(genres_allmusic_table, self.dataset_dir)

    return genres_allmusic_table_config
Process the allmusic genres table.
Returns: the allmusic genres table configuration or None on failure.
def process_track_table(self) -> Optional[DatasetTableConfig]:
    """Create the track table configuration and count the track records.

    Returns:
        the configuration of 'LFM-1b_tracks.txt' with the number of records
            filled in, or None when the file is not found.
    """
    config = create_dataset_table_config(
        'LFM-1b_tracks.txt',
        ['track_id'],
        ['track_name'],
        foreign_keys=['artist_id']
    )

    try:
        track_table = config.read_table(self.dataset_dir)
    except FileNotFoundError:
        return None

    config.num_records = len(track_table)
    return config
Process the track table.
Returns: the track table configuration or None on failure.
def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]:
    """Generate the user-artist-count matrix from 'LFM-1b_LEs.mat'.

    The loaded user and artist index arrays are flattened and stored as the
    user/item indirection arrays. The sparse matrix itself is converted to a
    table with 'user_id', 'artist_id' and 'matrix_count' columns, which is
    stored with bz2 compression in the dataset directory.

    Returns:
        the matrix configuration, including the minimum and maximum count,
            or None when the matlab file is not found.
    """
    mat_path = os.path.join(self.dataset_dir, 'LFM-1b_LEs.mat')
    try:
        # load matrix as described in the paper
        sparse_counts, raw_user_ids, raw_artist_ids = _load_lfm_1b_mat(mat_path)
    except FileNotFoundError:
        return None

    matrix_name = 'user-artist-count'
    file_prefix = TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name

    # user indirection array: keep the first element of every loaded entry
    user_ids = [entry[0] for entry in raw_user_ids]
    user_index_config = DatasetIndexConfig(
        file_prefix + '_user_indices.hdf5',
        'user_id',
        len(user_ids)
    )
    user_index_config.save_indices(self.dataset_dir, user_ids)

    # artist indirection array: keep the first element of every loaded entry
    artist_ids = [entry[0] for entry in raw_artist_ids]
    artist_index_config = DatasetIndexConfig(
        file_prefix + '_item_indices.hdf5',
        'artist_id',
        len(artist_ids)
    )
    artist_index_config.save_indices(self.dataset_dir, artist_ids)

    # go through a sparse dataframe to obtain the coordinate representation
    coordinates = pd.DataFrame.sparse.from_spmatrix(sparse_counts).sparse.to_coo()
    count_table = pd.DataFrame()
    count_table['user_id'] = coordinates.row
    count_table['artist_id'] = coordinates.col
    count_table['matrix_count'] = coordinates.data

    matrix_table_config = create_dataset_table_config(
        file_prefix + '_matrix.tsv.bz2',
        ['user_id', 'artist_id'],
        ['matrix_count'],
        compression='bz2',
        foreign_keys=['user_id', 'artist_id'],
        num_records=len(count_table)
    )

    # store the resulting matrix
    matrix_table_config.save_table(count_table, self.dataset_dir)

    counts = count_table['matrix_count']
    rating_config = RatingMatrixConfig(counts.min(), counts.max(), DATASET_RATINGS_IMPLICIT)

    return DatasetMatrixConfig(
        matrix_table_config,
        rating_config,
        user_index_config,
        artist_index_config
    )
Process the user-artist-count matrix.
The user-item matrix is stored in a MATLAB file in a CSR-compatible format, together with the user and item indices. The matrix is converted to a dataframe and the indices for the indirection arrays are flattened.
Returns: the matrix configuration or None on failure.
def process_user_additional_table(self) -> Optional[DatasetTableConfig]:
    """Create the user additional table configuration and count its records.

    The column list consists of eleven fixed user statistics, followed by the
    relative listening event columns for weekdays 1-7 and hours 0-23.

    Returns:
        the configuration of 'LFM-1b_users_additional.txt' with the number of
            records filled in, or None when the file is not found.
    """
    columns = [
        'user_novelty artist avg month',
        'user_novelty artist avg 6months',
        'user_novelty artist avg year',
        'user_mainstreaminess avg month',
        'user_mainstreaminess avg 6months',
        'user_mainstreaminess avg year',
        'user_mainstreaminess global',
        'user_count LEs',
        'user_count distinct tracks',
        'user_count distinct artists',
        'user_count LEs per week'
    ]
    columns.extend(f'user_relative LE per weekday{day}' for day in range(1, 8))
    columns.extend(f'user_relative LE per hour{hour}' for hour in range(24))

    config = create_dataset_table_config(
        'LFM-1b_users_additional.txt',
        ['user_id'],
        columns,
        header=True
    )

    try:
        user_additional_table = config.read_table(self.dataset_dir)
    except FileNotFoundError:
        return None

    config.num_records = len(user_additional_table)
    return config
Process the user additional table.
Returns: the user additional table configuration or None on failure.
def process_user_genre_allmusic_no_pc(self) -> Optional[DatasetTableConfig]:
    """Create the user allmusic genre (noPC) table configuration and count its records.

    The table has one 'noPC_' prefixed column for every name in ALL_MUSIC_GENRES.

    Returns:
        the configuration of 'LFM-1b_UGP_noPC_allmusic.txt' with the number of
            records filled in, or None when the file is not found.
    """
    genre_columns = ['noPC_' + genre_name for genre_name in ALL_MUSIC_GENRES]

    config = create_dataset_table_config(
        'LFM-1b_UGP_noPC_allmusic.txt',
        ['user_id'],
        genre_columns,
        header=True
    )

    try:
        user_genre_table = config.read_table(self.dataset_dir)
    except FileNotFoundError:
        return None

    config.num_records = len(user_genre_table)
    return config
Process the user allmusic genre table.
Returns: the user allmusic genre table configuration or None on failure.
def process_user_genre_allmusic_weighted_pc(self) -> Optional[DatasetTableConfig]:
    """Process the user allmusic genre table with weighted play count.

    The table has one 'weightedPC_' prefixed column for every name in
    ALL_MUSIC_GENRES and is read from 'LFM-1b_UGP_weightedPC_allmusic.txt'.

    Returns:
        the user allmusic genre table configuration with the number of records
            filled in, or None when the file is not found.
    """
    columns = ['weightedPC_' + genre_name for genre_name in ALL_MUSIC_GENRES]

    # the weightedPC profile lives in its own file, not in the noPC one
    user_genre_allmusic_weighted_pc_config = create_dataset_table_config(
        'LFM-1b_UGP_weightedPC_allmusic.txt',
        ['user_id'],
        columns,
        header=True
    )

    try:
        user_genre_table = user_genre_allmusic_weighted_pc_config.read_table(self.dataset_dir)
    except FileNotFoundError:
        return None

    user_genre_allmusic_weighted_pc_config.num_records = len(user_genre_table)
    return user_genre_allmusic_weighted_pc_config
Process the user allmusic genre table with weighted play count.
Returns: the user allmusic genre table configuration or None on failure.