src.fairreckitlib.data.set.processor.dataset_processor_lfm360k
This module contains the class to process the LastFM-360K dataset.
Classes:
DatasetProcessorLFM360K: data processor implementation for the LFM-360K dataset.
This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)
1"""This modules contains the class to process the LastFM-360K dataset. 2 3Classes: 4 5 DatasetProcessorLFM360K: data processor implementation for the LFM-360K dataset. 6 7This program has been developed by students from the bachelor Computer Science at 8Utrecht University within the Software Project course. 9© Copyright Utrecht University (Department of Information and Computing Sciences) 10""" 11 12import os 13from typing import Callable, List, Optional, Tuple 14 15import pandas as pd 16 17from ..dataset_config import create_dataset_table_config, DatasetMatrixConfig, DatasetTableConfig 18from ..dataset_constants import TABLE_FILE_PREFIX 19from .dataset_processor_lfm import DatasetProcessorLFM 20 21 22class DatasetProcessorLFM360K(DatasetProcessorLFM): 23 """DatasetProcessor for the LastFM-360K dataset. 24 25 The dataset can be downloaded from the website below. 26 https://www.upf.edu/web/mtg/lastfm360k 27 28 The enriched artist gender information can be retrieved from: 29 https://zenodo.org/record/3748787#.YowEBqhByUk 30 31 The processor handles the following files: 32 33 usersha1-artmbid-artname-plays.tsv (required) 34 usersha1-profile.tsv (optional) 35 lfm-360-gender.json (optional) 36 """ 37 38 def __init__(self, dataset_dir: str, dataset_name: str): 39 """Construct the DatasetProcessorLFM360K. 40 41 Args: 42 dataset_name: path of the dataset directory. 43 dataset_name: name of the dataset (processor). 44 """ 45 DatasetProcessorLFM.__init__(self, dataset_dir, dataset_name) 46 # buffer for the user sha and artist name lists 47 self.user_list = None 48 self.artist_list = None 49 # buffer for the artist name/musicbrainzID dataframe 50 self.artist_mb_id = None 51 52 def create_listening_events_config(self) -> Optional[DatasetTableConfig]: 53 """Create the listening event table configuration. 54 55 No listening events are available for this dataset. 56 57 Returns: 58 None. 
59 """ 60 return None 61 62 def create_user_table_config(self) -> DatasetTableConfig: 63 """Create the user table configuration. 64 65 The base user configuration that contains the generated user ids 66 and corresponding user sha. 67 68 Returns: 69 the configuration of the user table. 70 """ 71 return create_dataset_table_config( 72 TABLE_FILE_PREFIX + self.dataset_name + '_users.tsv.bz2', 73 ['user_id'], 74 ['user_sha'], 75 compression='bz2', 76 num_records=len(self.user_list) 77 ) 78 79 def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]: 80 """Get matrix configuration processors. 81 82 Returns: 83 a list containing the user-artist-count matrix processor. 84 """ 85 return [('user-artist-count', self.process_user_artist_matrix)] 86 87 def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]: 88 """Get table configuration processors. 89 90 Returns: 91 a list containing the artist and user table processors. 92 """ 93 return DatasetProcessorLFM.get_table_configs(self) + [('artist',self.process_artist_table)] 94 95 def load_artist_gender_json(self) -> Optional[pd.DataFrame]: 96 """Load the artist gender json file. 97 98 Returns: 99 the loaded artist musicbrainzID/gender table or None on failure. 100 """ 101 try: 102 gender_table = pd.read_json( 103 os.path.join(self.dataset_dir, 'lfm-360-gender.json'), 104 orient='index' 105 ) 106 gender_table.reset_index(inplace=True) 107 gender_table.rename(columns={'index': 'artist_mbID', 0: 'artist_gender'}, inplace=True) 108 return gender_table 109 except FileNotFoundError: 110 return None 111 112 def load_user_table(self) -> Optional[pd.DataFrame]: 113 """Load the original user table. 114 115 Changes the contents of the age and gender columns to be more user-friendly, 116 and the contents of the country column to ISO 3166 Alpha-2 country codes. 117 118 Returns: 119 the loaded user table on None on failure. 
120 """ 121 user_table_columns = [ 122 'user_sha', 123 'user_gender', 124 'user_age', 125 'user_country', 126 'user_signup' 127 ] 128 129 try: 130 # load original user table 131 user_table = pd.read_table( 132 os.path.join(self.dataset_dir, 'usersha1-profile.tsv'), 133 names=user_table_columns, 134 sep='\t' 135 ) 136 except FileNotFoundError: 137 return None 138 139 # mask user age not between 1-100 as NaN 140 user_table['user_age'].mask(user_table['user_age'].lt(1), inplace=True) 141 user_table['user_age'].mask(user_table['user_age'].gt(100), inplace=True) 142 143 # convert gender to more user-friendly names 144 user_table['user_gender'].replace({'m': 'Male', 'f': 'Female'}, inplace=True) 145 146 # convert country to ISO 3166 Alpha-2 country code 147 user_table['user_country'].replace({ 148 'Afghanistan': 'AF', 'Albania': 'AL', 'Algeria': 'DZ', 'American Samoa': 'AS', 149 'Andorra': 'AD', 'Angola': 'AO', 'Anguilla': 'AI', 'Antarctica': 'AQ', 150 'Antigua and Barbuda': 'AG', 'Argentina': 'AR', 'Armenia': 'AM', 'Aruba': 'AW', 151 'Australia': 'AU', 'Austria': 'AT', 'Azerbaijan': 'AZ', 152 'Bahamas': 'BS', 'Bahrain': 'BH', 'Bangladesh': 'BD', 'Barbados': 'BB', 153 'Belarus': 'BY', 'Belgium': 'BE', 'Belize': 'BZ', 'Benin': 'BJ', 'Bermuda': 'BM', 154 'Bhutan': 'BT', 'Bolivia': 'BO', 'Bosnia and Herzegovina': 'BA', 'Botswana': 'BW', 155 'Bouvet Island': 'BV', 'Brazil': 'BR', 'British Indian Ocean Territory': 'IO', 156 'Brunei Darussalam': 'BN', 'Bulgaria': 'BG', 'Burkina Faso': 'BF', 'Burundi': 'BI', 157 'Cambodia': 'KH', 'Cameroon': 'CM', 'Canada': 'CA', 'Cape Verde': 'CV', 158 'Cayman Islands': 'KY', 'Central African Republic': 'CF', 'Chad': 'TD', 'Chile': 'CL', 159 'China': 'CN', 'Christmas Island': 'CX', 'Cocos (Keeling) Islands': 'CC', 160 'Colombia': 'CO', 'Comoros': 'KM', 'Congo': 'CG', 'Czech Republic': 'CZ', 161 'Congo, the Democratic Republic of the': 'CD', 'Cook Islands': 'CK', 'Cyprus': 'CY', 162 'Costa Rica': 'CR', 'Cote D\'Ivoire': 'CI', 'Croatia': 'HR', 
'Cuba': 'CU', 163 'Denmark': 'DK', 'Djibouti': 'DJ', 'Dominica': 'DM', 'Dominican Republic': 'DO', 164 'Ecuador': 'EC', 'Egypt': 'EG', 'El Salvador': 'SV', 'Equatorial Guinea': 'GQ', 165 'Eritrea': 'ER', 'Estonia': 'EE', 'Ethiopia': 'ET', 166 'Falkland Islands (Malvinas)': 'FK', 'Faroe Islands': 'FO', 'Fiji': 'FJ', 167 'Finland': 'FI', 'France': 'FR', 'French Guiana': 'GF', 'French Polynesia': 'PF', 168 'French Southern Territories': 'TF', 169 'Gabon': 'GA', 'Gambia': 'GM', 'Georgia': 'GE', 'Germany': 'DE', 'Ghana': 'GH', 170 'Grenada': 'GD', 'Gibraltar': 'GI', 'Greece': 'GR', 'Greenland': 'GL', 171 'Guadeloupe': 'GP', 'Guam': 'GU', 'Guatemala': 'GT', 'Guinea-Bissau': 'GW', 172 'Guyana': 'GY', 173 'Haiti': 'HT', 'Heard Island and Mcdonald Islands': 'HM', 'Hungary': 'HU', 174 'Holy See (Vatican City State)': 'VA', 'Honduras': 'HN', 'Hong Kong': 'HK', 175 'Iceland': 'IS', 'India': 'IN', 'Indonesia': 'ID', 'Iran, Islamic Republic of': 'IR', 176 'Iraq': 'IQ', 'Ireland': 'IE', 'Israel': 'IL', 'Italy': 'IT', 177 'Jamaica': 'JM', 'Japan': 'JP', 'Jordan': 'JO', 178 'Kazakhstan': 'KZ', 'Kenya': 'KE', 'Kiribati': 'KI', 'Kyrgyzstan': 'KG', 179 'Korea, Democratic People\'s Republic of':'KP','Korea, Republic of':'KR','Kuwait':'KW', 180 'Lao People\'s Democratic Republic': 'LA','Latvia': 'LV','Lebanon': 'LB', 181 'Lesotho': 'LS', 'Liberia': 'LR', 'Libyan Arab Jamahiriya': 'LY', 182 'Liechtenstein': 'LI', 'Lithuania': 'LT', 'Luxembourg': 'LU', 183 'Macao': 'MO', 'Macedonia': 'MK', 'Madagascar': 'MG', 'Mali': 'ML', 'Malta': 'MT', 184 'Malaysia': 'MY', 'Malawi': 'MW', 'Maldives': 'MV', 'Mauritania': 'MR', 185 'Mauritius': 'MU', 'Marshall Islands': 'MH', 'Martinique': 'MQ', 'Mayotte': 'YT', 186 'Mexico': 'MX', 'Micronesia, Federated States of': 'FM','Moldova': 'MD','Monaco': 'MC', 187 'Mongolia': 'MN', 'Montenegro': 'ME', 'Montserrat': 'MS', 'Morocco': 'MA', 188 'Mozambique': 'MZ', 'Myanmar': 'MM', 189 'Namibia': 'NA', 'Nauru': 'NR', 'Nepal': 'NP', 190 'Netherlands': 'NL', 
'Netherlands Antilles': 'NL', 'New Caledonia': 'NC', 191 'New Zealand': 'NZ', 'Nicaragua': 'NI', 'Niger': 'NE', 'Nigeria': 'NG', 'Niue': 'NU', 192 'Norfolk Island': 'NF', 'Northern Mariana Islands': 'MP', 'Norway': 'NO', 193 'Oman': 'OM', 194 'Pakistan': 'PK', 'Palau': 'PW','Palestinian Territory, Occupied': 'PS','Panama': 'PA', 195 'Papua New Guinea': 'PG', 'Paraguay': 'PY', 'Peru': 'PE', 'Philippines': 'PH', 196 'Pitcairn': 'PN', 'Poland': 'PL', 'Portugal': 'PT', 'Puerto Rico': 'PR', 197 'Qatar': 'QA', 198 'Reunion': 'RE', 'Romania': 'RO', 'Russian Federation': 'RU', 'Rwanda': 'RW', 199 'Saint Helena': 'SH', 'Saint Kitts and Nevis': 'KN', 'Saint Lucia': 'LC', 200 'Saint Pierre and Miquelon': 'PM', 'Saint Vincent and the Grenadines': 'VC', 201 'Samoa': 'WS', 'San Marino': 'SM', 'Sao Tome and Principe': 'ST', 'Saudi Arabia': 'SA', 202 'Senegal': 'SN', 'Serbia': 'RS', 'Seychelles': 'SC', 'Sierra Leone': 'SL', 203 'Singapore': 'SG', 'Solomon Islands': 'SB', 'Somalia': 'SO', 'South Africa': 'ZA', 204 'Slovakia': 'SK','Slovenia': 'SI','South Georgia and the South Sandwich Islands': 'GS', 205 'Spain': 'ES', 'Sri Lanka': 'LK', 'Sudan': 'SD', 'Suriname': 'SR', 206 'Svalbard and Jan Mayen': 'SJ', 'Syrian Arab Republic': 'SY', 'Swaziland': 'SZ', 207 'Sweden': 'SE', 'Switzerland': 'CH', 208 'Taiwan': 'TW', 'Tajikistan': 'TJ', 'Tanzania, United Republic of': 'TZ', 209 'Thailand': 'TH', 'Timor-Leste': 'TL', 'Togo': 'TG', 'Tokelau': 'TK', 'Tonga': 'TO', 210 'Trinidad and Tobago': 'TT', 'Tunisia': 'TN', 'Turkey': 'TR', 'Turkmenistan': 'TM', 211 'Turks and Caicos Islands': 'TC', 'Tuvalu': 'TV', 212 'Uganda': 'UG', 'Ukraine': 'UA', 'United Arab Emirates': 'AE', 213 'United Kingdom':'GB','United States':'US','United States Minor Outlying Islands':'UM', 214 'Uruguay': 'UY', 'Uzbekistan': 'UZ', 215 'Vanuatu': 'VU', 'Venezuela': 'VE', 'Viet Nam': 'VN', 'Virgin Islands, British': 'VG', 216 'Virgin Islands, U.s.': 'VI', 217 'Yemen': 'YE', 218 'Wallis and Futuna': 'WF', 'Western Sahara': 
'EH', 219 'Zambia': 'ZM', 'Zimbabwe': 'ZW' 220 }, inplace=True) 221 222 return user_table 223 224 def process_artist_table(self) -> Optional[DatasetTableConfig]: 225 """Process the artist table. 226 227 Creates the artist table with the musicbrainzID and gender information when available. 228 229 Returns: 230 the artist table configuration or None on failure. 231 """ 232 artist_key = ['artist_id'] 233 artist_columns = ['artist_name'] 234 235 # connect artist id to name 236 artist_table = pd.DataFrame( 237 list(enumerate(self.artist_list)), 238 columns=artist_key + artist_columns 239 ) 240 241 # merge the artist musicbrainzID on name 242 artist_table = pd.merge(artist_table, self.artist_mb_id, how='left', on='artist_name') 243 artist_table['artist_mbID'].fillna(-1, inplace=True) 244 artist_columns += ['artist_mbID'] 245 246 artist_gender = self.load_artist_gender_json() 247 if artist_gender is not None: 248 # merge artists with gender and update columns 249 artist_table = pd.merge(artist_table, artist_gender, how='left', on='artist_mbID') 250 artist_columns += ['artist_gender'] 251 252 # create artist table configuration 253 artist_table_config = create_dataset_table_config( 254 TABLE_FILE_PREFIX + self.dataset_name + '_artists.tsv.bz2', 255 artist_key, 256 artist_columns, 257 compression='bz2', 258 num_records=len(self.artist_list) 259 ) 260 261 # store the generated artist table 262 artist_table_config.save_table(artist_table, self.dataset_dir) 263 264 return artist_table_config 265 266 def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]: 267 """Process the user-artist-count matrix. 268 269 The user-item matrix is stored in a file that also contains a musicbrainzID. 270 The users are hashes and the items are names, both are converted to integers 271 to comply to the CSR compatible format. In addition, any rows that contain 272 corrupt data are removed in the process. 273 274 Returns: 275 the matrix configuration or None on failure. 
276 """ 277 try: 278 dataframe = pd.read_table( 279 os.path.join(self.dataset_dir, 'usersha1-artmbid-artname-plays.tsv'), 280 names=['user_sha', 'artist_mbID', 'artist_name', 'matrix_count'] 281 ) 282 except FileNotFoundError: 283 return None 284 285 # remove rows from a user that is not a hash 286 dataframe = dataframe[dataframe['user_sha'] != 'sep 20, 2008'] 287 288 # map users/items to category and ratings to be floating-point 289 dataframe['user_sha'] = dataframe['user_sha'].astype("category") 290 dataframe['artist_name'] = dataframe['artist_name'].astype("category") 291 dataframe['matrix_count'] = dataframe['matrix_count'].astype(float) 292 293 # remove rows that contain items that failed to map to category 294 dataframe = dataframe[dataframe['artist_name'].cat.codes >= 0] 295 # remove rows that have unusable ratings 296 dataframe = dataframe[dataframe['matrix_count'] > 0] 297 298 dataframe.drop_duplicates(subset=['user_sha', 'artist_name'], inplace=True) 299 300 # extract user/item indirection arrays 301 self.user_list = list(dataframe['user_sha'].cat.categories) 302 self.artist_list = list(dataframe['artist_name'].cat.categories) 303 304 # extract artist name/musicbrainzID combinations 305 self.artist_mb_id = dataframe[['artist_name', 'artist_mbID']] 306 # remove duplicates combinations 307 self.artist_mb_id = self.artist_mb_id.drop_duplicates().dropna() 308 # remove duplicates where the artist has more than one musicbrainzID 309 self.artist_mb_id = self.artist_mb_id.drop_duplicates(subset='artist_name') 310 311 # add the correct user/item integers 312 dataframe['user_id'] = dataframe['user_sha'].cat.codes.copy() 313 dataframe['artist_id'] = dataframe['artist_name'].cat.codes.copy() 314 315 # create matrix by removing other columns 316 user_artist_matrix = dataframe[['user_id', 'artist_id', 'matrix_count']] 317 user_artist_matrix_table_config = create_dataset_table_config( 318 TABLE_FILE_PREFIX + self.dataset_name + '_user-artist-count_matrix.tsv.bz2', 319 
['user_id', 'artist_id'], 320 ['matrix_count'], 321 compression='bz2', 322 foreign_keys=['user_id', 'artist_id'] 323 ) 324 325 # store the resulting matrix 326 user_artist_matrix_table_config.save_table(user_artist_matrix, self.dataset_dir) 327 328 return self.process_matrix(user_artist_matrix_table_config) 329 330 def process_user_table(self) -> Optional[DatasetTableConfig]: 331 """Process the user table. 332 333 Extends the original user table with unique user ids. 334 335 Returns: 336 the user table configuration or None on failure. 337 """ 338 user_table_config = self.create_user_table_config() 339 # connect user id to sha 340 user_sha_ids = pd.DataFrame( 341 list(enumerate(self.user_list)), 342 columns=['user_id', 'user_sha'] 343 ) 344 345 # load original user table and when available, add it to user id/sha 346 user_table = self.load_user_table() 347 if user_table is None: 348 user_table = user_sha_ids 349 else: 350 for i in range(1, len(user_table.columns)): 351 user_table_config.columns += [user_table.columns[i]] 352 353 # join user table with user ids 354 user_table = pd.merge(user_sha_ids, user_table, how='left', on='user_sha') 355 356 # fill unknown user age with -1 and cast back to int 357 user_table['user_age'].fillna(-1.0, inplace=True) 358 user_table['user_age'] = user_table['user_age'].astype(int) 359 360 # store the generated user table 361 user_table_config.save_table(user_table, self.dataset_dir) 362 363 return user_table_config
class DatasetProcessorLFM360K(DatasetProcessorLFM):
    """DatasetProcessor for the LastFM-360K dataset.

    The dataset can be downloaded from the website below.
    https://www.upf.edu/web/mtg/lastfm360k

    The enriched artist gender information can be retrieved from:
    https://zenodo.org/record/3748787#.YowEBqhByUk

    The processor handles the following files:

    usersha1-artmbid-artname-plays.tsv (required)
    usersha1-profile.tsv (optional)
    lfm-360-gender.json (optional)

    The user and artist tables rely on buffers that are filled by
    process_user_artist_matrix, so the matrix has to be processed first.
    """

    # country name as used in the original user table => ISO 3166 Alpha-2 country code
    # NOTE(review): 'Netherlands Antilles' is mapped to 'NL' while its own (withdrawn)
    # ISO code is 'AN'; kept as is because it may be deliberate -- confirm.
    _COUNTRY_CODES = {
        'Afghanistan': 'AF', 'Albania': 'AL', 'Algeria': 'DZ', 'American Samoa': 'AS',
        'Andorra': 'AD', 'Angola': 'AO', 'Anguilla': 'AI', 'Antarctica': 'AQ',
        'Antigua and Barbuda': 'AG', 'Argentina': 'AR', 'Armenia': 'AM', 'Aruba': 'AW',
        'Australia': 'AU', 'Austria': 'AT', 'Azerbaijan': 'AZ',
        'Bahamas': 'BS', 'Bahrain': 'BH', 'Bangladesh': 'BD', 'Barbados': 'BB',
        'Belarus': 'BY', 'Belgium': 'BE', 'Belize': 'BZ', 'Benin': 'BJ', 'Bermuda': 'BM',
        'Bhutan': 'BT', 'Bolivia': 'BO', 'Bosnia and Herzegovina': 'BA', 'Botswana': 'BW',
        'Bouvet Island': 'BV', 'Brazil': 'BR', 'British Indian Ocean Territory': 'IO',
        'Brunei Darussalam': 'BN', 'Bulgaria': 'BG', 'Burkina Faso': 'BF', 'Burundi': 'BI',
        'Cambodia': 'KH', 'Cameroon': 'CM', 'Canada': 'CA', 'Cape Verde': 'CV',
        'Cayman Islands': 'KY', 'Central African Republic': 'CF', 'Chad': 'TD', 'Chile': 'CL',
        'China': 'CN', 'Christmas Island': 'CX', 'Cocos (Keeling) Islands': 'CC',
        'Colombia': 'CO', 'Comoros': 'KM', 'Congo': 'CG', 'Czech Republic': 'CZ',
        'Congo, the Democratic Republic of the': 'CD', 'Cook Islands': 'CK', 'Cyprus': 'CY',
        'Costa Rica': 'CR', 'Cote D\'Ivoire': 'CI', 'Croatia': 'HR', 'Cuba': 'CU',
        'Denmark': 'DK', 'Djibouti': 'DJ', 'Dominica': 'DM', 'Dominican Republic': 'DO',
        'Ecuador': 'EC', 'Egypt': 'EG', 'El Salvador': 'SV', 'Equatorial Guinea': 'GQ',
        'Eritrea': 'ER', 'Estonia': 'EE', 'Ethiopia': 'ET',
        'Falkland Islands (Malvinas)': 'FK', 'Faroe Islands': 'FO', 'Fiji': 'FJ',
        'Finland': 'FI', 'France': 'FR', 'French Guiana': 'GF', 'French Polynesia': 'PF',
        'French Southern Territories': 'TF',
        'Gabon': 'GA', 'Gambia': 'GM', 'Georgia': 'GE', 'Germany': 'DE', 'Ghana': 'GH',
        'Grenada': 'GD', 'Gibraltar': 'GI', 'Greece': 'GR', 'Greenland': 'GL',
        'Guadeloupe': 'GP', 'Guam': 'GU', 'Guatemala': 'GT', 'Guinea-Bissau': 'GW',
        'Guyana': 'GY',
        'Haiti': 'HT', 'Heard Island and Mcdonald Islands': 'HM', 'Hungary': 'HU',
        'Holy See (Vatican City State)': 'VA', 'Honduras': 'HN', 'Hong Kong': 'HK',
        'Iceland': 'IS', 'India': 'IN', 'Indonesia': 'ID', 'Iran, Islamic Republic of': 'IR',
        'Iraq': 'IQ', 'Ireland': 'IE', 'Israel': 'IL', 'Italy': 'IT',
        'Jamaica': 'JM', 'Japan': 'JP', 'Jordan': 'JO',
        'Kazakhstan': 'KZ', 'Kenya': 'KE', 'Kiribati': 'KI', 'Kyrgyzstan': 'KG',
        'Korea, Democratic People\'s Republic of': 'KP', 'Korea, Republic of': 'KR',
        'Kuwait': 'KW',
        'Lao People\'s Democratic Republic': 'LA', 'Latvia': 'LV', 'Lebanon': 'LB',
        'Lesotho': 'LS', 'Liberia': 'LR', 'Libyan Arab Jamahiriya': 'LY',
        'Liechtenstein': 'LI', 'Lithuania': 'LT', 'Luxembourg': 'LU',
        'Macao': 'MO', 'Macedonia': 'MK', 'Madagascar': 'MG', 'Mali': 'ML', 'Malta': 'MT',
        'Malaysia': 'MY', 'Malawi': 'MW', 'Maldives': 'MV', 'Mauritania': 'MR',
        'Mauritius': 'MU', 'Marshall Islands': 'MH', 'Martinique': 'MQ', 'Mayotte': 'YT',
        'Mexico': 'MX', 'Micronesia, Federated States of': 'FM', 'Moldova': 'MD',
        'Monaco': 'MC',
        'Mongolia': 'MN', 'Montenegro': 'ME', 'Montserrat': 'MS', 'Morocco': 'MA',
        'Mozambique': 'MZ', 'Myanmar': 'MM',
        'Namibia': 'NA', 'Nauru': 'NR', 'Nepal': 'NP',
        'Netherlands': 'NL', 'Netherlands Antilles': 'NL', 'New Caledonia': 'NC',
        'New Zealand': 'NZ', 'Nicaragua': 'NI', 'Niger': 'NE', 'Nigeria': 'NG', 'Niue': 'NU',
        'Norfolk Island': 'NF', 'Northern Mariana Islands': 'MP', 'Norway': 'NO',
        'Oman': 'OM',
        'Pakistan': 'PK', 'Palau': 'PW', 'Palestinian Territory, Occupied': 'PS',
        'Panama': 'PA',
        'Papua New Guinea': 'PG', 'Paraguay': 'PY', 'Peru': 'PE', 'Philippines': 'PH',
        'Pitcairn': 'PN', 'Poland': 'PL', 'Portugal': 'PT', 'Puerto Rico': 'PR',
        'Qatar': 'QA',
        'Reunion': 'RE', 'Romania': 'RO', 'Russian Federation': 'RU', 'Rwanda': 'RW',
        'Saint Helena': 'SH', 'Saint Kitts and Nevis': 'KN', 'Saint Lucia': 'LC',
        'Saint Pierre and Miquelon': 'PM', 'Saint Vincent and the Grenadines': 'VC',
        'Samoa': 'WS', 'San Marino': 'SM', 'Sao Tome and Principe': 'ST', 'Saudi Arabia': 'SA',
        'Senegal': 'SN', 'Serbia': 'RS', 'Seychelles': 'SC', 'Sierra Leone': 'SL',
        'Singapore': 'SG', 'Solomon Islands': 'SB', 'Somalia': 'SO', 'South Africa': 'ZA',
        'Slovakia': 'SK', 'Slovenia': 'SI',
        'South Georgia and the South Sandwich Islands': 'GS',
        'Spain': 'ES', 'Sri Lanka': 'LK', 'Sudan': 'SD', 'Suriname': 'SR',
        'Svalbard and Jan Mayen': 'SJ', 'Syrian Arab Republic': 'SY', 'Swaziland': 'SZ',
        'Sweden': 'SE', 'Switzerland': 'CH',
        'Taiwan': 'TW', 'Tajikistan': 'TJ', 'Tanzania, United Republic of': 'TZ',
        'Thailand': 'TH', 'Timor-Leste': 'TL', 'Togo': 'TG', 'Tokelau': 'TK', 'Tonga': 'TO',
        'Trinidad and Tobago': 'TT', 'Tunisia': 'TN', 'Turkey': 'TR', 'Turkmenistan': 'TM',
        'Turks and Caicos Islands': 'TC', 'Tuvalu': 'TV',
        'Uganda': 'UG', 'Ukraine': 'UA', 'United Arab Emirates': 'AE',
        'United Kingdom': 'GB', 'United States': 'US',
        'United States Minor Outlying Islands': 'UM',
        'Uruguay': 'UY', 'Uzbekistan': 'UZ',
        'Vanuatu': 'VU', 'Venezuela': 'VE', 'Viet Nam': 'VN', 'Virgin Islands, British': 'VG',
        'Virgin Islands, U.s.': 'VI',
        'Yemen': 'YE',
        'Wallis and Futuna': 'WF', 'Western Sahara': 'EH',
        'Zambia': 'ZM', 'Zimbabwe': 'ZW'
    }

    def __init__(self, dataset_dir: str, dataset_name: str):
        """Construct the DatasetProcessorLFM360K.

        Args:
            dataset_dir: path of the dataset directory.
            dataset_name: name of the dataset (processor).
        """
        DatasetProcessorLFM.__init__(self, dataset_dir, dataset_name)
        # buffer for the user sha and artist name lists
        self.user_list = None
        self.artist_list = None
        # buffer for the artist name/musicbrainzID dataframe
        self.artist_mb_id = None

    def create_listening_events_config(self) -> Optional[DatasetTableConfig]:
        """Create the listening event table configuration.

        No listening events are available for this dataset.

        Returns:
            None.
        """
        return None

    def create_user_table_config(self) -> DatasetTableConfig:
        """Create the user table configuration.

        The base user configuration that contains the generated user ids
        and corresponding user sha. Requires the user list buffer to be filled.

        Returns:
            the configuration of the user table.
        """
        return create_dataset_table_config(
            TABLE_FILE_PREFIX + self.dataset_name + '_users.tsv.bz2',
            ['user_id'],
            ['user_sha'],
            compression='bz2',
            num_records=len(self.user_list)
        )

    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
        """Get matrix configuration processors.

        Returns:
            a list containing the user-artist-count matrix processor.
        """
        return [('user-artist-count', self.process_user_artist_matrix)]

    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
        """Get table configuration processors.

        Returns:
            the table processors of the base class followed by the artist table processor.
        """
        return DatasetProcessorLFM.get_table_configs(self) + [('artist', self.process_artist_table)]

    def load_artist_gender_json(self) -> Optional[pd.DataFrame]:
        """Load the artist gender json file.

        Returns:
            the loaded artist musicbrainzID/gender table or None when the file is missing.
        """
        try:
            gender_table = pd.read_json(
                os.path.join(self.dataset_dir, 'lfm-360-gender.json'),
                orient='index'
            )
        except FileNotFoundError:
            return None

        # the json keys end up in the index, move them to a regular column
        gender_table = gender_table.reset_index()
        return gender_table.rename(columns={'index': 'artist_mbID', 0: 'artist_gender'})

    def load_user_table(self) -> Optional[pd.DataFrame]:
        """Load the original user table.

        Changes the contents of the age and gender columns to be more user-friendly,
        and the contents of the country column to ISO 3166 Alpha-2 country codes.

        Returns:
            the loaded user table or None when the file is missing.
        """
        user_table_columns = [
            'user_sha',
            'user_gender',
            'user_age',
            'user_country',
            'user_signup'
        ]

        try:
            # load original user table
            user_table = pd.read_table(
                os.path.join(self.dataset_dir, 'usersha1-profile.tsv'),
                names=user_table_columns,
                sep='\t'
            )
        except FileNotFoundError:
            return None

        # The columns are assigned back instead of modified with inplace=True:
        # an inplace call on a selected column is chained assignment, which
        # does not update the dataframe when pandas Copy-on-Write is active.

        # mask user age not between 1-100 as NaN
        user_age = user_table['user_age']
        user_table['user_age'] = user_age.where(user_age.between(1, 100))

        # convert gender to more user-friendly names
        user_table['user_gender'] = user_table['user_gender'].replace(
            {'m': 'Male', 'f': 'Female'}
        )

        # convert country to ISO 3166 Alpha-2 country code
        user_table['user_country'] = user_table['user_country'].replace(self._COUNTRY_CODES)

        return user_table

    def process_artist_table(self) -> Optional[DatasetTableConfig]:
        """Process the artist table.

        Creates the artist table with the musicbrainzID and gender information when available.

        Returns:
            the artist table configuration or None when the artist buffers
            are not available (the user-artist matrix was not processed).
        """
        if self.artist_list is None or self.artist_mb_id is None:
            return None

        artist_key = ['artist_id']
        artist_columns = ['artist_name']

        # connect artist id to name
        artist_table = pd.DataFrame(
            list(enumerate(self.artist_list)),
            columns=artist_key + artist_columns
        )

        # merge the artist musicbrainzID on name, unknown IDs are stored as -1
        artist_table = pd.merge(artist_table, self.artist_mb_id, how='left', on='artist_name')
        artist_table['artist_mbID'] = artist_table['artist_mbID'].fillna(-1)
        artist_columns += ['artist_mbID']

        artist_gender = self.load_artist_gender_json()
        if artist_gender is not None:
            # merge artists with gender and update columns
            artist_table = pd.merge(artist_table, artist_gender, how='left', on='artist_mbID')
            artist_columns += ['artist_gender']

        # create artist table configuration
        artist_table_config = create_dataset_table_config(
            TABLE_FILE_PREFIX + self.dataset_name + '_artists.tsv.bz2',
            artist_key,
            artist_columns,
            compression='bz2',
            num_records=len(self.artist_list)
        )

        # store the generated artist table
        artist_table_config.save_table(artist_table, self.dataset_dir)

        return artist_table_config

    def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]:
        """Process the user-artist-count matrix.

        The user-item matrix is stored in a file that also contains a musicbrainzID.
        The users are hashes and the items are names, both are converted to integers
        to comply to the CSR compatible format. In addition, any rows that contain
        corrupt data are removed in the process.

        As a side effect the user list, artist list and artist name/musicbrainzID
        buffers are filled for the user and artist table processors.

        Returns:
            the matrix configuration or None when the matrix file is missing.
        """
        try:
            dataframe = pd.read_table(
                os.path.join(self.dataset_dir, 'usersha1-artmbid-artname-plays.tsv'),
                names=['user_sha', 'artist_mbID', 'artist_name', 'matrix_count']
            )
        except FileNotFoundError:
            return None

        # remove rows from a user that is not a hash; copy so that the column
        # assignments below operate on an independent frame instead of a slice
        dataframe = dataframe[dataframe['user_sha'] != 'sep 20, 2008'].copy()

        # map users/items to category and ratings to be floating-point
        dataframe['user_sha'] = dataframe['user_sha'].astype("category")
        dataframe['artist_name'] = dataframe['artist_name'].astype("category")
        dataframe['matrix_count'] = dataframe['matrix_count'].astype(float)

        # remove rows that contain items that failed to map to category
        dataframe = dataframe[dataframe['artist_name'].cat.codes >= 0]
        # remove rows that have unusable ratings
        dataframe = dataframe[dataframe['matrix_count'] > 0]

        dataframe = dataframe.drop_duplicates(subset=['user_sha', 'artist_name'])

        # extract user/item indirection arrays
        self.user_list = list(dataframe['user_sha'].cat.categories)
        self.artist_list = list(dataframe['artist_name'].cat.categories)

        # extract artist name/musicbrainzID combinations
        artist_mb_id = dataframe[['artist_name', 'artist_mbID']]
        # remove duplicates combinations
        artist_mb_id = artist_mb_id.drop_duplicates().dropna()
        # remove duplicates where the artist has more than one musicbrainzID
        self.artist_mb_id = artist_mb_id.drop_duplicates(subset='artist_name')

        # add the correct user/item integers
        dataframe['user_id'] = dataframe['user_sha'].cat.codes.copy()
        dataframe['artist_id'] = dataframe['artist_name'].cat.codes.copy()

        # create matrix by removing other columns
        user_artist_matrix = dataframe[['user_id', 'artist_id', 'matrix_count']]
        user_artist_matrix_table_config = create_dataset_table_config(
            TABLE_FILE_PREFIX + self.dataset_name + '_user-artist-count_matrix.tsv.bz2',
            ['user_id', 'artist_id'],
            ['matrix_count'],
            compression='bz2',
            foreign_keys=['user_id', 'artist_id']
        )

        # store the resulting matrix
        user_artist_matrix_table_config.save_table(user_artist_matrix, self.dataset_dir)

        return self.process_matrix(user_artist_matrix_table_config)

    def process_user_table(self) -> Optional[DatasetTableConfig]:
        """Process the user table.

        Extends the original user table with unique user ids.

        Returns:
            the user table configuration or None when the user list buffer
            is not available (the user-artist matrix was not processed).
        """
        if self.user_list is None:
            return None

        user_table_config = self.create_user_table_config()
        # connect user id to sha
        user_sha_ids = pd.DataFrame(
            list(enumerate(self.user_list)),
            columns=['user_id', 'user_sha']
        )

        # load original user table and when available, add it to user id/sha
        user_table = self.load_user_table()
        if user_table is None:
            user_table = user_sha_ids
        else:
            # every column except the leading user sha extends the configuration
            user_table_config.columns += list(user_table.columns[1:])

            # join user table with user ids
            user_table = pd.merge(user_sha_ids, user_table, how='left', on='user_sha')

            # fill unknown user age with -1 and cast back to int
            user_table['user_age'] = user_table['user_age'].fillna(-1.0).astype(int)

        # store the generated user table
        user_table_config.save_table(user_table, self.dataset_dir)

        return user_table_config
DatasetProcessor for the LastFM-360K dataset.
The dataset can be downloaded from the website below. https://www.upf.edu/web/mtg/lastfm360k
The enriched artist gender information can be retrieved from: https://zenodo.org/record/3748787#.YowEBqhByUk
The processor handles the following files:
usersha1-artmbid-artname-plays.tsv (required) usersha1-profile.tsv (optional) lfm-360-gender.json (optional)
def __init__(self, dataset_dir: str, dataset_name: str):
    """Construct the DatasetProcessorLFM360K.

    Args:
        dataset_dir: path of the dataset directory.
        dataset_name: name of the dataset (processor).
    """
    DatasetProcessorLFM.__init__(self, dataset_dir, dataset_name)
    # buffer for the user sha and artist name lists,
    # both start out empty (None)
    self.user_list = None
    self.artist_list = None
    # buffer for the artist name/musicbrainzID dataframe
    self.artist_mb_id = None
Construct the DatasetProcessorLFM360K.
Args: dataset_dir: path of the dataset directory. dataset_name: name of the dataset (processor).
53 def create_listening_events_config(self) -> Optional[DatasetTableConfig]: 54 """Create the listening event table configuration. 55 56 No listening events are available for this dataset. 57 58 Returns: 59 None. 60 """ 61 return None
Create the listening event table configuration.
No listening events are available for this dataset.
Returns: None.
def create_user_table_config(self) -> DatasetTableConfig:
    """Create the user table configuration.

    Describes the base user table: the generated user id as key and
    the corresponding user sha as column, one record per buffered user.

    Returns:
        the configuration of the user table.
    """
    table_file_name = TABLE_FILE_PREFIX + self.dataset_name + '_users.tsv.bz2'
    user_count = len(self.user_list)

    return create_dataset_table_config(
        table_file_name,
        ['user_id'],
        ['user_sha'],
        compression='bz2',
        num_records=user_count
    )
Create the user table configuration.
The base user configuration that contains the generated user ids and corresponding user sha.
Returns: the configuration of the user table.
def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
    """Get matrix configuration processors.

    Returns:
        a list with a single (name, processor) pair for the user-artist-count matrix.
    """
    matrix_processors = [('user-artist-count', self.process_user_artist_matrix)]
    return matrix_processors
Get matrix configuration processors.
Returns: a list containing the user-artist-count matrix processor.
def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
    """Get table configuration processors.

    Returns:
        the (name, processor) pairs of the base LFM processor, followed by
        the pair for the artist table.
    """
    base_processors = DatasetProcessorLFM.get_table_configs(self)
    artist_processor = ('artist', self.process_artist_table)
    return base_processors + [artist_processor]
Get table configuration processors.
Returns: a list containing the artist and user table processors.
def load_artist_gender_json(self) -> Optional[pd.DataFrame]:
    """Load the artist gender json file.

    Reads 'lfm-360-gender.json' from the dataset directory with the json keys
    as index, then turns that index into the 'artist_mbID' column and names
    the value column 'artist_gender'.

    Returns:
        the loaded artist musicbrainzID/gender table or None when the file is missing.
    """
    gender_path = os.path.join(self.dataset_dir, 'lfm-360-gender.json')
    try:
        indexed_genders = pd.read_json(gender_path, orient='index')
    except FileNotFoundError:
        return None

    column_names = {'index': 'artist_mbID', 0: 'artist_gender'}
    return indexed_genders.reset_index().rename(columns=column_names)
Load the artist gender json file.
Returns: the loaded artist musicbrainzID/gender table or None on failure.
def load_user_table(self) -> Optional[pd.DataFrame]:
    """Load the original user table.

    Reads 'usersha1-profile.tsv' from the dataset directory and makes the
    contents more user-friendly: ages outside 1-100 become NaN, the gender
    abbreviations are spelled out and the country names are converted to
    ISO 3166 Alpha-2 country codes. Values without a conversion are kept as is.

    Returns:
        the loaded user table or None when the file is missing.
    """
    user_table_columns = [
        'user_sha',
        'user_gender',
        'user_age',
        'user_country',
        'user_signup'
    ]

    try:
        # load original user table
        user_table = pd.read_table(
            os.path.join(self.dataset_dir, 'usersha1-profile.tsv'),
            names=user_table_columns,
            sep='\t'
        )
    except FileNotFoundError:
        return None

    # NOTE(review): 'Netherlands Antilles' maps to 'NL' rather than its former
    # ISO code 'AN' -- confirm this is intended.
    country_codes = {
        'Afghanistan': 'AF', 'Albania': 'AL', 'Algeria': 'DZ', 'American Samoa': 'AS',
        'Andorra': 'AD', 'Angola': 'AO', 'Anguilla': 'AI', 'Antarctica': 'AQ',
        'Antigua and Barbuda': 'AG', 'Argentina': 'AR', 'Armenia': 'AM', 'Aruba': 'AW',
        'Australia': 'AU', 'Austria': 'AT', 'Azerbaijan': 'AZ',
        'Bahamas': 'BS', 'Bahrain': 'BH', 'Bangladesh': 'BD', 'Barbados': 'BB',
        'Belarus': 'BY', 'Belgium': 'BE', 'Belize': 'BZ', 'Benin': 'BJ', 'Bermuda': 'BM',
        'Bhutan': 'BT', 'Bolivia': 'BO', 'Bosnia and Herzegovina': 'BA', 'Botswana': 'BW',
        'Bouvet Island': 'BV', 'Brazil': 'BR', 'British Indian Ocean Territory': 'IO',
        'Brunei Darussalam': 'BN', 'Bulgaria': 'BG', 'Burkina Faso': 'BF', 'Burundi': 'BI',
        'Cambodia': 'KH', 'Cameroon': 'CM', 'Canada': 'CA', 'Cape Verde': 'CV',
        'Cayman Islands': 'KY', 'Central African Republic': 'CF', 'Chad': 'TD', 'Chile': 'CL',
        'China': 'CN', 'Christmas Island': 'CX', 'Cocos (Keeling) Islands': 'CC',
        'Colombia': 'CO', 'Comoros': 'KM', 'Congo': 'CG', 'Czech Republic': 'CZ',
        'Congo, the Democratic Republic of the': 'CD', 'Cook Islands': 'CK', 'Cyprus': 'CY',
        'Costa Rica': 'CR', 'Cote D\'Ivoire': 'CI', 'Croatia': 'HR', 'Cuba': 'CU',
        'Denmark': 'DK', 'Djibouti': 'DJ', 'Dominica': 'DM', 'Dominican Republic': 'DO',
        'Ecuador': 'EC', 'Egypt': 'EG', 'El Salvador': 'SV', 'Equatorial Guinea': 'GQ',
        'Eritrea': 'ER', 'Estonia': 'EE', 'Ethiopia': 'ET',
        'Falkland Islands (Malvinas)': 'FK', 'Faroe Islands': 'FO', 'Fiji': 'FJ',
        'Finland': 'FI', 'France': 'FR', 'French Guiana': 'GF', 'French Polynesia': 'PF',
        'French Southern Territories': 'TF',
        'Gabon': 'GA', 'Gambia': 'GM', 'Georgia': 'GE', 'Germany': 'DE', 'Ghana': 'GH',
        'Grenada': 'GD', 'Gibraltar': 'GI', 'Greece': 'GR', 'Greenland': 'GL',
        'Guadeloupe': 'GP', 'Guam': 'GU', 'Guatemala': 'GT', 'Guinea-Bissau': 'GW',
        'Guyana': 'GY',
        'Haiti': 'HT', 'Heard Island and Mcdonald Islands': 'HM', 'Hungary': 'HU',
        'Holy See (Vatican City State)': 'VA', 'Honduras': 'HN', 'Hong Kong': 'HK',
        'Iceland': 'IS', 'India': 'IN', 'Indonesia': 'ID', 'Iran, Islamic Republic of': 'IR',
        'Iraq': 'IQ', 'Ireland': 'IE', 'Israel': 'IL', 'Italy': 'IT',
        'Jamaica': 'JM', 'Japan': 'JP', 'Jordan': 'JO',
        'Kazakhstan': 'KZ', 'Kenya': 'KE', 'Kiribati': 'KI', 'Kyrgyzstan': 'KG',
        'Korea, Democratic People\'s Republic of': 'KP', 'Korea, Republic of': 'KR',
        'Kuwait': 'KW',
        'Lao People\'s Democratic Republic': 'LA', 'Latvia': 'LV', 'Lebanon': 'LB',
        'Lesotho': 'LS', 'Liberia': 'LR', 'Libyan Arab Jamahiriya': 'LY',
        'Liechtenstein': 'LI', 'Lithuania': 'LT', 'Luxembourg': 'LU',
        'Macao': 'MO', 'Macedonia': 'MK', 'Madagascar': 'MG', 'Mali': 'ML', 'Malta': 'MT',
        'Malaysia': 'MY', 'Malawi': 'MW', 'Maldives': 'MV', 'Mauritania': 'MR',
        'Mauritius': 'MU', 'Marshall Islands': 'MH', 'Martinique': 'MQ', 'Mayotte': 'YT',
        'Mexico': 'MX', 'Micronesia, Federated States of': 'FM', 'Moldova': 'MD',
        'Monaco': 'MC',
        'Mongolia': 'MN', 'Montenegro': 'ME', 'Montserrat': 'MS', 'Morocco': 'MA',
        'Mozambique': 'MZ', 'Myanmar': 'MM',
        'Namibia': 'NA', 'Nauru': 'NR', 'Nepal': 'NP',
        'Netherlands': 'NL', 'Netherlands Antilles': 'NL', 'New Caledonia': 'NC',
        'New Zealand': 'NZ', 'Nicaragua': 'NI', 'Niger': 'NE', 'Nigeria': 'NG', 'Niue': 'NU',
        'Norfolk Island': 'NF', 'Northern Mariana Islands': 'MP', 'Norway': 'NO',
        'Oman': 'OM',
        'Pakistan': 'PK', 'Palau': 'PW', 'Palestinian Territory, Occupied': 'PS',
        'Panama': 'PA',
        'Papua New Guinea': 'PG', 'Paraguay': 'PY', 'Peru': 'PE', 'Philippines': 'PH',
        'Pitcairn': 'PN', 'Poland': 'PL', 'Portugal': 'PT', 'Puerto Rico': 'PR',
        'Qatar': 'QA',
        'Reunion': 'RE', 'Romania': 'RO', 'Russian Federation': 'RU', 'Rwanda': 'RW',
        'Saint Helena': 'SH', 'Saint Kitts and Nevis': 'KN', 'Saint Lucia': 'LC',
        'Saint Pierre and Miquelon': 'PM', 'Saint Vincent and the Grenadines': 'VC',
        'Samoa': 'WS', 'San Marino': 'SM', 'Sao Tome and Principe': 'ST', 'Saudi Arabia': 'SA',
        'Senegal': 'SN', 'Serbia': 'RS', 'Seychelles': 'SC', 'Sierra Leone': 'SL',
        'Singapore': 'SG', 'Solomon Islands': 'SB', 'Somalia': 'SO', 'South Africa': 'ZA',
        'Slovakia': 'SK', 'Slovenia': 'SI',
        'South Georgia and the South Sandwich Islands': 'GS',
        'Spain': 'ES', 'Sri Lanka': 'LK', 'Sudan': 'SD', 'Suriname': 'SR',
        'Svalbard and Jan Mayen': 'SJ', 'Syrian Arab Republic': 'SY', 'Swaziland': 'SZ',
        'Sweden': 'SE', 'Switzerland': 'CH',
        'Taiwan': 'TW', 'Tajikistan': 'TJ', 'Tanzania, United Republic of': 'TZ',
        'Thailand': 'TH', 'Timor-Leste': 'TL', 'Togo': 'TG', 'Tokelau': 'TK', 'Tonga': 'TO',
        'Trinidad and Tobago': 'TT', 'Tunisia': 'TN', 'Turkey': 'TR', 'Turkmenistan': 'TM',
        'Turks and Caicos Islands': 'TC', 'Tuvalu': 'TV',
        'Uganda': 'UG', 'Ukraine': 'UA', 'United Arab Emirates': 'AE',
        'United Kingdom': 'GB', 'United States': 'US',
        'United States Minor Outlying Islands': 'UM',
        'Uruguay': 'UY', 'Uzbekistan': 'UZ',
        'Vanuatu': 'VU', 'Venezuela': 'VE', 'Viet Nam': 'VN', 'Virgin Islands, British': 'VG',
        'Virgin Islands, U.s.': 'VI',
        'Yemen': 'YE',
        'Wallis and Futuna': 'WF', 'Western Sahara': 'EH',
        'Zambia': 'ZM', 'Zimbabwe': 'ZW'
    }

    # Every conversion below assigns its result back to the frame: an in-place
    # method called on a selected column does not update the frame when pandas
    # runs with Copy-on-Write.

    # mask user age not between 1-100 as NaN
    user_age = user_table['user_age']
    user_table['user_age'] = user_age.mask(user_age.lt(1) | user_age.gt(100))

    # convert gender to more user-friendly names
    user_table['user_gender'] = user_table['user_gender'].replace(
        {'m': 'Male', 'f': 'Female'}
    )

    # convert country to ISO 3166 Alpha-2 country code
    user_table['user_country'] = user_table['user_country'].replace(country_codes)

    return user_table
Load the original user table.
Changes the contents of the age and gender columns to be more user-friendly, and the contents of the country column to ISO 3166 Alpha-2 country codes.
Returns: the loaded user table or None on failure.
def process_artist_table(self) -> Optional[DatasetTableConfig]:
    """Process the artist table.

    Creates the artist table from the buffered artist list, extended with the
    musicbrainzID and, when the gender json file is available, the artist gender.
    Artists without a known musicbrainzID get -1 instead.

    Returns:
        the artist table configuration.
    """
    artist_key = ['artist_id']
    artist_columns = ['artist_name']

    # connect artist id to name
    artist_table = pd.DataFrame(
        list(enumerate(self.artist_list)),
        columns=artist_key + artist_columns
    )

    # merge the artist musicbrainzID on name
    artist_table = pd.merge(artist_table, self.artist_mb_id, how='left', on='artist_name')
    # assign the filled column back: an in-place fillna on a selected column
    # does not update the frame when pandas runs with Copy-on-Write
    artist_table['artist_mbID'] = artist_table['artist_mbID'].fillna(-1)
    artist_columns += ['artist_mbID']

    artist_gender = self.load_artist_gender_json()
    if artist_gender is not None:
        # merge artists with gender and update columns
        artist_table = pd.merge(artist_table, artist_gender, how='left', on='artist_mbID')
        artist_columns += ['artist_gender']

    # create artist table configuration
    artist_table_config = create_dataset_table_config(
        TABLE_FILE_PREFIX + self.dataset_name + '_artists.tsv.bz2',
        artist_key,
        artist_columns,
        compression='bz2',
        num_records=len(self.artist_list)
    )

    # store the generated artist table
    artist_table_config.save_table(artist_table, self.dataset_dir)

    return artist_table_config
Process the artist table.
Creates the artist table with the musicbrainzID and gender information when available.
Returns: the artist table configuration or None on failure.
def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]:
    """Process the user-artist-count matrix.

    The matrix file holds a user sha, artist musicbrainzID, artist name and
    play count per row. User hashes and artist names are mapped to integer
    codes to comply with the CSR compatible format, and rows with corrupt data
    are dropped along the way. The user list, artist list and artist
    name/musicbrainzID combinations are buffered on the processor.

    Returns:
        the matrix configuration or None when the matrix file is missing.
    """
    matrix_path = os.path.join(self.dataset_dir, 'usersha1-artmbid-artname-plays.tsv')
    try:
        plays = pd.read_table(
            matrix_path,
            names=['user_sha', 'artist_mbID', 'artist_name', 'matrix_count']
        )
    except FileNotFoundError:
        return None

    # drop the rows whose user field is 'sep 20, 2008' rather than a hash
    plays = plays[plays['user_sha'] != 'sep 20, 2008']

    # users/items become categories and the ratings floating-point
    for column in ('user_sha', 'artist_name'):
        plays[column] = plays[column].astype("category")
    plays['matrix_count'] = plays['matrix_count'].astype(float)

    # keep rows whose item mapped to a category and whose rating is usable
    has_artist = plays['artist_name'].cat.codes >= 0
    has_count = plays['matrix_count'] > 0
    plays = plays[has_artist & has_count]

    plays = plays.drop_duplicates(subset=['user_sha', 'artist_name'])

    # the category order defines the user/item indirection arrays
    self.user_list = list(plays['user_sha'].cat.categories)
    self.artist_list = list(plays['artist_name'].cat.categories)

    # unique artist name/musicbrainzID combinations without missing values,
    # keeping one musicbrainzID for artists that have more than one
    self.artist_mb_id = (
        plays[['artist_name', 'artist_mbID']]
        .drop_duplicates()
        .dropna()
        .drop_duplicates(subset='artist_name')
    )

    # add the correct user/item integers
    plays['user_id'] = plays['user_sha'].cat.codes.copy()
    plays['artist_id'] = plays['artist_name'].cat.codes.copy()

    # the matrix consists of the integer ids and the count only
    user_artist_matrix = plays[['user_id', 'artist_id', 'matrix_count']]
    matrix_table_config = create_dataset_table_config(
        TABLE_FILE_PREFIX + self.dataset_name + '_user-artist-count_matrix.tsv.bz2',
        ['user_id', 'artist_id'],
        ['matrix_count'],
        compression='bz2',
        foreign_keys=['user_id', 'artist_id']
    )

    # store the resulting matrix
    matrix_table_config.save_table(user_artist_matrix, self.dataset_dir)

    return self.process_matrix(matrix_table_config)
Process the user-artist-count matrix.
The user-item matrix is stored in a file that also contains a musicbrainzID. The users are hashes and the items are names; both are converted to integers to comply with the CSR compatible format. In addition, any rows that contain corrupt data are removed in the process.
Returns: the matrix configuration or None on failure.
def process_user_table(self) -> Optional[DatasetTableConfig]:
    """Process the user table.

    Connects the buffered user sha list to generated user ids. When the
    original user table is available, its columns are joined onto those ids
    and unknown user ages are stored as -1.

    Returns:
        the user table configuration.
    """
    user_table_config = self.create_user_table_config()
    # connect user id to sha
    user_sha_ids = pd.DataFrame(
        list(enumerate(self.user_list)),
        columns=['user_id', 'user_sha']
    )

    # load original user table and when available, add it to user id/sha
    user_table = self.load_user_table()
    if user_table is None:
        user_table = user_sha_ids
    else:
        # every column after the leading user sha is new to the configuration
        user_table_config.columns += list(user_table.columns[1:])

        # join user table with user ids
        user_table = pd.merge(user_sha_ids, user_table, how='left', on='user_sha')

        # fill unknown user age with -1 and cast back to int; the result is
        # assigned back because an in-place fillna on a selected column does not
        # update the frame when pandas runs with Copy-on-Write
        user_table['user_age'] = user_table['user_age'].fillna(-1.0).astype(int)

    # store the generated user table
    user_table_config.save_table(user_table, self.dataset_dir)

    return user_table_config
Process the user table.
Extends the original user table with unique user ids.
Returns: the user table configuration or None on failure.