src.fairreckitlib.data.set.processor.dataset_processor_lfm360k

This module contains the class to process the LastFM-360K dataset.

Classes:

DatasetProcessorLFM360K: data processor implementation for the LFM-360K dataset.

This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)

  1"""This module contains the class to process the LastFM-360K dataset.
  2
  3Classes:
  4
  5    DatasetProcessorLFM360K: data processor implementation for the LFM-360K dataset.
  6
  7This program has been developed by students from the bachelor Computer Science at
  8Utrecht University within the Software Project course.
  9© Copyright Utrecht University (Department of Information and Computing Sciences)
 10"""
 11
 12import os
 13from typing import Callable, List, Optional, Tuple
 14
 15import pandas as pd
 16
 17from ..dataset_config import create_dataset_table_config, DatasetMatrixConfig, DatasetTableConfig
 18from ..dataset_constants import TABLE_FILE_PREFIX
 19from .dataset_processor_lfm import DatasetProcessorLFM
 20
 21
 22class DatasetProcessorLFM360K(DatasetProcessorLFM):
 23    """DatasetProcessor for the LastFM-360K dataset.
 24
 25    The dataset can be downloaded from the website below.
 26    https://www.upf.edu/web/mtg/lastfm360k
 27
 28    The enriched artist gender information can be retrieved from:
 29    https://zenodo.org/record/3748787#.YowEBqhByUk
 30
 31    The processor handles the following files:
 32
 33    usersha1-artmbid-artname-plays.tsv (required)
 34    usersha1-profile.tsv (optional)
 35    lfm-360-gender.json (optional)
 36    """
 37
 38    def __init__(self, dataset_dir: str, dataset_name: str):
 39        """Construct the DatasetProcessorLFM360K.
 40
 41        Args:
 42            dataset_name: path of the dataset directory.
 43            dataset_name: name of the dataset (processor).
 44        """
 45        DatasetProcessorLFM.__init__(self, dataset_dir, dataset_name)
 46        # buffer for the user sha and artist name lists
 47        self.user_list = None
 48        self.artist_list = None
 49        # buffer for the artist name/musicbrainzID dataframe
 50        self.artist_mb_id = None
 51
 52    def create_listening_events_config(self) -> Optional[DatasetTableConfig]:
 53        """Create the listening event table configuration.
 54
 55        No listening events are available for this dataset.
 56
 57        Returns:
 58            None.
 59        """
 60        return None
 61
 62    def create_user_table_config(self) -> DatasetTableConfig:
 63        """Create the user table configuration.
 64
 65        The base user configuration that contains the generated user ids
 66        and corresponding user sha.
 67
 68        Returns:
 69            the configuration of the user table.
 70        """
 71        return create_dataset_table_config(
 72            TABLE_FILE_PREFIX + self.dataset_name + '_users.tsv.bz2',
 73            ['user_id'],
 74            ['user_sha'],
 75            compression='bz2',
 76            num_records=len(self.user_list)
 77        )
 78
 79    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
 80        """Get matrix configuration processors.
 81
 82        Returns:
 83            a list containing the user-artist-count matrix processor.
 84        """
 85        return [('user-artist-count', self.process_user_artist_matrix)]
 86
 87    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 88        """Get table configuration processors.
 89
 90        Returns:
 91            a list containing the artist and user table processors.
 92        """
 93        return DatasetProcessorLFM.get_table_configs(self) + [('artist',self.process_artist_table)]
 94
 95    def load_artist_gender_json(self) -> Optional[pd.DataFrame]:
 96        """Load the artist gender json file.
 97
 98        Returns:
 99            the loaded artist musicbrainzID/gender table or None on failure.
100        """
101        try:
102            gender_table = pd.read_json(
103                os.path.join(self.dataset_dir, 'lfm-360-gender.json'),
104                orient='index'
105            )
106            gender_table.reset_index(inplace=True)
107            gender_table.rename(columns={'index': 'artist_mbID', 0: 'artist_gender'}, inplace=True)
108            return gender_table
109        except FileNotFoundError:
110            return None
111
112    def load_user_table(self) -> Optional[pd.DataFrame]:
113        """Load the original user table.
114
115        Changes the contents of the age and gender columns to be more user-friendly,
116        and the contents of the country column to ISO 3166 Alpha-2 country codes.
117
118        Returns:
119            the loaded user table on None on failure.
120        """
121        user_table_columns = [
122            'user_sha',
123            'user_gender',
124            'user_age',
125            'user_country',
126            'user_signup'
127        ]
128
129        try:
130            # load original user table
131            user_table = pd.read_table(
132                os.path.join(self.dataset_dir, 'usersha1-profile.tsv'),
133                names=user_table_columns,
134                sep='\t'
135            )
136        except FileNotFoundError:
137            return None
138
139        # mask user age not between 1-100 as NaN
140        user_table['user_age'].mask(user_table['user_age'].lt(1), inplace=True)
141        user_table['user_age'].mask(user_table['user_age'].gt(100), inplace=True)
142
143        # convert gender to more user-friendly names
144        user_table['user_gender'].replace({'m': 'Male', 'f': 'Female'}, inplace=True)
145
146        # convert country to ISO 3166 Alpha-2 country code
147        user_table['user_country'].replace({
148            'Afghanistan': 'AF', 'Albania': 'AL', 'Algeria': 'DZ', 'American Samoa': 'AS',
149            'Andorra': 'AD', 'Angola': 'AO', 'Anguilla': 'AI', 'Antarctica': 'AQ',
150            'Antigua and Barbuda': 'AG', 'Argentina': 'AR', 'Armenia': 'AM', 'Aruba': 'AW',
151            'Australia': 'AU', 'Austria': 'AT', 'Azerbaijan': 'AZ',
152            'Bahamas': 'BS', 'Bahrain': 'BH', 'Bangladesh': 'BD', 'Barbados': 'BB',
153            'Belarus': 'BY', 'Belgium': 'BE', 'Belize': 'BZ', 'Benin': 'BJ', 'Bermuda': 'BM',
154            'Bhutan': 'BT', 'Bolivia': 'BO', 'Bosnia and Herzegovina': 'BA', 'Botswana': 'BW',
155            'Bouvet Island': 'BV', 'Brazil': 'BR', 'British Indian Ocean Territory': 'IO',
156            'Brunei Darussalam': 'BN', 'Bulgaria': 'BG', 'Burkina Faso': 'BF', 'Burundi': 'BI',
157            'Cambodia': 'KH', 'Cameroon': 'CM', 'Canada': 'CA', 'Cape Verde': 'CV',
158            'Cayman Islands': 'KY', 'Central African Republic': 'CF', 'Chad': 'TD', 'Chile': 'CL',
159            'China': 'CN', 'Christmas Island': 'CX', 'Cocos (Keeling) Islands': 'CC',
160            'Colombia': 'CO', 'Comoros': 'KM', 'Congo': 'CG', 'Czech Republic': 'CZ',
161            'Congo, the Democratic Republic of the': 'CD', 'Cook Islands': 'CK', 'Cyprus': 'CY',
162            'Costa Rica': 'CR', 'Cote D\'Ivoire': 'CI', 'Croatia': 'HR', 'Cuba': 'CU',
163            'Denmark': 'DK', 'Djibouti': 'DJ', 'Dominica': 'DM', 'Dominican Republic': 'DO',
164            'Ecuador': 'EC', 'Egypt': 'EG', 'El Salvador': 'SV', 'Equatorial Guinea': 'GQ',
165            'Eritrea': 'ER', 'Estonia': 'EE', 'Ethiopia': 'ET',
166            'Falkland Islands (Malvinas)': 'FK', 'Faroe Islands': 'FO', 'Fiji': 'FJ',
167            'Finland': 'FI', 'France': 'FR', 'French Guiana': 'GF', 'French Polynesia': 'PF',
168            'French Southern Territories': 'TF',
169            'Gabon': 'GA', 'Gambia': 'GM', 'Georgia': 'GE', 'Germany': 'DE', 'Ghana': 'GH',
170            'Grenada': 'GD', 'Gibraltar': 'GI', 'Greece': 'GR', 'Greenland': 'GL',
171            'Guadeloupe': 'GP', 'Guam': 'GU', 'Guatemala': 'GT', 'Guinea-Bissau': 'GW',
172            'Guyana': 'GY',
173            'Haiti': 'HT', 'Heard Island and Mcdonald Islands': 'HM', 'Hungary': 'HU',
174            'Holy See (Vatican City State)': 'VA', 'Honduras': 'HN', 'Hong Kong': 'HK',
175            'Iceland': 'IS', 'India': 'IN', 'Indonesia': 'ID', 'Iran, Islamic Republic of': 'IR',
176            'Iraq': 'IQ', 'Ireland': 'IE', 'Israel': 'IL', 'Italy': 'IT',
177            'Jamaica': 'JM', 'Japan': 'JP', 'Jordan': 'JO',
178            'Kazakhstan': 'KZ', 'Kenya': 'KE', 'Kiribati': 'KI', 'Kyrgyzstan': 'KG',
179            'Korea, Democratic People\'s Republic of':'KP','Korea, Republic of':'KR','Kuwait':'KW',
180            'Lao People\'s Democratic Republic': 'LA','Latvia': 'LV','Lebanon': 'LB',
181            'Lesotho': 'LS', 'Liberia': 'LR', 'Libyan Arab Jamahiriya': 'LY',
182            'Liechtenstein': 'LI', 'Lithuania': 'LT', 'Luxembourg': 'LU',
183            'Macao': 'MO', 'Macedonia': 'MK', 'Madagascar': 'MG', 'Mali': 'ML', 'Malta': 'MT',
184            'Malaysia': 'MY', 'Malawi': 'MW', 'Maldives': 'MV', 'Mauritania': 'MR',
185            'Mauritius': 'MU', 'Marshall Islands': 'MH', 'Martinique': 'MQ', 'Mayotte': 'YT',
186            'Mexico': 'MX', 'Micronesia, Federated States of': 'FM','Moldova': 'MD','Monaco': 'MC',
187            'Mongolia': 'MN', 'Montenegro': 'ME', 'Montserrat':  'MS', 'Morocco': 'MA',
188            'Mozambique': 'MZ', 'Myanmar': 'MM',
189            'Namibia': 'NA', 'Nauru': 'NR', 'Nepal': 'NP',
190            'Netherlands': 'NL', 'Netherlands Antilles': 'NL', 'New Caledonia': 'NC',
191            'New Zealand': 'NZ', 'Nicaragua': 'NI', 'Niger': 'NE', 'Nigeria': 'NG', 'Niue': 'NU',
192            'Norfolk Island': 'NF', 'Northern Mariana Islands': 'MP', 'Norway': 'NO',
193            'Oman': 'OM',
194            'Pakistan': 'PK', 'Palau': 'PW','Palestinian Territory, Occupied': 'PS','Panama': 'PA',
195            'Papua New Guinea': 'PG', 'Paraguay': 'PY', 'Peru': 'PE', 'Philippines': 'PH',
196            'Pitcairn': 'PN', 'Poland': 'PL', 'Portugal': 'PT', 'Puerto Rico': 'PR',
197            'Qatar': 'QA',
198            'Reunion': 'RE', 'Romania': 'RO', 'Russian Federation': 'RU', 'Rwanda': 'RW',
199            'Saint Helena': 'SH', 'Saint Kitts and Nevis': 'KN', 'Saint Lucia': 'LC',
200            'Saint Pierre and Miquelon': 'PM', 'Saint Vincent and the Grenadines': 'VC',
201            'Samoa': 'WS', 'San Marino': 'SM', 'Sao Tome and Principe': 'ST', 'Saudi Arabia': 'SA',
202            'Senegal': 'SN', 'Serbia': 'RS', 'Seychelles': 'SC', 'Sierra Leone': 'SL',
203            'Singapore': 'SG', 'Solomon Islands': 'SB', 'Somalia': 'SO', 'South Africa': 'ZA',
204            'Slovakia': 'SK','Slovenia': 'SI','South Georgia and the South Sandwich Islands': 'GS',
205            'Spain': 'ES', 'Sri Lanka': 'LK', 'Sudan': 'SD', 'Suriname': 'SR',
206            'Svalbard and Jan Mayen': 'SJ', 'Syrian Arab Republic': 'SY', 'Swaziland': 'SZ',
207            'Sweden': 'SE', 'Switzerland': 'CH',
208            'Taiwan': 'TW', 'Tajikistan': 'TJ', 'Tanzania, United Republic of': 'TZ',
209            'Thailand': 'TH', 'Timor-Leste': 'TL', 'Togo': 'TG', 'Tokelau': 'TK', 'Tonga': 'TO',
210            'Trinidad and Tobago': 'TT', 'Tunisia': 'TN', 'Turkey': 'TR', 'Turkmenistan': 'TM',
211            'Turks and Caicos Islands': 'TC', 'Tuvalu': 'TV',
212            'Uganda': 'UG', 'Ukraine': 'UA', 'United Arab Emirates': 'AE',
213            'United Kingdom':'GB','United States':'US','United States Minor Outlying Islands':'UM',
214            'Uruguay': 'UY', 'Uzbekistan': 'UZ',
215            'Vanuatu': 'VU', 'Venezuela': 'VE', 'Viet Nam': 'VN', 'Virgin Islands, British': 'VG',
216            'Virgin Islands, U.s.': 'VI',
217            'Yemen': 'YE',
218            'Wallis and Futuna': 'WF', 'Western Sahara': 'EH',
219            'Zambia': 'ZM', 'Zimbabwe': 'ZW'
220        }, inplace=True)
221
222        return user_table
223
224    def process_artist_table(self) -> Optional[DatasetTableConfig]:
225        """Process the artist table.
226
227        Creates the artist table with the musicbrainzID and gender information when available.
228
229        Returns:
230            the artist table configuration or None on failure.
231        """
232        artist_key = ['artist_id']
233        artist_columns = ['artist_name']
234
235        # connect artist id to name
236        artist_table = pd.DataFrame(
237            list(enumerate(self.artist_list)),
238            columns=artist_key + artist_columns
239        )
240
241        # merge the artist musicbrainzID on name
242        artist_table = pd.merge(artist_table, self.artist_mb_id, how='left', on='artist_name')
243        artist_table['artist_mbID'].fillna(-1, inplace=True)
244        artist_columns += ['artist_mbID']
245
246        artist_gender = self.load_artist_gender_json()
247        if artist_gender is not None:
248            # merge artists with gender and update columns
249            artist_table = pd.merge(artist_table, artist_gender, how='left', on='artist_mbID')
250            artist_columns += ['artist_gender']
251
252        # create artist table configuration
253        artist_table_config = create_dataset_table_config(
254            TABLE_FILE_PREFIX + self.dataset_name + '_artists.tsv.bz2',
255            artist_key,
256            artist_columns,
257            compression='bz2',
258            num_records=len(self.artist_list)
259        )
260
261        # store the generated artist table
262        artist_table_config.save_table(artist_table, self.dataset_dir)
263
264        return artist_table_config
265
266    def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]:
267        """Process the user-artist-count matrix.
268
269        The user-item matrix is stored in a file that also contains a musicbrainzID.
270        The users are hashes and the items are names, both are converted to integers
271        to comply to the CSR compatible format. In addition, any rows that contain
272        corrupt data are removed in the process.
273
274        Returns:
275            the matrix configuration or None on failure.
276        """
277        try:
278            dataframe = pd.read_table(
279                os.path.join(self.dataset_dir, 'usersha1-artmbid-artname-plays.tsv'),
280                names=['user_sha', 'artist_mbID', 'artist_name', 'matrix_count']
281            )
282        except FileNotFoundError:
283            return None
284
285        # remove rows from a user that is not a hash
286        dataframe = dataframe[dataframe['user_sha'] != 'sep 20, 2008']
287
288        # map users/items to category and ratings to be floating-point
289        dataframe['user_sha'] = dataframe['user_sha'].astype("category")
290        dataframe['artist_name'] = dataframe['artist_name'].astype("category")
291        dataframe['matrix_count'] = dataframe['matrix_count'].astype(float)
292
293        # remove rows that contain items that failed to map to category
294        dataframe = dataframe[dataframe['artist_name'].cat.codes >= 0]
295        # remove rows that have unusable ratings
296        dataframe = dataframe[dataframe['matrix_count'] > 0]
297
298        dataframe.drop_duplicates(subset=['user_sha', 'artist_name'], inplace=True)
299
300        # extract user/item indirection arrays
301        self.user_list = list(dataframe['user_sha'].cat.categories)
302        self.artist_list = list(dataframe['artist_name'].cat.categories)
303
304        # extract artist name/musicbrainzID combinations
305        self.artist_mb_id = dataframe[['artist_name', 'artist_mbID']]
306        # remove duplicates combinations
307        self.artist_mb_id = self.artist_mb_id.drop_duplicates().dropna()
308        # remove duplicates where the artist has more than one musicbrainzID
309        self.artist_mb_id = self.artist_mb_id.drop_duplicates(subset='artist_name')
310
311        # add the correct user/item integers
312        dataframe['user_id'] = dataframe['user_sha'].cat.codes.copy()
313        dataframe['artist_id'] = dataframe['artist_name'].cat.codes.copy()
314
315        # create matrix by removing other columns
316        user_artist_matrix = dataframe[['user_id', 'artist_id', 'matrix_count']]
317        user_artist_matrix_table_config = create_dataset_table_config(
318            TABLE_FILE_PREFIX + self.dataset_name + '_user-artist-count_matrix.tsv.bz2',
319            ['user_id', 'artist_id'],
320            ['matrix_count'],
321            compression='bz2',
322            foreign_keys=['user_id', 'artist_id']
323        )
324
325        # store the resulting matrix
326        user_artist_matrix_table_config.save_table(user_artist_matrix, self.dataset_dir)
327
328        return self.process_matrix(user_artist_matrix_table_config)
329
330    def process_user_table(self) -> Optional[DatasetTableConfig]:
331        """Process the user table.
332
333        Extends the original user table with unique user ids.
334
335        Returns:
336            the user table configuration or None on failure.
337        """
338        user_table_config = self.create_user_table_config()
339        # connect user id to sha
340        user_sha_ids = pd.DataFrame(
341            list(enumerate(self.user_list)),
342            columns=['user_id', 'user_sha']
343        )
344
345        # load original user table and when available, add it to user id/sha
346        user_table = self.load_user_table()
347        if user_table is None:
348            user_table = user_sha_ids
349        else:
350            for i in range(1, len(user_table.columns)):
351                user_table_config.columns += [user_table.columns[i]]
352
353            # join user table with user ids
354            user_table = pd.merge(user_sha_ids, user_table, how='left', on='user_sha')
355
356            # fill unknown user age with -1 and cast back to int
357            user_table['user_age'].fillna(-1.0, inplace=True)
358            user_table['user_age'] = user_table['user_age'].astype(int)
359
360        # store the generated user table
361        user_table_config.save_table(user_table, self.dataset_dir)
362
363        return user_table_config
 23class DatasetProcessorLFM360K(DatasetProcessorLFM):
 24    """DatasetProcessor for the LastFM-360K dataset.
 25
 26    The dataset can be downloaded from the website below.
 27    https://www.upf.edu/web/mtg/lastfm360k
 28
 29    The enriched artist gender information can be retrieved from:
 30    https://zenodo.org/record/3748787#.YowEBqhByUk
 31
 32    The processor handles the following files:
 33
 34    usersha1-artmbid-artname-plays.tsv (required)
 35    usersha1-profile.tsv (optional)
 36    lfm-360-gender.json (optional)
 37    """
 38
 39    def __init__(self, dataset_dir: str, dataset_name: str):
 40        """Construct the DatasetProcessorLFM360K.
 41
 42        Args:
 43            dataset_dir: path of the dataset directory.
 44            dataset_name: name of the dataset (processor).
 45        """
 46        DatasetProcessorLFM.__init__(self, dataset_dir, dataset_name)
 47        # buffer for the user sha and artist name lists
 48        self.user_list = None
 49        self.artist_list = None
 50        # buffer for the artist name/musicbrainzID dataframe
 51        self.artist_mb_id = None
 52
 53    def create_listening_events_config(self) -> Optional[DatasetTableConfig]:
 54        """Create the listening event table configuration.
 55
 56        No listening events are available for this dataset.
 57
 58        Returns:
 59            None.
 60        """
 61        return None
 62
 63    def create_user_table_config(self) -> DatasetTableConfig:
 64        """Create the user table configuration.
 65
 66        The base user configuration that contains the generated user ids
 67        and corresponding user sha.
 68
 69        Returns:
 70            the configuration of the user table.
 71        """
 72        return create_dataset_table_config(
 73            TABLE_FILE_PREFIX + self.dataset_name + '_users.tsv.bz2',
 74            ['user_id'],
 75            ['user_sha'],
 76            compression='bz2',
 77            num_records=len(self.user_list)
 78        )
 79
 80    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
 81        """Get matrix configuration processors.
 82
 83        Returns:
 84            a list containing the user-artist-count matrix processor.
 85        """
 86        return [('user-artist-count', self.process_user_artist_matrix)]
 87
 88    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 89        """Get table configuration processors.
 90
 91        Returns:
 92            a list containing the artist and user table processors.
 93        """
 94        return DatasetProcessorLFM.get_table_configs(self) + [('artist',self.process_artist_table)]
 95
 96    def load_artist_gender_json(self) -> Optional[pd.DataFrame]:
 97        """Load the artist gender json file.
 98
 99        Returns:
100            the loaded artist musicbrainzID/gender table or None on failure.
101        """
102        try:
103            gender_table = pd.read_json(
104                os.path.join(self.dataset_dir, 'lfm-360-gender.json'),
105                orient='index'
106            )
107            gender_table.reset_index(inplace=True)
108            gender_table.rename(columns={'index': 'artist_mbID', 0: 'artist_gender'}, inplace=True)
109            return gender_table
110        except FileNotFoundError:
111            return None
112
113    def load_user_table(self) -> Optional[pd.DataFrame]:
114        """Load the original user table.
115
116        Changes the contents of the age and gender columns to be more user-friendly,
117        and the contents of the country column to ISO 3166 Alpha-2 country codes.
118
119        Returns:
120            the loaded user table or None on failure.
121        """
122        user_table_columns = [
123            'user_sha',
124            'user_gender',
125            'user_age',
126            'user_country',
127            'user_signup'
128        ]
129
130        try:
131            # load original user table
132            user_table = pd.read_table(
133                os.path.join(self.dataset_dir, 'usersha1-profile.tsv'),
134                names=user_table_columns,
135                sep='\t'
136            )
137        except FileNotFoundError:
138            return None
139
140        # mask user age not between 1-100 as NaN
141        user_table['user_age'].mask(user_table['user_age'].lt(1), inplace=True)
142        user_table['user_age'].mask(user_table['user_age'].gt(100), inplace=True)
143
144        # convert gender to more user-friendly names
145        user_table['user_gender'].replace({'m': 'Male', 'f': 'Female'}, inplace=True)
146
147        # convert country to ISO 3166 Alpha-2 country code
148        user_table['user_country'].replace({
149            'Afghanistan': 'AF', 'Albania': 'AL', 'Algeria': 'DZ', 'American Samoa': 'AS',
150            'Andorra': 'AD', 'Angola': 'AO', 'Anguilla': 'AI', 'Antarctica': 'AQ',
151            'Antigua and Barbuda': 'AG', 'Argentina': 'AR', 'Armenia': 'AM', 'Aruba': 'AW',
152            'Australia': 'AU', 'Austria': 'AT', 'Azerbaijan': 'AZ',
153            'Bahamas': 'BS', 'Bahrain': 'BH', 'Bangladesh': 'BD', 'Barbados': 'BB',
154            'Belarus': 'BY', 'Belgium': 'BE', 'Belize': 'BZ', 'Benin': 'BJ', 'Bermuda': 'BM',
155            'Bhutan': 'BT', 'Bolivia': 'BO', 'Bosnia and Herzegovina': 'BA', 'Botswana': 'BW',
156            'Bouvet Island': 'BV', 'Brazil': 'BR', 'British Indian Ocean Territory': 'IO',
157            'Brunei Darussalam': 'BN', 'Bulgaria': 'BG', 'Burkina Faso': 'BF', 'Burundi': 'BI',
158            'Cambodia': 'KH', 'Cameroon': 'CM', 'Canada': 'CA', 'Cape Verde': 'CV',
159            'Cayman Islands': 'KY', 'Central African Republic': 'CF', 'Chad': 'TD', 'Chile': 'CL',
160            'China': 'CN', 'Christmas Island': 'CX', 'Cocos (Keeling) Islands': 'CC',
161            'Colombia': 'CO', 'Comoros': 'KM', 'Congo': 'CG', 'Czech Republic': 'CZ',
162            'Congo, the Democratic Republic of the': 'CD', 'Cook Islands': 'CK', 'Cyprus': 'CY',
163            'Costa Rica': 'CR', 'Cote D\'Ivoire': 'CI', 'Croatia': 'HR', 'Cuba': 'CU',
164            'Denmark': 'DK', 'Djibouti': 'DJ', 'Dominica': 'DM', 'Dominican Republic': 'DO',
165            'Ecuador': 'EC', 'Egypt': 'EG', 'El Salvador': 'SV', 'Equatorial Guinea': 'GQ',
166            'Eritrea': 'ER', 'Estonia': 'EE', 'Ethiopia': 'ET',
167            'Falkland Islands (Malvinas)': 'FK', 'Faroe Islands': 'FO', 'Fiji': 'FJ',
168            'Finland': 'FI', 'France': 'FR', 'French Guiana': 'GF', 'French Polynesia': 'PF',
169            'French Southern Territories': 'TF',
170            'Gabon': 'GA', 'Gambia': 'GM', 'Georgia': 'GE', 'Germany': 'DE', 'Ghana': 'GH',
171            'Grenada': 'GD', 'Gibraltar': 'GI', 'Greece': 'GR', 'Greenland': 'GL',
172            'Guadeloupe': 'GP', 'Guam': 'GU', 'Guatemala': 'GT', 'Guinea-Bissau': 'GW',
173            'Guyana': 'GY',
174            'Haiti': 'HT', 'Heard Island and Mcdonald Islands': 'HM', 'Hungary': 'HU',
175            'Holy See (Vatican City State)': 'VA', 'Honduras': 'HN', 'Hong Kong': 'HK',
176            'Iceland': 'IS', 'India': 'IN', 'Indonesia': 'ID', 'Iran, Islamic Republic of': 'IR',
177            'Iraq': 'IQ', 'Ireland': 'IE', 'Israel': 'IL', 'Italy': 'IT',
178            'Jamaica': 'JM', 'Japan': 'JP', 'Jordan': 'JO',
179            'Kazakhstan': 'KZ', 'Kenya': 'KE', 'Kiribati': 'KI', 'Kyrgyzstan': 'KG',
180            'Korea, Democratic People\'s Republic of':'KP','Korea, Republic of':'KR','Kuwait':'KW',
181            'Lao People\'s Democratic Republic': 'LA','Latvia': 'LV','Lebanon': 'LB',
182            'Lesotho': 'LS', 'Liberia': 'LR', 'Libyan Arab Jamahiriya': 'LY',
183            'Liechtenstein': 'LI', 'Lithuania': 'LT', 'Luxembourg': 'LU',
184            'Macao': 'MO', 'Macedonia': 'MK', 'Madagascar': 'MG', 'Mali': 'ML', 'Malta': 'MT',
185            'Malaysia': 'MY', 'Malawi': 'MW', 'Maldives': 'MV', 'Mauritania': 'MR',
186            'Mauritius': 'MU', 'Marshall Islands': 'MH', 'Martinique': 'MQ', 'Mayotte': 'YT',
187            'Mexico': 'MX', 'Micronesia, Federated States of': 'FM','Moldova': 'MD','Monaco': 'MC',
188            'Mongolia': 'MN', 'Montenegro': 'ME', 'Montserrat':  'MS', 'Morocco': 'MA',
189            'Mozambique': 'MZ', 'Myanmar': 'MM',
190            'Namibia': 'NA', 'Nauru': 'NR', 'Nepal': 'NP',
191            'Netherlands': 'NL', 'Netherlands Antilles': 'NL', 'New Caledonia': 'NC',
192            'New Zealand': 'NZ', 'Nicaragua': 'NI', 'Niger': 'NE', 'Nigeria': 'NG', 'Niue': 'NU',
193            'Norfolk Island': 'NF', 'Northern Mariana Islands': 'MP', 'Norway': 'NO',
194            'Oman': 'OM',
195            'Pakistan': 'PK', 'Palau': 'PW','Palestinian Territory, Occupied': 'PS','Panama': 'PA',
196            'Papua New Guinea': 'PG', 'Paraguay': 'PY', 'Peru': 'PE', 'Philippines': 'PH',
197            'Pitcairn': 'PN', 'Poland': 'PL', 'Portugal': 'PT', 'Puerto Rico': 'PR',
198            'Qatar': 'QA',
199            'Reunion': 'RE', 'Romania': 'RO', 'Russian Federation': 'RU', 'Rwanda': 'RW',
200            'Saint Helena': 'SH', 'Saint Kitts and Nevis': 'KN', 'Saint Lucia': 'LC',
201            'Saint Pierre and Miquelon': 'PM', 'Saint Vincent and the Grenadines': 'VC',
202            'Samoa': 'WS', 'San Marino': 'SM', 'Sao Tome and Principe': 'ST', 'Saudi Arabia': 'SA',
203            'Senegal': 'SN', 'Serbia': 'RS', 'Seychelles': 'SC', 'Sierra Leone': 'SL',
204            'Singapore': 'SG', 'Solomon Islands': 'SB', 'Somalia': 'SO', 'South Africa': 'ZA',
205            'Slovakia': 'SK','Slovenia': 'SI','South Georgia and the South Sandwich Islands': 'GS',
206            'Spain': 'ES', 'Sri Lanka': 'LK', 'Sudan': 'SD', 'Suriname': 'SR',
207            'Svalbard and Jan Mayen': 'SJ', 'Syrian Arab Republic': 'SY', 'Swaziland': 'SZ',
208            'Sweden': 'SE', 'Switzerland': 'CH',
209            'Taiwan': 'TW', 'Tajikistan': 'TJ', 'Tanzania, United Republic of': 'TZ',
210            'Thailand': 'TH', 'Timor-Leste': 'TL', 'Togo': 'TG', 'Tokelau': 'TK', 'Tonga': 'TO',
211            'Trinidad and Tobago': 'TT', 'Tunisia': 'TN', 'Turkey': 'TR', 'Turkmenistan': 'TM',
212            'Turks and Caicos Islands': 'TC', 'Tuvalu': 'TV',
213            'Uganda': 'UG', 'Ukraine': 'UA', 'United Arab Emirates': 'AE',
214            'United Kingdom':'GB','United States':'US','United States Minor Outlying Islands':'UM',
215            'Uruguay': 'UY', 'Uzbekistan': 'UZ',
216            'Vanuatu': 'VU', 'Venezuela': 'VE', 'Viet Nam': 'VN', 'Virgin Islands, British': 'VG',
217            'Virgin Islands, U.s.': 'VI',
218            'Yemen': 'YE',
219            'Wallis and Futuna': 'WF', 'Western Sahara': 'EH',
220            'Zambia': 'ZM', 'Zimbabwe': 'ZW'
221        }, inplace=True)
222
223        return user_table
224
225    def process_artist_table(self) -> Optional[DatasetTableConfig]:
226        """Process the artist table.
227
228        Creates the artist table with the musicbrainzID and gender information when available.
229
230        Returns:
231            the artist table configuration or None on failure.
232        """
233        artist_key = ['artist_id']
234        artist_columns = ['artist_name']
235
236        # connect artist id to name
237        artist_table = pd.DataFrame(
238            list(enumerate(self.artist_list)),
239            columns=artist_key + artist_columns
240        )
241
242        # merge the artist musicbrainzID on name
243        artist_table = pd.merge(artist_table, self.artist_mb_id, how='left', on='artist_name')
244        artist_table['artist_mbID'].fillna(-1, inplace=True)
245        artist_columns += ['artist_mbID']
246
247        artist_gender = self.load_artist_gender_json()
248        if artist_gender is not None:
249            # merge artists with gender and update columns
250            artist_table = pd.merge(artist_table, artist_gender, how='left', on='artist_mbID')
251            artist_columns += ['artist_gender']
252
253        # create artist table configuration
254        artist_table_config = create_dataset_table_config(
255            TABLE_FILE_PREFIX + self.dataset_name + '_artists.tsv.bz2',
256            artist_key,
257            artist_columns,
258            compression='bz2',
259            num_records=len(self.artist_list)
260        )
261
262        # store the generated artist table
263        artist_table_config.save_table(artist_table, self.dataset_dir)
264
265        return artist_table_config
266
267    def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]:
268        """Process the user-artist-count matrix.
269
270        The user-item matrix is stored in a file that also contains a musicbrainzID.
271        The users are hashes and the items are names, both are converted to integers
272        to comply to the CSR compatible format. In addition, any rows that contain
273        corrupt data are removed in the process.
274
275        Returns:
276            the matrix configuration or None on failure.
277        """
278        try:
279            dataframe = pd.read_table(
280                os.path.join(self.dataset_dir, 'usersha1-artmbid-artname-plays.tsv'),
281                names=['user_sha', 'artist_mbID', 'artist_name', 'matrix_count']
282            )
283        except FileNotFoundError:
284            return None
285
286        # remove rows from a user that is not a hash
287        dataframe = dataframe[dataframe['user_sha'] != 'sep 20, 2008']
288
289        # map users/items to category and ratings to be floating-point
290        dataframe['user_sha'] = dataframe['user_sha'].astype("category")
291        dataframe['artist_name'] = dataframe['artist_name'].astype("category")
292        dataframe['matrix_count'] = dataframe['matrix_count'].astype(float)
293
294        # remove rows that contain items that failed to map to category
295        dataframe = dataframe[dataframe['artist_name'].cat.codes >= 0]
296        # remove rows that have unusable ratings
297        dataframe = dataframe[dataframe['matrix_count'] > 0]
298
299        dataframe.drop_duplicates(subset=['user_sha', 'artist_name'], inplace=True)
300
301        # extract user/item indirection arrays
302        self.user_list = list(dataframe['user_sha'].cat.categories)
303        self.artist_list = list(dataframe['artist_name'].cat.categories)
304
305        # extract artist name/musicbrainzID combinations
306        self.artist_mb_id = dataframe[['artist_name', 'artist_mbID']]
307        # remove duplicates combinations
308        self.artist_mb_id = self.artist_mb_id.drop_duplicates().dropna()
309        # remove duplicates where the artist has more than one musicbrainzID
310        self.artist_mb_id = self.artist_mb_id.drop_duplicates(subset='artist_name')
311
312        # add the correct user/item integers
313        dataframe['user_id'] = dataframe['user_sha'].cat.codes.copy()
314        dataframe['artist_id'] = dataframe['artist_name'].cat.codes.copy()
315
316        # create matrix by removing other columns
317        user_artist_matrix = dataframe[['user_id', 'artist_id', 'matrix_count']]
318        user_artist_matrix_table_config = create_dataset_table_config(
319            TABLE_FILE_PREFIX + self.dataset_name + '_user-artist-count_matrix.tsv.bz2',
320            ['user_id', 'artist_id'],
321            ['matrix_count'],
322            compression='bz2',
323            foreign_keys=['user_id', 'artist_id']
324        )
325
326        # store the resulting matrix
327        user_artist_matrix_table_config.save_table(user_artist_matrix, self.dataset_dir)
328
329        return self.process_matrix(user_artist_matrix_table_config)
330
331    def process_user_table(self) -> Optional[DatasetTableConfig]:
332        """Process the user table.
333
334        Extends the original user table with unique user ids.
335
336        Returns:
337            the user table configuration or None on failure.
338        """
339        user_table_config = self.create_user_table_config()
340        # connect user id to sha
341        user_sha_ids = pd.DataFrame(
342            list(enumerate(self.user_list)),
343            columns=['user_id', 'user_sha']
344        )
345
346        # load original user table and when available, add it to user id/sha
347        user_table = self.load_user_table()
348        if user_table is None:
349            user_table = user_sha_ids
350        else:
351            for i in range(1, len(user_table.columns)):
352                user_table_config.columns += [user_table.columns[i]]
353
354            # join user table with user ids
355            user_table = pd.merge(user_sha_ids, user_table, how='left', on='user_sha')
356
357            # fill unknown user age with -1 and cast back to int
358            user_table['user_age'].fillna(-1.0, inplace=True)
359            user_table['user_age'] = user_table['user_age'].astype(int)
360
361        # store the generated user table
362        user_table_config.save_table(user_table, self.dataset_dir)
363
364        return user_table_config

DatasetProcessor for the LastFM-360K dataset.

The dataset can be downloaded from the website below. https://www.upf.edu/web/mtg/lastfm360k

The enriched artist gender information can be retrieved from: https://zenodo.org/record/3748787#.YowEBqhByUk

The processor handles the following files:

usersha1-artmbid-artname-plays.tsv (required) usersha1-profile.tsv (optional) lfm-360-gender.json (optional)

DatasetProcessorLFM360K(dataset_dir: str, dataset_name: str)
39    def __init__(self, dataset_dir: str, dataset_name: str):
40        """Construct the DatasetProcessorLFM360K.
41
42        Args:
43            dataset_name: path of the dataset directory.
44            dataset_name: name of the dataset (processor).
45        """
46        DatasetProcessorLFM.__init__(self, dataset_dir, dataset_name)
47        # buffer for the user sha and artist name lists
48        self.user_list = None
49        self.artist_list = None
50        # buffer for the artist name/musicbrainzID dataframe
51        self.artist_mb_id = None

Construct the DatasetProcessorLFM360K.

Args: dataset_dir: path of the dataset directory. dataset_name: name of the dataset (processor).

def create_listening_events_config( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
53    def create_listening_events_config(self) -> Optional[DatasetTableConfig]:
54        """Create the listening event table configuration.
55
56        No listening events are available for this dataset.
57
58        Returns:
59            None.
60        """
61        return None

Create the listening event table configuration.

No listening events are available for this dataset.

Returns: None.

def create_user_table_config(self) -> src.fairreckitlib.data.set.dataset_config.DatasetTableConfig:
63    def create_user_table_config(self) -> DatasetTableConfig:
64        """Create the user table configuration.
65
66        The base user configuration that contains the generated user ids
67        and corresponding user sha.
68
69        Returns:
70            the configuration of the user table.
71        """
72        return create_dataset_table_config(
73            TABLE_FILE_PREFIX + self.dataset_name + '_users.tsv.bz2',
74            ['user_id'],
75            ['user_sha'],
76            compression='bz2',
77            num_records=len(self.user_list)
78        )

Create the user table configuration.

The base user configuration that contains the generated user ids and corresponding user sha.

Returns: the configuration of the user table.

def get_matrix_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]]]]:
80    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
81        """Get matrix configuration processors.
82
83        Returns:
84            a list containing the user-artist-count matrix processor.
85        """
86        return [('user-artist-count', self.process_user_artist_matrix)]

Get matrix configuration processors.

Returns: a list containing the user-artist-count matrix processor.

def get_table_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]]]]:
88    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
89        """Get table configuration processors.
90
91        Returns:
92            a list containing the artist and user table processors.
93        """
94        return DatasetProcessorLFM.get_table_configs(self) + [('artist',self.process_artist_table)]

Get table configuration processors.

Returns: a list containing the artist and user table processors.

def load_artist_gender_json(self) -> Optional[pandas.core.frame.DataFrame]:
 96    def load_artist_gender_json(self) -> Optional[pd.DataFrame]:
 97        """Load the artist gender json file.
 98
 99        Returns:
100            the loaded artist musicbrainzID/gender table or None on failure.
101        """
102        try:
103            gender_table = pd.read_json(
104                os.path.join(self.dataset_dir, 'lfm-360-gender.json'),
105                orient='index'
106            )
107            gender_table.reset_index(inplace=True)
108            gender_table.rename(columns={'index': 'artist_mbID', 0: 'artist_gender'}, inplace=True)
109            return gender_table
110        except FileNotFoundError:
111            return None

Load the artist gender json file.

Returns: the loaded artist musicbrainzID/gender table or None on failure.

def load_user_table(self) -> Optional[pandas.core.frame.DataFrame]:
113    def load_user_table(self) -> Optional[pd.DataFrame]:
114        """Load the original user table.
115
116        Changes the contents of the age and gender columns to be more user-friendly,
117        and the contents of the country column to ISO 3166 Alpha-2 country codes.
118
119        Returns:
120            the loaded user table on None on failure.
121        """
122        user_table_columns = [
123            'user_sha',
124            'user_gender',
125            'user_age',
126            'user_country',
127            'user_signup'
128        ]
129
130        try:
131            # load original user table
132            user_table = pd.read_table(
133                os.path.join(self.dataset_dir, 'usersha1-profile.tsv'),
134                names=user_table_columns,
135                sep='\t'
136            )
137        except FileNotFoundError:
138            return None
139
140        # mask user age not between 1-100 as NaN
141        user_table['user_age'].mask(user_table['user_age'].lt(1), inplace=True)
142        user_table['user_age'].mask(user_table['user_age'].gt(100), inplace=True)
143
144        # convert gender to more user-friendly names
145        user_table['user_gender'].replace({'m': 'Male', 'f': 'Female'}, inplace=True)
146
147        # convert country to ISO 3166 Alpha-2 country code
148        user_table['user_country'].replace({
149            'Afghanistan': 'AF', 'Albania': 'AL', 'Algeria': 'DZ', 'American Samoa': 'AS',
150            'Andorra': 'AD', 'Angola': 'AO', 'Anguilla': 'AI', 'Antarctica': 'AQ',
151            'Antigua and Barbuda': 'AG', 'Argentina': 'AR', 'Armenia': 'AM', 'Aruba': 'AW',
152            'Australia': 'AU', 'Austria': 'AT', 'Azerbaijan': 'AZ',
153            'Bahamas': 'BS', 'Bahrain': 'BH', 'Bangladesh': 'BD', 'Barbados': 'BB',
154            'Belarus': 'BY', 'Belgium': 'BE', 'Belize': 'BZ', 'Benin': 'BJ', 'Bermuda': 'BM',
155            'Bhutan': 'BT', 'Bolivia': 'BO', 'Bosnia and Herzegovina': 'BA', 'Botswana': 'BW',
156            'Bouvet Island': 'BV', 'Brazil': 'BR', 'British Indian Ocean Territory': 'IO',
157            'Brunei Darussalam': 'BN', 'Bulgaria': 'BG', 'Burkina Faso': 'BF', 'Burundi': 'BI',
158            'Cambodia': 'KH', 'Cameroon': 'CM', 'Canada': 'CA', 'Cape Verde': 'CV',
159            'Cayman Islands': 'KY', 'Central African Republic': 'CF', 'Chad': 'TD', 'Chile': 'CL',
160            'China': 'CN', 'Christmas Island': 'CX', 'Cocos (Keeling) Islands': 'CC',
161            'Colombia': 'CO', 'Comoros': 'KM', 'Congo': 'CG', 'Czech Republic': 'CZ',
162            'Congo, the Democratic Republic of the': 'CD', 'Cook Islands': 'CK', 'Cyprus': 'CY',
163            'Costa Rica': 'CR', 'Cote D\'Ivoire': 'CI', 'Croatia': 'HR', 'Cuba': 'CU',
164            'Denmark': 'DK', 'Djibouti': 'DJ', 'Dominica': 'DM', 'Dominican Republic': 'DO',
165            'Ecuador': 'EC', 'Egypt': 'EG', 'El Salvador': 'SV', 'Equatorial Guinea': 'GQ',
166            'Eritrea': 'ER', 'Estonia': 'EE', 'Ethiopia': 'ET',
167            'Falkland Islands (Malvinas)': 'FK', 'Faroe Islands': 'FO', 'Fiji': 'FJ',
168            'Finland': 'FI', 'France': 'FR', 'French Guiana': 'GF', 'French Polynesia': 'PF',
169            'French Southern Territories': 'TF',
170            'Gabon': 'GA', 'Gambia': 'GM', 'Georgia': 'GE', 'Germany': 'DE', 'Ghana': 'GH',
171            'Grenada': 'GD', 'Gibraltar': 'GI', 'Greece': 'GR', 'Greenland': 'GL',
172            'Guadeloupe': 'GP', 'Guam': 'GU', 'Guatemala': 'GT', 'Guinea-Bissau': 'GW',
173            'Guyana': 'GY',
174            'Haiti': 'HT', 'Heard Island and Mcdonald Islands': 'HM', 'Hungary': 'HU',
175            'Holy See (Vatican City State)': 'VA', 'Honduras': 'HN', 'Hong Kong': 'HK',
176            'Iceland': 'IS', 'India': 'IN', 'Indonesia': 'ID', 'Iran, Islamic Republic of': 'IR',
177            'Iraq': 'IQ', 'Ireland': 'IE', 'Israel': 'IL', 'Italy': 'IT',
178            'Jamaica': 'JM', 'Japan': 'JP', 'Jordan': 'JO',
179            'Kazakhstan': 'KZ', 'Kenya': 'KE', 'Kiribati': 'KI', 'Kyrgyzstan': 'KG',
180            'Korea, Democratic People\'s Republic of':'KP','Korea, Republic of':'KR','Kuwait':'KW',
181            'Lao People\'s Democratic Republic': 'LA','Latvia': 'LV','Lebanon': 'LB',
182            'Lesotho': 'LS', 'Liberia': 'LR', 'Libyan Arab Jamahiriya': 'LY',
183            'Liechtenstein': 'LI', 'Lithuania': 'LT', 'Luxembourg': 'LU',
184            'Macao': 'MO', 'Macedonia': 'MK', 'Madagascar': 'MG', 'Mali': 'ML', 'Malta': 'MT',
185            'Malaysia': 'MY', 'Malawi': 'MW', 'Maldives': 'MV', 'Mauritania': 'MR',
186            'Mauritius': 'MU', 'Marshall Islands': 'MH', 'Martinique': 'MQ', 'Mayotte': 'YT',
187            'Mexico': 'MX', 'Micronesia, Federated States of': 'FM','Moldova': 'MD','Monaco': 'MC',
188            'Mongolia': 'MN', 'Montenegro': 'ME', 'Montserrat':  'MS', 'Morocco': 'MA',
189            'Mozambique': 'MZ', 'Myanmar': 'MM',
190            'Namibia': 'NA', 'Nauru': 'NR', 'Nepal': 'NP',
191            'Netherlands': 'NL', 'Netherlands Antilles': 'NL', 'New Caledonia': 'NC',
192            'New Zealand': 'NZ', 'Nicaragua': 'NI', 'Niger': 'NE', 'Nigeria': 'NG', 'Niue': 'NU',
193            'Norfolk Island': 'NF', 'Northern Mariana Islands': 'MP', 'Norway': 'NO',
194            'Oman': 'OM',
195            'Pakistan': 'PK', 'Palau': 'PW','Palestinian Territory, Occupied': 'PS','Panama': 'PA',
196            'Papua New Guinea': 'PG', 'Paraguay': 'PY', 'Peru': 'PE', 'Philippines': 'PH',
197            'Pitcairn': 'PN', 'Poland': 'PL', 'Portugal': 'PT', 'Puerto Rico': 'PR',
198            'Qatar': 'QA',
199            'Reunion': 'RE', 'Romania': 'RO', 'Russian Federation': 'RU', 'Rwanda': 'RW',
200            'Saint Helena': 'SH', 'Saint Kitts and Nevis': 'KN', 'Saint Lucia': 'LC',
201            'Saint Pierre and Miquelon': 'PM', 'Saint Vincent and the Grenadines': 'VC',
202            'Samoa': 'WS', 'San Marino': 'SM', 'Sao Tome and Principe': 'ST', 'Saudi Arabia': 'SA',
203            'Senegal': 'SN', 'Serbia': 'RS', 'Seychelles': 'SC', 'Sierra Leone': 'SL',
204            'Singapore': 'SG', 'Solomon Islands': 'SB', 'Somalia': 'SO', 'South Africa': 'ZA',
205            'Slovakia': 'SK','Slovenia': 'SI','South Georgia and the South Sandwich Islands': 'GS',
206            'Spain': 'ES', 'Sri Lanka': 'LK', 'Sudan': 'SD', 'Suriname': 'SR',
207            'Svalbard and Jan Mayen': 'SJ', 'Syrian Arab Republic': 'SY', 'Swaziland': 'SZ',
208            'Sweden': 'SE', 'Switzerland': 'CH',
209            'Taiwan': 'TW', 'Tajikistan': 'TJ', 'Tanzania, United Republic of': 'TZ',
210            'Thailand': 'TH', 'Timor-Leste': 'TL', 'Togo': 'TG', 'Tokelau': 'TK', 'Tonga': 'TO',
211            'Trinidad and Tobago': 'TT', 'Tunisia': 'TN', 'Turkey': 'TR', 'Turkmenistan': 'TM',
212            'Turks and Caicos Islands': 'TC', 'Tuvalu': 'TV',
213            'Uganda': 'UG', 'Ukraine': 'UA', 'United Arab Emirates': 'AE',
214            'United Kingdom':'GB','United States':'US','United States Minor Outlying Islands':'UM',
215            'Uruguay': 'UY', 'Uzbekistan': 'UZ',
216            'Vanuatu': 'VU', 'Venezuela': 'VE', 'Viet Nam': 'VN', 'Virgin Islands, British': 'VG',
217            'Virgin Islands, U.s.': 'VI',
218            'Yemen': 'YE',
219            'Wallis and Futuna': 'WF', 'Western Sahara': 'EH',
220            'Zambia': 'ZM', 'Zimbabwe': 'ZW'
221        }, inplace=True)
222
223        return user_table

Load the original user table.

Changes the contents of the age and gender columns to be more user-friendly, and the contents of the country column to ISO 3166 Alpha-2 country codes.

Returns: the loaded user table or None on failure.

def process_artist_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
225    def process_artist_table(self) -> Optional[DatasetTableConfig]:
226        """Process the artist table.
227
228        Creates the artist table with the musicbrainzID and gender information when available.
229
230        Returns:
231            the artist table configuration or None on failure.
232        """
233        artist_key = ['artist_id']
234        artist_columns = ['artist_name']
235
236        # connect artist id to name
237        artist_table = pd.DataFrame(
238            list(enumerate(self.artist_list)),
239            columns=artist_key + artist_columns
240        )
241
242        # merge the artist musicbrainzID on name
243        artist_table = pd.merge(artist_table, self.artist_mb_id, how='left', on='artist_name')
244        artist_table['artist_mbID'].fillna(-1, inplace=True)
245        artist_columns += ['artist_mbID']
246
247        artist_gender = self.load_artist_gender_json()
248        if artist_gender is not None:
249            # merge artists with gender and update columns
250            artist_table = pd.merge(artist_table, artist_gender, how='left', on='artist_mbID')
251            artist_columns += ['artist_gender']
252
253        # create artist table configuration
254        artist_table_config = create_dataset_table_config(
255            TABLE_FILE_PREFIX + self.dataset_name + '_artists.tsv.bz2',
256            artist_key,
257            artist_columns,
258            compression='bz2',
259            num_records=len(self.artist_list)
260        )
261
262        # store the generated artist table
263        artist_table_config.save_table(artist_table, self.dataset_dir)
264
265        return artist_table_config

Process the artist table.

Creates the artist table with the musicbrainzID and gender information when available.

Returns: the artist table configuration or None on failure.

def process_user_artist_matrix( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]:
267    def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]:
268        """Process the user-artist-count matrix.
269
270        The user-item matrix is stored in a file that also contains a musicbrainzID.
271        The users are hashes and the items are names, both are converted to integers
272        to comply to the CSR compatible format. In addition, any rows that contain
273        corrupt data are removed in the process.
274
275        Returns:
276            the matrix configuration or None on failure.
277        """
278        try:
279            dataframe = pd.read_table(
280                os.path.join(self.dataset_dir, 'usersha1-artmbid-artname-plays.tsv'),
281                names=['user_sha', 'artist_mbID', 'artist_name', 'matrix_count']
282            )
283        except FileNotFoundError:
284            return None
285
286        # remove rows from a user that is not a hash
287        dataframe = dataframe[dataframe['user_sha'] != 'sep 20, 2008']
288
289        # map users/items to category and ratings to be floating-point
290        dataframe['user_sha'] = dataframe['user_sha'].astype("category")
291        dataframe['artist_name'] = dataframe['artist_name'].astype("category")
292        dataframe['matrix_count'] = dataframe['matrix_count'].astype(float)
293
294        # remove rows that contain items that failed to map to category
295        dataframe = dataframe[dataframe['artist_name'].cat.codes >= 0]
296        # remove rows that have unusable ratings
297        dataframe = dataframe[dataframe['matrix_count'] > 0]
298
299        dataframe.drop_duplicates(subset=['user_sha', 'artist_name'], inplace=True)
300
301        # extract user/item indirection arrays
302        self.user_list = list(dataframe['user_sha'].cat.categories)
303        self.artist_list = list(dataframe['artist_name'].cat.categories)
304
305        # extract artist name/musicbrainzID combinations
306        self.artist_mb_id = dataframe[['artist_name', 'artist_mbID']]
307        # remove duplicates combinations
308        self.artist_mb_id = self.artist_mb_id.drop_duplicates().dropna()
309        # remove duplicates where the artist has more than one musicbrainzID
310        self.artist_mb_id = self.artist_mb_id.drop_duplicates(subset='artist_name')
311
312        # add the correct user/item integers
313        dataframe['user_id'] = dataframe['user_sha'].cat.codes.copy()
314        dataframe['artist_id'] = dataframe['artist_name'].cat.codes.copy()
315
316        # create matrix by removing other columns
317        user_artist_matrix = dataframe[['user_id', 'artist_id', 'matrix_count']]
318        user_artist_matrix_table_config = create_dataset_table_config(
319            TABLE_FILE_PREFIX + self.dataset_name + '_user-artist-count_matrix.tsv.bz2',
320            ['user_id', 'artist_id'],
321            ['matrix_count'],
322            compression='bz2',
323            foreign_keys=['user_id', 'artist_id']
324        )
325
326        # store the resulting matrix
327        user_artist_matrix_table_config.save_table(user_artist_matrix, self.dataset_dir)
328
329        return self.process_matrix(user_artist_matrix_table_config)

Process the user-artist-count matrix.

The user-item matrix is stored in a file that also contains a musicbrainzID. The users are hashes and the items are names; both are converted to integers to comply with the CSR-compatible format. In addition, any rows that contain corrupt data are removed in the process.

Returns: the matrix configuration or None on failure.

def process_user_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
331    def process_user_table(self) -> Optional[DatasetTableConfig]:
332        """Process the user table.
333
334        Extends the original user table with unique user ids.
335
336        Returns:
337            the user table configuration or None on failure.
338        """
339        user_table_config = self.create_user_table_config()
340        # connect user id to sha
341        user_sha_ids = pd.DataFrame(
342            list(enumerate(self.user_list)),
343            columns=['user_id', 'user_sha']
344        )
345
346        # load original user table and when available, add it to user id/sha
347        user_table = self.load_user_table()
348        if user_table is None:
349            user_table = user_sha_ids
350        else:
351            for i in range(1, len(user_table.columns)):
352                user_table_config.columns += [user_table.columns[i]]
353
354            # join user table with user ids
355            user_table = pd.merge(user_sha_ids, user_table, how='left', on='user_sha')
356
357            # fill unknown user age with -1 and cast back to int
358            user_table['user_age'].fillna(-1.0, inplace=True)
359            user_table['user_age'] = user_table['user_age'].astype(int)
360
361        # store the generated user table
362        user_table_config.save_table(user_table, self.dataset_dir)
363
364        return user_table_config

Process the user table.

Extends the original user table with unique user ids.

Returns: the user table configuration or None on failure.