src.fairreckitlib.data.set.processor.dataset_processor

DatasetProcessorLFM360K(dataset_dir: str, dataset_name: str) View Source

39    def __init__(self, dataset_dir: str, dataset_name: str):
40        """Construct the DatasetProcessorLFM360K.
41
42        Args:
43            dataset_dir: path of the dataset directory.
44            dataset_name: name of the dataset (processor).
45        """
46        DatasetProcessorLFM.__init__(self, dataset_dir, dataset_name)
47        # buffers for the user sha and artist name lists (filled by process_user_artist_matrix)
48        self.user_list = None
49        self.artist_list = None
50        # buffer for the artist name/musicbrainzID dataframe (filled by process_user_artist_matrix)
51        self.artist_mb_id = None

Construct the DatasetProcessorLFM360K.

Args: dataset_dir: path of the dataset directory. dataset_name: name of the dataset (processor).

def create_listening_events_config( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

53    def create_listening_events_config(self) -> Optional[DatasetTableConfig]:
54        """Create the listening event table configuration.
55
56        No listening events are available for this dataset.
57
58        Returns:
59            None, always: there is no listening event table to configure.
60        """
61        return None

Create the listening event table configuration.

No listening events are available for this dataset.

Returns: None.

def create_user_table_config(self) -> src.fairreckitlib.data.set.dataset_config.DatasetTableConfig: View Source

63    def create_user_table_config(self) -> DatasetTableConfig:
64        """Create the user table configuration.
65
66        The base user configuration that contains the generated user ids
67        and corresponding user sha; the record count is the length of self.user_list.
68
69        Returns:
70            the configuration of the user table.
71        """
72        return create_dataset_table_config(
73            TABLE_FILE_PREFIX + self.dataset_name + '_users.tsv.bz2',
74            ['user_id'],
75            ['user_sha'],
76            compression='bz2',
77            num_records=len(self.user_list)  # user_list is None after __init__; set by process_user_artist_matrix
78        )

Create the user table configuration.

The base user configuration that contains the generated user ids and corresponding user sha.

Returns: the configuration of the user table.

def get_matrix_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]]]]: View Source

80    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
81        """Get matrix configuration processors.
82
83        Returns:
84            a list with one (name, processor) tuple for the user-artist-count matrix.
85        """
86        return [('user-artist-count', self.process_user_artist_matrix)]

Get matrix configuration processors.

Returns: a list containing the user-artist-count matrix processor.

def get_table_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]]]]: View Source

88    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
89        """Get table configuration processors.
90
91        Returns:
92            the DatasetProcessorLFM table processors followed by the ('artist', processor) tuple.
93        """
94        return DatasetProcessorLFM.get_table_configs(self) + [('artist',self.process_artist_table)]

Get table configuration processors.

Returns: a list containing the artist and user table processors.

def load_artist_gender_json(self) -> Optional[pandas.core.frame.DataFrame]: View Source

 96    def load_artist_gender_json(self) -> Optional[pd.DataFrame]:
 97        """Load the artist gender json file 'lfm-360-gender.json' from the dataset directory.
 98
 99        Returns:
100            the artist_mbID/artist_gender table or None when the file is not found.
101        """
102        try:
103            gender_table = pd.read_json(
104                os.path.join(self.dataset_dir, 'lfm-360-gender.json'),
105                orient='index'
106            )
107            gender_table.reset_index(inplace=True)
108            gender_table.rename(columns={'index': 'artist_mbID', 0: 'artist_gender'}, inplace=True)
109            return gender_table
110        except FileNotFoundError:
111            return None

Load the artist gender json file.

Returns: the loaded artist musicbrainzID/gender table or None on failure.

def load_user_table(self) -> Optional[pandas.core.frame.DataFrame]: View Source

113    def load_user_table(self) -> Optional[pd.DataFrame]:
114        """Load the original user table.
115
116        Changes the contents of the age and gender columns to be more user-friendly,
117        and the contents of the country column to ISO 3166 Alpha-2 country codes.
118
119        Returns:
120            the loaded user table or None when the file is not found.
121        """
122        user_table_columns = [
123            'user_sha',
124            'user_gender',
125            'user_age',
126            'user_country',
127            'user_signup'
128        ]
129
130        try:
131            # load original user table (the file is read without a header row, names are supplied)
132            user_table = pd.read_table(
133                os.path.join(self.dataset_dir, 'usersha1-profile.tsv'),
134                names=user_table_columns,
135                sep='\t'
136            )
137        except FileNotFoundError:
138            return None
139
140        # mask user age not between 1-100 as NaN (NOTE(review): inplace=True on a column selection; under pandas Copy-on-Write this may not update user_table - confirm)
141        user_table['user_age'].mask(user_table['user_age'].lt(1), inplace=True)
142        user_table['user_age'].mask(user_table['user_age'].gt(100), inplace=True)
143
144        # convert gender to more user-friendly names (NOTE(review): same chained inplace pattern - confirm)
145        user_table['user_gender'].replace({'m': 'Male', 'f': 'Female'}, inplace=True)
146
147        # convert country to ISO 3166 Alpha-2 country code; names missing from the mapping are left unchanged
148        user_table['user_country'].replace({
149            'Afghanistan': 'AF', 'Albania': 'AL', 'Algeria': 'DZ', 'American Samoa': 'AS',
150            'Andorra': 'AD', 'Angola': 'AO', 'Anguilla': 'AI', 'Antarctica': 'AQ',
151            'Antigua and Barbuda': 'AG', 'Argentina': 'AR', 'Armenia': 'AM', 'Aruba': 'AW',
152            'Australia': 'AU', 'Austria': 'AT', 'Azerbaijan': 'AZ',
153            'Bahamas': 'BS', 'Bahrain': 'BH', 'Bangladesh': 'BD', 'Barbados': 'BB',
154            'Belarus': 'BY', 'Belgium': 'BE', 'Belize': 'BZ', 'Benin': 'BJ', 'Bermuda': 'BM',
155            'Bhutan': 'BT', 'Bolivia': 'BO', 'Bosnia and Herzegovina': 'BA', 'Botswana': 'BW',
156            'Bouvet Island': 'BV', 'Brazil': 'BR', 'British Indian Ocean Territory': 'IO',
157            'Brunei Darussalam': 'BN', 'Bulgaria': 'BG', 'Burkina Faso': 'BF', 'Burundi': 'BI',
158            'Cambodia': 'KH', 'Cameroon': 'CM', 'Canada': 'CA', 'Cape Verde': 'CV',
159            'Cayman Islands': 'KY', 'Central African Republic': 'CF', 'Chad': 'TD', 'Chile': 'CL',
160            'China': 'CN', 'Christmas Island': 'CX', 'Cocos (Keeling) Islands': 'CC',
161            'Colombia': 'CO', 'Comoros': 'KM', 'Congo': 'CG', 'Czech Republic': 'CZ',
162            'Congo, the Democratic Republic of the': 'CD', 'Cook Islands': 'CK', 'Cyprus': 'CY',
163            'Costa Rica': 'CR', 'Cote D\'Ivoire': 'CI', 'Croatia': 'HR', 'Cuba': 'CU',
164            'Denmark': 'DK', 'Djibouti': 'DJ', 'Dominica': 'DM', 'Dominican Republic': 'DO',
165            'Ecuador': 'EC', 'Egypt': 'EG', 'El Salvador': 'SV', 'Equatorial Guinea': 'GQ',
166            'Eritrea': 'ER', 'Estonia': 'EE', 'Ethiopia': 'ET',
167            'Falkland Islands (Malvinas)': 'FK', 'Faroe Islands': 'FO', 'Fiji': 'FJ',
168            'Finland': 'FI', 'France': 'FR', 'French Guiana': 'GF', 'French Polynesia': 'PF',
169            'French Southern Territories': 'TF',
170            'Gabon': 'GA', 'Gambia': 'GM', 'Georgia': 'GE', 'Germany': 'DE', 'Ghana': 'GH',
171            'Grenada': 'GD', 'Gibraltar': 'GI', 'Greece': 'GR', 'Greenland': 'GL',
172            'Guadeloupe': 'GP', 'Guam': 'GU', 'Guatemala': 'GT', 'Guinea-Bissau': 'GW',
173            'Guyana': 'GY',
174            'Haiti': 'HT', 'Heard Island and Mcdonald Islands': 'HM', 'Hungary': 'HU',
175            'Holy See (Vatican City State)': 'VA', 'Honduras': 'HN', 'Hong Kong': 'HK',
176            'Iceland': 'IS', 'India': 'IN', 'Indonesia': 'ID', 'Iran, Islamic Republic of': 'IR',
177            'Iraq': 'IQ', 'Ireland': 'IE', 'Israel': 'IL', 'Italy': 'IT',
178            'Jamaica': 'JM', 'Japan': 'JP', 'Jordan': 'JO',
179            'Kazakhstan': 'KZ', 'Kenya': 'KE', 'Kiribati': 'KI', 'Kyrgyzstan': 'KG',
180            'Korea, Democratic People\'s Republic of':'KP','Korea, Republic of':'KR','Kuwait':'KW',
181            'Lao People\'s Democratic Republic': 'LA','Latvia': 'LV','Lebanon': 'LB',
182            'Lesotho': 'LS', 'Liberia': 'LR', 'Libyan Arab Jamahiriya': 'LY',
183            'Liechtenstein': 'LI', 'Lithuania': 'LT', 'Luxembourg': 'LU',
184            'Macao': 'MO', 'Macedonia': 'MK', 'Madagascar': 'MG', 'Mali': 'ML', 'Malta': 'MT',
185            'Malaysia': 'MY', 'Malawi': 'MW', 'Maldives': 'MV', 'Mauritania': 'MR',
186            'Mauritius': 'MU', 'Marshall Islands': 'MH', 'Martinique': 'MQ', 'Mayotte': 'YT',
187            'Mexico': 'MX', 'Micronesia, Federated States of': 'FM','Moldova': 'MD','Monaco': 'MC',
188            'Mongolia': 'MN', 'Montenegro': 'ME', 'Montserrat':  'MS', 'Morocco': 'MA',
189            'Mozambique': 'MZ', 'Myanmar': 'MM',
190            'Namibia': 'NA', 'Nauru': 'NR', 'Nepal': 'NP',
191            'Netherlands': 'NL', 'Netherlands Antilles': 'NL', 'New Caledonia': 'NC',  # NOTE(review): ISO 3166-1 listed Netherlands Antilles as 'AN' - confirm 'NL' is intended
192            'New Zealand': 'NZ', 'Nicaragua': 'NI', 'Niger': 'NE', 'Nigeria': 'NG', 'Niue': 'NU',
193            'Norfolk Island': 'NF', 'Northern Mariana Islands': 'MP', 'Norway': 'NO',
194            'Oman': 'OM',
195            'Pakistan': 'PK', 'Palau': 'PW','Palestinian Territory, Occupied': 'PS','Panama': 'PA',
196            'Papua New Guinea': 'PG', 'Paraguay': 'PY', 'Peru': 'PE', 'Philippines': 'PH',
197            'Pitcairn': 'PN', 'Poland': 'PL', 'Portugal': 'PT', 'Puerto Rico': 'PR',
198            'Qatar': 'QA',
199            'Reunion': 'RE', 'Romania': 'RO', 'Russian Federation': 'RU', 'Rwanda': 'RW',
200            'Saint Helena': 'SH', 'Saint Kitts and Nevis': 'KN', 'Saint Lucia': 'LC',
201            'Saint Pierre and Miquelon': 'PM', 'Saint Vincent and the Grenadines': 'VC',
202            'Samoa': 'WS', 'San Marino': 'SM', 'Sao Tome and Principe': 'ST', 'Saudi Arabia': 'SA',
203            'Senegal': 'SN', 'Serbia': 'RS', 'Seychelles': 'SC', 'Sierra Leone': 'SL',
204            'Singapore': 'SG', 'Solomon Islands': 'SB', 'Somalia': 'SO', 'South Africa': 'ZA',
205            'Slovakia': 'SK','Slovenia': 'SI','South Georgia and the South Sandwich Islands': 'GS',
206            'Spain': 'ES', 'Sri Lanka': 'LK', 'Sudan': 'SD', 'Suriname': 'SR',
207            'Svalbard and Jan Mayen': 'SJ', 'Syrian Arab Republic': 'SY', 'Swaziland': 'SZ',
208            'Sweden': 'SE', 'Switzerland': 'CH',
209            'Taiwan': 'TW', 'Tajikistan': 'TJ', 'Tanzania, United Republic of': 'TZ',
210            'Thailand': 'TH', 'Timor-Leste': 'TL', 'Togo': 'TG', 'Tokelau': 'TK', 'Tonga': 'TO',
211            'Trinidad and Tobago': 'TT', 'Tunisia': 'TN', 'Turkey': 'TR', 'Turkmenistan': 'TM',
212            'Turks and Caicos Islands': 'TC', 'Tuvalu': 'TV',
213            'Uganda': 'UG', 'Ukraine': 'UA', 'United Arab Emirates': 'AE',
214            'United Kingdom':'GB','United States':'US','United States Minor Outlying Islands':'UM',
215            'Uruguay': 'UY', 'Uzbekistan': 'UZ',
216            'Vanuatu': 'VU', 'Venezuela': 'VE', 'Viet Nam': 'VN', 'Virgin Islands, British': 'VG',
217            'Virgin Islands, U.s.': 'VI',
218            'Yemen': 'YE',
219            'Wallis and Futuna': 'WF', 'Western Sahara': 'EH',
220            'Zambia': 'ZM', 'Zimbabwe': 'ZW'
221        }, inplace=True)
222
223        return user_table

Load the original user table.

Changes the contents of the age and gender columns to be more user-friendly, and the contents of the country column to ISO 3166 Alpha-2 country codes.

Returns: the loaded user table or None on failure.

def process_artist_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

225    def process_artist_table(self) -> Optional[DatasetTableConfig]:
226        """Process the artist table.
227
228        Creates the artist table with the musicbrainzID and gender information when available.
229
230        Returns:
231            the artist table configuration (this implementation has no path that returns None).
232        """
233        artist_key = ['artist_id']
234        artist_columns = ['artist_name']
235
236        # connect artist id to name (artist_list is None after __init__; set by process_user_artist_matrix)
237        artist_table = pd.DataFrame(
238            list(enumerate(self.artist_list)),
239            columns=artist_key + artist_columns
240        )
241
242        # merge the artist musicbrainzID on name; artists without one get -1 (NOTE(review): fillna with inplace=True on a column selection; under pandas Copy-on-Write this may not update artist_table - confirm)
243        artist_table = pd.merge(artist_table, self.artist_mb_id, how='left', on='artist_name')
244        artist_table['artist_mbID'].fillna(-1, inplace=True)
245        artist_columns += ['artist_mbID']
246
247        artist_gender = self.load_artist_gender_json()
248        if artist_gender is not None:
249            # merge artists with gender and update columns
250            artist_table = pd.merge(artist_table, artist_gender, how='left', on='artist_mbID')
251            artist_columns += ['artist_gender']
252
253        # create artist table configuration
254        artist_table_config = create_dataset_table_config(
255            TABLE_FILE_PREFIX + self.dataset_name + '_artists.tsv.bz2',
256            artist_key,
257            artist_columns,
258            compression='bz2',
259            num_records=len(self.artist_list)
260        )
261
262        # store the generated artist table
263        artist_table_config.save_table(artist_table, self.dataset_dir)
264
265        return artist_table_config

Process the artist table.

Creates the artist table with the musicbrainzID and gender information when available.

Returns: the artist table configuration or None on failure.

def process_user_artist_matrix( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]: View Source

267    def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]:
268        """Process the user-artist-count matrix.
269
270        The user-item matrix is stored in a file that also contains a musicbrainzID.
271        The users are hashes and the items are names, both are converted to integers
272        to comply to the CSR compatible format. In addition, any rows that contain
273        corrupt data are removed in the process.
274
275        Returns:
276            the result of process_matrix, or None when the source file is not found.
277        """
278        try:
279            dataframe = pd.read_table(
280                os.path.join(self.dataset_dir, 'usersha1-artmbid-artname-plays.tsv'),
281                names=['user_sha', 'artist_mbID', 'artist_name', 'matrix_count']
282            )
283        except FileNotFoundError:
284            return None
285
286        # remove rows from a user that is not a hash
287        dataframe = dataframe[dataframe['user_sha'] != 'sep 20, 2008']
288
289        # map users/items to category and ratings to be floating-point
290        dataframe['user_sha'] = dataframe['user_sha'].astype("category")
291        dataframe['artist_name'] = dataframe['artist_name'].astype("category")
292        dataframe['matrix_count'] = dataframe['matrix_count'].astype(float)
293
294        # remove rows that contain items that failed to map to category
295        dataframe = dataframe[dataframe['artist_name'].cat.codes >= 0]
296        # remove rows that have unusable ratings
297        dataframe = dataframe[dataframe['matrix_count'] > 0]
298
299        dataframe.drop_duplicates(subset=['user_sha', 'artist_name'], inplace=True)
300
301        # extract user/item indirection arrays (NOTE(review): categories were fixed before the row filters above, so they may list users/artists without remaining rows - confirm)
302        self.user_list = list(dataframe['user_sha'].cat.categories)
303        self.artist_list = list(dataframe['artist_name'].cat.categories)
304
305        # extract artist name/musicbrainzID combinations
306        self.artist_mb_id = dataframe[['artist_name', 'artist_mbID']]
307        # remove duplicates combinations and rows with a missing value
308        self.artist_mb_id = self.artist_mb_id.drop_duplicates().dropna()
309        # remove duplicates where the artist has more than one musicbrainzID (the first one is kept)
310        self.artist_mb_id = self.artist_mb_id.drop_duplicates(subset='artist_name')
311
312        # add the correct user/item integers (the category codes index into user_list/artist_list)
313        dataframe['user_id'] = dataframe['user_sha'].cat.codes.copy()
314        dataframe['artist_id'] = dataframe['artist_name'].cat.codes.copy()
315
316        # create matrix by removing other columns
317        user_artist_matrix = dataframe[['user_id', 'artist_id', 'matrix_count']]
318        user_artist_matrix_table_config = create_dataset_table_config(
319            TABLE_FILE_PREFIX + self.dataset_name + '_user-artist-count_matrix.tsv.bz2',
320            ['user_id', 'artist_id'],
321            ['matrix_count'],
322            compression='bz2',
323            foreign_keys=['user_id', 'artist_id']
324        )
325
326        # store the resulting matrix
327        user_artist_matrix_table_config.save_table(user_artist_matrix, self.dataset_dir)
328
329        return self.process_matrix(user_artist_matrix_table_config)

Process the user-artist-count matrix.

The user-item matrix is stored in a file that also contains a musicbrainzID. The users are hashes and the items are names, both are converted to integers to comply to the CSR compatible format. In addition, any rows that contain corrupt data are removed in the process.

Returns: the matrix configuration or None on failure.

def process_user_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

331    def process_user_table(self) -> Optional[DatasetTableConfig]:
332        """Process the user table.
333
334        Extends the original user table with unique user ids.
335
336        Returns:
337            the user table configuration (this implementation has no path that returns None).
338        """
339        user_table_config = self.create_user_table_config()
340        # connect user id to sha (user_list is None after __init__; set by process_user_artist_matrix)
341        user_sha_ids = pd.DataFrame(
342            list(enumerate(self.user_list)),
343            columns=['user_id', 'user_sha']
344        )
345
346        # load original user table and when available, add it to user id/sha
347        user_table = self.load_user_table()
348        if user_table is None:
349            user_table = user_sha_ids
350        else:
351            for i in range(1, len(user_table.columns)):
352                user_table_config.columns += [user_table.columns[i]]
353
354            # join user table with user ids
355            user_table = pd.merge(user_sha_ids, user_table, how='left', on='user_sha')
356
357            # fill unknown user age with -1 and cast back to int (NOTE(review): fillna with inplace=True on a column selection; under pandas Copy-on-Write this may not update user_table - confirm)
358            user_table['user_age'].fillna(-1.0, inplace=True)
359            user_table['user_age'] = user_table['user_age'].astype(int)
360
361        # store the generated user table
362        user_table_config.save_table(user_table, self.dataset_dir)
363
364        return user_table_config

Process the user table.

Extends the original user table with unique user ids.

Returns: the user table configuration or None on failure.

src.fairreckitlib.data.set.processor.dataset_processor_lfm360k

Inherited Members