src.fairreckitlib.data.set.processor.dataset_processor_lfm1b

This module contains the class to process the LastFM-1B dataset.

Classes:

DatasetProcessorLFM1B: data processor implementation for the LFM-1B dataset.

This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)

  1"""This modules contains the class to process the LastFM-1B dataset.
  2
  3Classes:
  4
  5    DatasetProcessorLFM1B: data processor implementation for the LFM-1B dataset.
  6
  7This program has been developed by students from the bachelor Computer Science at
  8Utrecht University within the Software Project course.
  9© Copyright Utrecht University (Department of Information and Computing Sciences)
 10"""
 11
 12import os
 13from typing import Callable, List, Optional, Tuple
 14
 15import h5py
 16import numpy as np
 17import pandas as pd
 18from scipy import sparse
 19
 20from ..dataset_config import DATASET_RATINGS_IMPLICIT, RatingMatrixConfig
 21from ..dataset_config import \
 22    DatasetIndexConfig, DatasetMatrixConfig, DatasetTableConfig, create_dataset_table_config
 23from ..dataset_constants import TABLE_FILE_PREFIX
 24from .dataset_processor_lfm import DatasetProcessorLFM
 25
 26ALL_MUSIC_GENRES = [
 27    'rnb', 'rap', 'electronic', 'rock', 'new age', 'classical', 'reggae', 'blues', 'country',
 28    'world', 'folk', 'easy listening', 'jazz', 'vocal', 'children\'s', 'punk', 'alternative',
 29    'spoken word', 'pop', 'heavy metal'
 30]
 31
 32
 33class DatasetProcessorLFM1B(DatasetProcessorLFM):
 34    """DatasetProcessor for the LastFM-1B dataset.
 35
 36    The dataset and UGP (user genre profile) can be downloaded from the website below.
 37    http://www.cp.jku.at/datasets/LFM-1b/
 38
 39    The enriched artist gender information can be retrieved from:
 40    https://zenodo.org/record/3748787#.YowEBqhByUk
 41
 42    The processor handles the following files:
 43
 44    LFM-1b_albums.txt (optional)
 45    LFM-1b_artist_genres_allmusic.txt (optional)
 46    LFM-1b_artists.txt (optional)
 47    LFM-1b_LEs.mat (required)
 48    LFM-1b_LEs.txt (required)
 49    LFM-1b_tracks.txt (optional)
 50    LFM-1b_UGP_noPC_allmusic.txt (optional)
 51    LFM-1b_UGP_weightedPC_allmusic.txt (optional)
 52    LFM-1b_users.txt (optional)
 53    LFM-1b_users_additional.txt (optional)
 54    lfm-gender.json (optional)
 55    """
 56
 57    def create_listening_events_config(self) -> Optional[DatasetTableConfig]:
 58        """Create the listening event table configuration.
 59
 60        Returns:
 61            the configuration of the listening event table.
 62        """
 63        return create_dataset_table_config(
 64            'LFM-1b_LEs.txt',
 65            ['user_id', 'artist_id', 'album_id', 'track_id'],
 66            ['timestamp']
 67        )
 68
 69    def create_user_table_config(self) -> DatasetTableConfig:
 70        """Create the user table configuration.
 71
 72        Returns:
 73            the configuration of the user table.
 74        """
 75        return create_dataset_table_config(
 76            'LFM-1b_users.txt',
 77            ['user_id'],
 78            ['user_country', 'user_age', 'user_gender', 'user_plays', 'user_registered'],
 79            header=True
 80        )
 81
 82    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
 83        """Get matrix configuration processors.
 84
 85        Returns:
 86            a list containing the user-artist-count matrix processor.
 87        """
 88        return [('user-artist-count', self.process_user_artist_matrix)]
 89
 90    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 91        """Get table configuration processors.
 92
 93        Returns:
 94            a list containing the album, allmusic genre, artist, track and user table processors.
 95        """
 96        return DatasetProcessorLFM.get_table_configs(self) + [
 97            ('album', self.process_album_table),
 98            ('allmusic genre', self.process_genres_allmusic),
 99            ('artist', self.process_artist_table),
100            ('track', self.process_track_table),
101            ('user additional', self.process_user_additional_table),
102            ('user allmusic noPC', self.process_user_genre_allmusic_no_pc),
103            ('user allmusic weightedPC', self.process_user_genre_allmusic_weighted_pc),
104        ]
105
106    def load_artist_gender_json(self) -> Optional[pd.DataFrame]:
107        """Load the artist gender json file.
108
109        Returns:
110            the loaded artist id/gender table or None on failure.
111        """
112        try:
113            gender_table = pd.read_json(
114                os.path.join(self.dataset_dir, 'lfm-gender.json'),
115                orient='index'
116            )
117            gender_table.reset_index(inplace=True)
118            gender_table.rename(columns={'index': 'artist_id', 0: 'artist_gender'}, inplace=True)
119            return gender_table
120        except FileNotFoundError:
121            return None
122
123    def load_artist_genres_allmusic(self) -> Optional[pd.DataFrame]:
124        """Load the artist allmusic genres file.
125
126        Returns:
127            the loaded artist name/genre table or None on failure.
128        """
129        try:
130            genres = pd.read_csv(
131                os.path.join(self.dataset_dir, 'LFM-1b_artist_genres_allmusic.txt'),
132                sep='\t',
133                names=['artist_name'] + [str(i) for i in range(0, len(ALL_MUSIC_GENRES))]
134            )
135        except FileNotFoundError:
136            return None
137
138        # remove duplicate rows where artist name is the same
139        genres.drop_duplicates(subset='artist_name', inplace=True)
140        # extract and drop artist name column
141        artist_genres = pd.DataFrame(genres['artist_name'])
142        genres.drop('artist_name', inplace=True, axis=1)
143
144        # map allmusic genre id to genre name
145        for col in genres:
146            genres[col] = genres[col].map(lambda i: ALL_MUSIC_GENRES[int(i)], na_action='ignore')
147
148        # add genres column
149        artist_genres['artist_genres'] = genres.apply(lambda x: x.str.cat(sep='|'), axis=1)
150
151        return artist_genres
152
153    def process_album_table(self) -> Optional[DatasetTableConfig]:
154        """Process the album table.
155
156        Returns:
157            the album table configuration or None on failure.
158        """
159        album_table_config = create_dataset_table_config(
160            'LFM-1b_albums.txt',
161            ['album_id'],
162            ['album_name'],
163            foreign_keys=['artist_id']
164        )
165
166        try:
167            num_records = len(album_table_config.read_table(self.dataset_dir))
168            album_table_config.num_records = num_records
169            return album_table_config
170        except FileNotFoundError:
171            return None
172
173    def process_artist_table(self) -> Optional[DatasetTableConfig]:
174        """Process the artist table.
175
176        Extends the table with artist gender and genres information when available.
177
178        Returns:
179            the artist table configuration or None on failure.
180        """
181        artist_table_config = create_dataset_table_config(
182            'LFM-1b_artists.txt',
183            ['artist_id'],
184            ['artist_name']
185        )
186
187        try:
188            artist_table = artist_table_config.read_table(self.dataset_dir)
189        except FileNotFoundError:
190            artist_table = pd.DataFrame()
191            artist_table_config.columns.pop()
192
193        # add artist gender when available
194        gender_table = self.load_artist_gender_json()
195        if gender_table is not None:
196            # replace artist table when missing
197            if len(artist_table) == 0:
198                artist_table = gender_table
199            # merge artist table with gender
200            else:
201                artist_table = pd.merge(artist_table, gender_table, how='left', on='artist_id')
202            artist_table_config.columns += ['artist_gender']
203
204        # no need to continue if the previous failed
205        if len(artist_table) == 0:
206            return None
207
208        if 'artist_name' in artist_table_config.columns:
209            # attempt to load artist name / genre table
210            artist_genres = self.load_artist_genres_allmusic()
211            if artist_genres is not None:
212                # merge artist table with genres
213                artist_table = pd.merge(artist_table, artist_genres, how='left', on='artist_name')
214                artist_table_config.columns += ['artist_genres']
215
216        artist_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_artists.tsv.bz2'
217        artist_table_config.file.options.compression = 'bz2'
218        artist_table_config.num_records = len(artist_table)
219
220        # store generated artist table
221        artist_table_config.save_table(artist_table, self.dataset_dir)
222
223        return artist_table_config
224
225    def process_genres_allmusic(self) -> Optional[DatasetTableConfig]:
226        """Process the allmusic genres table.
227
228        Returns:
229            the allmusic genres table configuration or None on failure.
230        """
231        genres_allmusic_table_config = create_dataset_table_config(
232            'genres_allmusic.txt',
233            [], # row number is the primary key
234            ['allmusic_genre']
235        )
236        try:
237            genres_allmusic_table = genres_allmusic_table_config.read_table(self.dataset_dir)
238        except FileNotFoundError:
239            return None
240
241        # reset index and rename to primary key
242        genres_allmusic_table.reset_index(inplace=True)
243        genres_allmusic_table.rename(columns={0: 'allmusic_id'}, inplace=True)
244
245        genres_allmusic_table_config.primary_key = ['allmusic_id']
246        genres_allmusic_table_config.file.name = \
247            TABLE_FILE_PREFIX + self.dataset_name + '_genres_allmusic.tsv.bz2'
248        genres_allmusic_table_config.file.options.compression = 'bz2'
249        genres_allmusic_table_config.num_records = len(genres_allmusic_table)
250
251        # store generated allmusic genre table
252        genres_allmusic_table_config.save_table(genres_allmusic_table, self.dataset_dir)
253
254        return genres_allmusic_table_config
255
256
257    def process_track_table(self) -> Optional[DatasetTableConfig]:
258        """Process the track table.
259
260        Returns:
261            the track table configuration or None on failure.
262        """
263        track_table_config = create_dataset_table_config(
264            'LFM-1b_tracks.txt',
265            ['track_id'],
266            ['track_name'],
267            foreign_keys=['artist_id']
268        )
269
270        try:
271            num_records = len(track_table_config.read_table(self.dataset_dir))
272            track_table_config.num_records = num_records
273            return track_table_config
274        except FileNotFoundError:
275            return None
276
277    def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]:
278        """Process the user-artist-count matrix.
279
280        The user-item matrix is stored in a matlab file in CSR compatible format,
281        together with the user and item indices. The matrix is converted
282        to a dataframe and the indices for the indirection arrays are flattened.
283
284        Returns:
285            the matrix configuration or None on failure.
286        """
287        try:
288            mat_file = os.path.join(self.dataset_dir, 'LFM-1b_LEs.mat')
289            # load matrix as described in the paper
290            csr_matrix, idx_users, idx_artists = _load_lfm_1b_mat(mat_file)
291        except FileNotFoundError:
292            return None
293
294        matrix_name = 'user-artist-count'
295
296        # create and save user indirection array
297        user_list = list(map(lambda i: i[0], idx_users))
298        user_index_config = DatasetIndexConfig(
299            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_user_indices.hdf5',
300            'user_id',
301            len(user_list)
302        )
303        user_index_config.save_indices(self.dataset_dir, user_list)
304
305        # create and save artist indirection array
306        artist_list = list(map(lambda i: i[0], idx_artists))
307        artist_index_config = DatasetIndexConfig(
308            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_item_indices.hdf5',
309            'artist_id',
310            len(artist_list)
311        )
312        artist_index_config.save_indices(self.dataset_dir, artist_list)
313
314        # convert csr to dataframe
315        coo_matrix = pd.DataFrame.sparse.from_spmatrix(csr_matrix).sparse.to_coo()
316        user_artist_matrix = pd.DataFrame()
317        user_artist_matrix['user_id'] = coo_matrix.row
318        user_artist_matrix['artist_id'] = coo_matrix.col
319        user_artist_matrix['matrix_count'] = coo_matrix.data
320
321        # create matrix table configuration
322        user_artist_table_config = create_dataset_table_config(
323            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_matrix.tsv.bz2',
324            ['user_id', 'artist_id'],
325            ['matrix_count'],
326            compression='bz2',
327            foreign_keys=['user_id', 'artist_id'],
328            num_records=len(user_artist_matrix)
329        )
330
331        # store the resulting matrix
332        user_artist_table_config.save_table(user_artist_matrix, self.dataset_dir)
333
334        return DatasetMatrixConfig(
335            user_artist_table_config,
336            RatingMatrixConfig(
337                user_artist_matrix['matrix_count'].min(),
338                user_artist_matrix['matrix_count'].max(),
339                DATASET_RATINGS_IMPLICIT
340            ),
341            user_index_config,
342            artist_index_config
343        )
344
345    def process_user_additional_table(self) -> Optional[DatasetTableConfig]:
346        """Process the user additional table.
347
348        Returns:
349            the user additional table configuration or None on failure.
350        """
351        columns = [
352            'user_novelty artist avg month',
353            'user_novelty artist avg 6months',
354            'user_novelty artist avg year',
355            'user_mainstreaminess avg month',
356            'user_mainstreaminess avg 6months',
357            'user_mainstreaminess avg year',
358            'user_mainstreaminess global',
359            'user_count LEs',
360            'user_count distinct tracks',
361            'user_count distinct artists',
362            'user_count LEs per week'
363        ]
364
365        for i in range(1, 8):
366            columns += ['user_relative LE per weekday' + str(i)]
367        for i in range(0, 24):
368            columns += ['user_relative LE per hour' + str(i)]
369
370        user_additional_table_config = create_dataset_table_config(
371            'LFM-1b_users_additional.txt',
372            ['user_id'],
373            columns,
374            header=True
375        )
376
377        try:
378            num_records = len(user_additional_table_config.read_table(self.dataset_dir))
379            user_additional_table_config.num_records = num_records
380            return user_additional_table_config
381        except FileNotFoundError:
382            return None
383
384    def process_user_genre_allmusic_no_pc(self) -> Optional[DatasetTableConfig]:
385        """Process the user allmusic genre table.
386
387        Returns:
388            the user allmusic genre table configuration or None on failure.
389        """
390        columns = []
391        for genre_name in ALL_MUSIC_GENRES:
392            columns += ['noPC_' + genre_name]
393
394        user_genre_allmusic_no_pc_config = create_dataset_table_config(
395            'LFM-1b_UGP_noPC_allmusic.txt',
396            ['user_id'],
397            columns,
398            header=True
399        )
400        try:
401            num_records = len(user_genre_allmusic_no_pc_config.read_table(self.dataset_dir))
402            user_genre_allmusic_no_pc_config.num_records = num_records
403            return user_genre_allmusic_no_pc_config
404        except FileNotFoundError:
405            return None
406
407    def process_user_genre_allmusic_weighted_pc(self) -> Optional[DatasetTableConfig]:
408        """Process the user allmusic genre table with weighted play count.
409
410        Returns:
411            the user allmusic genre table configuration or None on failure.
412        """
413        columns = []
414        for genre_name in ALL_MUSIC_GENRES:
415            columns += ['weightedPC_' + genre_name]
416
417        user_genre_allmusic_weighted_pc_config = create_dataset_table_config(
418            'LFM-1b_UGP_noPC_allmusic.txt',
419            ['user_id'],
420            columns,
421            header=True
422        )
423        try:
424            num_records = len(user_genre_allmusic_weighted_pc_config.read_table(self.dataset_dir))
425            user_genre_allmusic_weighted_pc_config.num_records = num_records
426            return user_genre_allmusic_weighted_pc_config
427        except FileNotFoundError:
428            return None
429
430
def _load_lfm_1b_mat(file_path: str) -> Tuple[sparse.csr_matrix, np.ndarray, np.ndarray]:
    """Load the LFM-1B dataset from the matlab file.

    The matlab file is HDF5-based, so it is read with h5py.

    Args:
        file_path: the path to the matlab file.

    Returns:
        the matrix and user / artist indirection arrays.

    Raises:
        FileNotFoundError: when the matlab file does not exist.
    """
    with h5py.File(file_path, 'r') as mat_file:
        # reconstruct the sparse matrix from its raw CSR components;
        # transpose so rows are users and columns are artists
        # (NOTE(review): orientation assumed from the original code — confirm
        # against the LFM-1b paper)
        csr_matrix = sparse.csr_matrix((
            mat_file['/LEs/']['data'],
            mat_file['/LEs/']['ir'],
            mat_file['/LEs/']['jc']
        )).transpose()
        idx_users = np.array(mat_file.get('idx_users')).astype(np.int64)
        idx_artists = np.array(mat_file.get('idx_artists')).astype(np.int64)
        return csr_matrix, idx_users, idx_artists
 34class DatasetProcessorLFM1B(DatasetProcessorLFM):
 35    """DatasetProcessor for the LastFM-1B dataset.
 36
 37    The dataset and UGP (user genre profile) can be downloaded from the website below.
 38    http://www.cp.jku.at/datasets/LFM-1b/
 39
 40    The enriched artist gender information can be retrieved from:
 41    https://zenodo.org/record/3748787#.YowEBqhByUk
 42
 43    The processor handles the following files:
 44
 45    LFM-1b_albums.txt (optional)
 46    LFM-1b_artist_genres_allmusic.txt (optional)
 47    LFM-1b_artists.txt (optional)
 48    LFM-1b_LEs.mat (required)
 49    LFM-1b_LEs.txt (required)
 50    LFM-1b_tracks.txt (optional)
 51    LFM-1b_UGP_noPC_allmusic.txt (optional)
 52    LFM-1b_UGP_weightedPC_allmusic.txt (optional)
 53    LFM-1b_users.txt (optional)
 54    LFM-1b_users_additional.txt (optional)
 55    lfm-gender.json (optional)
 56    """
 57
 58    def create_listening_events_config(self) -> Optional[DatasetTableConfig]:
 59        """Create the listening event table configuration.
 60
 61        Returns:
 62            the configuration of the listening event table.
 63        """
 64        return create_dataset_table_config(
 65            'LFM-1b_LEs.txt',
 66            ['user_id', 'artist_id', 'album_id', 'track_id'],
 67            ['timestamp']
 68        )
 69
 70    def create_user_table_config(self) -> DatasetTableConfig:
 71        """Create the user table configuration.
 72
 73        Returns:
 74            the configuration of the user table.
 75        """
 76        return create_dataset_table_config(
 77            'LFM-1b_users.txt',
 78            ['user_id'],
 79            ['user_country', 'user_age', 'user_gender', 'user_plays', 'user_registered'],
 80            header=True
 81        )
 82
 83    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
 84        """Get matrix configuration processors.
 85
 86        Returns:
 87            a list containing the user-artist-count matrix processor.
 88        """
 89        return [('user-artist-count', self.process_user_artist_matrix)]
 90
 91    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 92        """Get table configuration processors.
 93
 94        Returns:
 95            a list containing the album, allmusic genre, artist, track and user table processors.
 96        """
 97        return DatasetProcessorLFM.get_table_configs(self) + [
 98            ('album', self.process_album_table),
 99            ('allmusic genre', self.process_genres_allmusic),
100            ('artist', self.process_artist_table),
101            ('track', self.process_track_table),
102            ('user additional', self.process_user_additional_table),
103            ('user allmusic noPC', self.process_user_genre_allmusic_no_pc),
104            ('user allmusic weightedPC', self.process_user_genre_allmusic_weighted_pc),
105        ]
106
107    def load_artist_gender_json(self) -> Optional[pd.DataFrame]:
108        """Load the artist gender json file.
109
110        Returns:
111            the loaded artist id/gender table or None on failure.
112        """
113        try:
114            gender_table = pd.read_json(
115                os.path.join(self.dataset_dir, 'lfm-gender.json'),
116                orient='index'
117            )
118            gender_table.reset_index(inplace=True)
119            gender_table.rename(columns={'index': 'artist_id', 0: 'artist_gender'}, inplace=True)
120            return gender_table
121        except FileNotFoundError:
122            return None
123
124    def load_artist_genres_allmusic(self) -> Optional[pd.DataFrame]:
125        """Load the artist allmusic genres file.
126
127        Returns:
128            the loaded artist name/genre table or None on failure.
129        """
130        try:
131            genres = pd.read_csv(
132                os.path.join(self.dataset_dir, 'LFM-1b_artist_genres_allmusic.txt'),
133                sep='\t',
134                names=['artist_name'] + [str(i) for i in range(0, len(ALL_MUSIC_GENRES))]
135            )
136        except FileNotFoundError:
137            return None
138
139        # remove duplicate rows where artist name is the same
140        genres.drop_duplicates(subset='artist_name', inplace=True)
141        # extract and drop artist name column
142        artist_genres = pd.DataFrame(genres['artist_name'])
143        genres.drop('artist_name', inplace=True, axis=1)
144
145        # map allmusic genre id to genre name
146        for col in genres:
147            genres[col] = genres[col].map(lambda i: ALL_MUSIC_GENRES[int(i)], na_action='ignore')
148
149        # add genres column
150        artist_genres['artist_genres'] = genres.apply(lambda x: x.str.cat(sep='|'), axis=1)
151
152        return artist_genres
153
154    def process_album_table(self) -> Optional[DatasetTableConfig]:
155        """Process the album table.
156
157        Returns:
158            the album table configuration or None on failure.
159        """
160        album_table_config = create_dataset_table_config(
161            'LFM-1b_albums.txt',
162            ['album_id'],
163            ['album_name'],
164            foreign_keys=['artist_id']
165        )
166
167        try:
168            num_records = len(album_table_config.read_table(self.dataset_dir))
169            album_table_config.num_records = num_records
170            return album_table_config
171        except FileNotFoundError:
172            return None
173
174    def process_artist_table(self) -> Optional[DatasetTableConfig]:
175        """Process the artist table.
176
177        Extends the table with artist gender and genres information when available.
178
179        Returns:
180            the artist table configuration or None on failure.
181        """
182        artist_table_config = create_dataset_table_config(
183            'LFM-1b_artists.txt',
184            ['artist_id'],
185            ['artist_name']
186        )
187
188        try:
189            artist_table = artist_table_config.read_table(self.dataset_dir)
190        except FileNotFoundError:
191            artist_table = pd.DataFrame()
192            artist_table_config.columns.pop()
193
194        # add artist gender when available
195        gender_table = self.load_artist_gender_json()
196        if gender_table is not None:
197            # replace artist table when missing
198            if len(artist_table) == 0:
199                artist_table = gender_table
200            # merge artist table with gender
201            else:
202                artist_table = pd.merge(artist_table, gender_table, how='left', on='artist_id')
203            artist_table_config.columns += ['artist_gender']
204
205        # no need to continue if the previous failed
206        if len(artist_table) == 0:
207            return None
208
209        if 'artist_name' in artist_table_config.columns:
210            # attempt to load artist name / genre table
211            artist_genres = self.load_artist_genres_allmusic()
212            if artist_genres is not None:
213                # merge artist table with genres
214                artist_table = pd.merge(artist_table, artist_genres, how='left', on='artist_name')
215                artist_table_config.columns += ['artist_genres']
216
217        artist_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_artists.tsv.bz2'
218        artist_table_config.file.options.compression = 'bz2'
219        artist_table_config.num_records = len(artist_table)
220
221        # store generated artist table
222        artist_table_config.save_table(artist_table, self.dataset_dir)
223
224        return artist_table_config
225
226    def process_genres_allmusic(self) -> Optional[DatasetTableConfig]:
227        """Process the allmusic genres table.
228
229        Returns:
230            the allmusic genres table configuration or None on failure.
231        """
232        genres_allmusic_table_config = create_dataset_table_config(
233            'genres_allmusic.txt',
234            [], # row number is the primary key
235            ['allmusic_genre']
236        )
237        try:
238            genres_allmusic_table = genres_allmusic_table_config.read_table(self.dataset_dir)
239        except FileNotFoundError:
240            return None
241
242        # reset index and rename to primary key
243        genres_allmusic_table.reset_index(inplace=True)
244        genres_allmusic_table.rename(columns={0: 'allmusic_id'}, inplace=True)
245
246        genres_allmusic_table_config.primary_key = ['allmusic_id']
247        genres_allmusic_table_config.file.name = \
248            TABLE_FILE_PREFIX + self.dataset_name + '_genres_allmusic.tsv.bz2'
249        genres_allmusic_table_config.file.options.compression = 'bz2'
250        genres_allmusic_table_config.num_records = len(genres_allmusic_table)
251
252        # store generated allmusic genre table
253        genres_allmusic_table_config.save_table(genres_allmusic_table, self.dataset_dir)
254
255        return genres_allmusic_table_config
256
257
258    def process_track_table(self) -> Optional[DatasetTableConfig]:
259        """Process the track table.
260
261        Returns:
262            the track table configuration or None on failure.
263        """
264        track_table_config = create_dataset_table_config(
265            'LFM-1b_tracks.txt',
266            ['track_id'],
267            ['track_name'],
268            foreign_keys=['artist_id']
269        )
270
271        try:
272            num_records = len(track_table_config.read_table(self.dataset_dir))
273            track_table_config.num_records = num_records
274            return track_table_config
275        except FileNotFoundError:
276            return None
277
278    def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]:
279        """Process the user-artist-count matrix.
280
281        The user-item matrix is stored in a matlab file in CSR compatible format,
282        together with the user and item indices. The matrix is converted
283        to a dataframe and the indices for the indirection arrays are flattened.
284
285        Returns:
286            the matrix configuration or None on failure.
287        """
288        try:
289            mat_file = os.path.join(self.dataset_dir, 'LFM-1b_LEs.mat')
290            # load matrix as described in the paper
291            csr_matrix, idx_users, idx_artists = _load_lfm_1b_mat(mat_file)
292        except FileNotFoundError:
293            return None
294
295        matrix_name = 'user-artist-count'
296
297        # create and save user indirection array
298        user_list = list(map(lambda i: i[0], idx_users))
299        user_index_config = DatasetIndexConfig(
300            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_user_indices.hdf5',
301            'user_id',
302            len(user_list)
303        )
304        user_index_config.save_indices(self.dataset_dir, user_list)
305
306        # create and save artist indirection array
307        artist_list = list(map(lambda i: i[0], idx_artists))
308        artist_index_config = DatasetIndexConfig(
309            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_item_indices.hdf5',
310            'artist_id',
311            len(artist_list)
312        )
313        artist_index_config.save_indices(self.dataset_dir, artist_list)
314
315        # convert csr to dataframe
316        coo_matrix = pd.DataFrame.sparse.from_spmatrix(csr_matrix).sparse.to_coo()
317        user_artist_matrix = pd.DataFrame()
318        user_artist_matrix['user_id'] = coo_matrix.row
319        user_artist_matrix['artist_id'] = coo_matrix.col
320        user_artist_matrix['matrix_count'] = coo_matrix.data
321
322        # create matrix table configuration
323        user_artist_table_config = create_dataset_table_config(
324            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_matrix.tsv.bz2',
325            ['user_id', 'artist_id'],
326            ['matrix_count'],
327            compression='bz2',
328            foreign_keys=['user_id', 'artist_id'],
329            num_records=len(user_artist_matrix)
330        )
331
332        # store the resulting matrix
333        user_artist_table_config.save_table(user_artist_matrix, self.dataset_dir)
334
335        return DatasetMatrixConfig(
336            user_artist_table_config,
337            RatingMatrixConfig(
338                user_artist_matrix['matrix_count'].min(),
339                user_artist_matrix['matrix_count'].max(),
340                DATASET_RATINGS_IMPLICIT
341            ),
342            user_index_config,
343            artist_index_config
344        )
345
346    def process_user_additional_table(self) -> Optional[DatasetTableConfig]:
347        """Process the user additional table.
348
349        Returns:
350            the user additional table configuration or None on failure.
351        """
352        columns = [
353            'user_novelty artist avg month',
354            'user_novelty artist avg 6months',
355            'user_novelty artist avg year',
356            'user_mainstreaminess avg month',
357            'user_mainstreaminess avg 6months',
358            'user_mainstreaminess avg year',
359            'user_mainstreaminess global',
360            'user_count LEs',
361            'user_count distinct tracks',
362            'user_count distinct artists',
363            'user_count LEs per week'
364        ]
365
366        for i in range(1, 8):
367            columns += ['user_relative LE per weekday' + str(i)]
368        for i in range(0, 24):
369            columns += ['user_relative LE per hour' + str(i)]
370
371        user_additional_table_config = create_dataset_table_config(
372            'LFM-1b_users_additional.txt',
373            ['user_id'],
374            columns,
375            header=True
376        )
377
378        try:
379            num_records = len(user_additional_table_config.read_table(self.dataset_dir))
380            user_additional_table_config.num_records = num_records
381            return user_additional_table_config
382        except FileNotFoundError:
383            return None
384
385    def process_user_genre_allmusic_no_pc(self) -> Optional[DatasetTableConfig]:
386        """Process the user allmusic genre table.
387
388        Returns:
389            the user allmusic genre table configuration or None on failure.
390        """
391        columns = []
392        for genre_name in ALL_MUSIC_GENRES:
393            columns += ['noPC_' + genre_name]
394
395        user_genre_allmusic_no_pc_config = create_dataset_table_config(
396            'LFM-1b_UGP_noPC_allmusic.txt',
397            ['user_id'],
398            columns,
399            header=True
400        )
401        try:
402            num_records = len(user_genre_allmusic_no_pc_config.read_table(self.dataset_dir))
403            user_genre_allmusic_no_pc_config.num_records = num_records
404            return user_genre_allmusic_no_pc_config
405        except FileNotFoundError:
406            return None
407
408    def process_user_genre_allmusic_weighted_pc(self) -> Optional[DatasetTableConfig]:
409        """Process the user allmusic genre table with weighted play count.
410
411        Returns:
412            the user allmusic genre table configuration or None on failure.
413        """
414        columns = []
415        for genre_name in ALL_MUSIC_GENRES:
416            columns += ['weightedPC_' + genre_name]
417
418        user_genre_allmusic_weighted_pc_config = create_dataset_table_config(
419            'LFM-1b_UGP_noPC_allmusic.txt',
420            ['user_id'],
421            columns,
422            header=True
423        )
424        try:
425            num_records = len(user_genre_allmusic_weighted_pc_config.read_table(self.dataset_dir))
426            user_genre_allmusic_weighted_pc_config.num_records = num_records
427            return user_genre_allmusic_weighted_pc_config
428        except FileNotFoundError:
429            return None

DatasetProcessor for the LastFM-1B dataset.

The dataset and UGP (user genre profile) can be downloaded from the website below. http://www.cp.jku.at/datasets/LFM-1b/

The enriched artist gender information can be retrieved from: https://zenodo.org/record/3748787#.YowEBqhByUk

The processor handles the following files:

LFM-1b_albums.txt (optional) LFM-1b_artist_genres_allmusic.txt (optional) LFM-1b_artists.txt (optional) LFM-1b_LEs.mat (required) LFM-1b_LEs.txt (required) LFM-1b_tracks.txt (optional) LFM-1b_UGP_noPC_allmusic.txt (optional) LFM-1b_UGP_weightedPC_allmusic.txt (optional) LFM-1b_users.txt (optional) LFM-1b_users_additional.txt (optional) lfm-gender.json (optional)

def create_listening_events_config( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
def create_listening_events_config(self) -> Optional[DatasetTableConfig]:
    """Create the listening event table configuration.

    Returns:
        the configuration of the listening event table.
    """
    # compound key identifying a single listening event; timestamp is the payload
    event_keys = ['user_id', 'artist_id', 'album_id', 'track_id']
    return create_dataset_table_config('LFM-1b_LEs.txt', event_keys, ['timestamp'])

Create the listening event table configuration.

Returns: the configuration of the listening event table.

def create_user_table_config(self) -> src.fairreckitlib.data.set.dataset_config.DatasetTableConfig:
def create_user_table_config(self) -> DatasetTableConfig:
    """Create the user table configuration.

    Returns:
        the configuration of the user table.
    """
    user_columns = [
        'user_country',
        'user_age',
        'user_gender',
        'user_plays',
        'user_registered',
    ]
    return create_dataset_table_config(
        'LFM-1b_users.txt',
        ['user_id'],
        user_columns,
        header=True
    )

Create the user table configuration.

Returns: the configuration of the user table.

def get_matrix_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]]]]:
def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
    """Get matrix configuration processors.

    Returns:
        a list containing the user-artist-count matrix processor.
    """
    # the LFM-1b processor provides a single matrix: user x artist play counts
    matrix_processors = [('user-artist-count', self.process_user_artist_matrix)]
    return matrix_processors

Get matrix configuration processors.

Returns: a list containing the user-artist-count matrix processor.

def get_table_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]]]]:
 91    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 92        """Get table configuration processors.
 93
 94        Returns:
 95            a list containing the album, allmusic genre, artist, track and user table processors.
 96        """
 97        return DatasetProcessorLFM.get_table_configs(self) + [
 98            ('album', self.process_album_table),
 99            ('allmusic genre', self.process_genres_allmusic),
100            ('artist', self.process_artist_table),
101            ('track', self.process_track_table),
102            ('user additional', self.process_user_additional_table),
103            ('user allmusic noPC', self.process_user_genre_allmusic_no_pc),
104            ('user allmusic weightedPC', self.process_user_genre_allmusic_weighted_pc),
105        ]

Get table configuration processors.

Returns: a list containing the album, allmusic genre, artist, track and user table processors.

def load_artist_gender_json(self) -> Optional[pandas.core.frame.DataFrame]:
def load_artist_gender_json(self) -> Optional[pd.DataFrame]:
    """Load the artist gender json file.

    Returns:
        the loaded artist id/gender table or None on failure.
    """
    gender_path = os.path.join(self.dataset_dir, 'lfm-gender.json')
    try:
        gender_frame = pd.read_json(gender_path, orient='index')
    except FileNotFoundError:
        # optional enrichment file: signal absence to the caller
        return None

    # the json index carries the artist ids and column 0 the gender labels;
    # surface both under their table column names
    gender_frame = gender_frame.reset_index()
    return gender_frame.rename(columns={'index': 'artist_id', 0: 'artist_gender'})

Load the artist gender json file.

Returns: the loaded artist id/gender table or None on failure.

def load_artist_genres_allmusic(self) -> Optional[pandas.core.frame.DataFrame]:
def load_artist_genres_allmusic(self) -> Optional[pd.DataFrame]:
    """Load the artist allmusic genres file.

    Returns:
        the loaded artist name/genre table or None on failure.
    """
    genres_path = os.path.join(self.dataset_dir, 'LFM-1b_artist_genres_allmusic.txt')
    # each row: artist name followed by up to len(ALL_MUSIC_GENRES) genre ids
    id_columns = [str(i) for i in range(len(ALL_MUSIC_GENRES))]
    try:
        raw_genres = pd.read_csv(
            genres_path,
            sep='\t',
            names=['artist_name'] + id_columns
        )
    except FileNotFoundError:
        return None

    # keep only the first occurrence of each artist name
    raw_genres = raw_genres.drop_duplicates(subset='artist_name')

    # split the frame into the name column and the genre id columns
    artist_genres = pd.DataFrame(raw_genres['artist_name'])
    genre_ids = raw_genres.drop(columns='artist_name')

    # translate allmusic genre ids to genre names, leaving NaN cells untouched
    for column in genre_ids:
        genre_ids[column] = genre_ids[column].map(
            lambda genre_id: ALL_MUSIC_GENRES[int(genre_id)], na_action='ignore')

    # collapse each row's genre names into a single '|'-separated string
    artist_genres['artist_genres'] = genre_ids.apply(
        lambda row: row.str.cat(sep='|'), axis=1)

    return artist_genres

Load the artist allmusic genres file.

Returns: the loaded artist name/genre table or None on failure.

def process_album_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
def process_album_table(self) -> Optional[DatasetTableConfig]:
    """Process the album table.

    Returns:
        the album table configuration or None on failure.
    """
    album_config = create_dataset_table_config(
        'LFM-1b_albums.txt',
        ['album_id'],
        ['album_name'],
        foreign_keys=['artist_id']
    )
    try:
        # only the record count is needed; the table itself is not re-saved
        album_config.num_records = len(album_config.read_table(self.dataset_dir))
    except FileNotFoundError:
        # optional file: absence is not an error for the caller
        return None
    return album_config

Process the album table.

Returns: the album table configuration or None on failure.

def process_artist_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
def process_artist_table(self) -> Optional[DatasetTableConfig]:
    """Process the artist table.

    Extends the table with artist gender and genres information when available.

    Returns:
        the artist table configuration or None on failure.
    """
    artist_config = create_dataset_table_config(
        'LFM-1b_artists.txt',
        ['artist_id'],
        ['artist_name']
    )

    try:
        artists = artist_config.read_table(self.dataset_dir)
    except FileNotFoundError:
        # no artist file: continue with an empty frame and drop the name column
        artists = pd.DataFrame()
        artist_config.columns.pop()

    # merge in the artist gender enrichment when available
    genders = self.load_artist_gender_json()
    if genders is not None:
        if len(artists) == 0:
            # the artist file was missing: the gender table becomes the base
            artists = genders
        else:
            artists = pd.merge(artists, genders, how='left', on='artist_id')
        artist_config.columns += ['artist_gender']

    # both sources failed: nothing to store
    if len(artists) == 0:
        return None

    if 'artist_name' in artist_config.columns:
        # genres join on artist name, so only possible when names are present
        genres = self.load_artist_genres_allmusic()
        if genres is not None:
            artists = pd.merge(artists, genres, how='left', on='artist_name')
            artist_config.columns += ['artist_genres']

    # redirect the config to the generated compressed table file
    artist_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_artists.tsv.bz2'
    artist_config.file.options.compression = 'bz2'
    artist_config.num_records = len(artists)

    # store generated artist table
    artist_config.save_table(artists, self.dataset_dir)

    return artist_config

Process the artist table.

Extends the table with artist gender and genres information when available.

Returns: the artist table configuration or None on failure.

def process_genres_allmusic( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
def process_genres_allmusic(self) -> Optional[DatasetTableConfig]:
    """Process the allmusic genres table.

    Returns:
        the allmusic genres table configuration or None on failure.
    """
    # the raw file has no key column: the row number acts as the primary key
    genres_config = create_dataset_table_config(
        'genres_allmusic.txt',
        [],
        ['allmusic_genre']
    )
    try:
        genres = genres_config.read_table(self.dataset_dir)
    except FileNotFoundError:
        return None

    # materialize the row number as a regular column
    genres = genres.reset_index()
    # NOTE(review): this renames a column literally named 0, presumably produced
    # by read_table with an empty primary key — confirm 'index' (from
    # reset_index) is not the intended rename target.
    genres = genres.rename(columns={0: 'allmusic_id'})

    genres_config.primary_key = ['allmusic_id']
    # redirect the config to the generated compressed table file
    genres_config.file.name = \
        TABLE_FILE_PREFIX + self.dataset_name + '_genres_allmusic.tsv.bz2'
    genres_config.file.options.compression = 'bz2'
    genres_config.num_records = len(genres)

    # store generated allmusic genre table
    genres_config.save_table(genres, self.dataset_dir)

    return genres_config

Process the allmusic genres table.

Returns: the allmusic genres table configuration or None on failure.

def process_track_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
def process_track_table(self) -> Optional[DatasetTableConfig]:
    """Process the track table.

    Returns:
        the track table configuration or None on failure.
    """
    track_config = create_dataset_table_config(
        'LFM-1b_tracks.txt',
        ['track_id'],
        ['track_name'],
        foreign_keys=['artist_id']
    )
    try:
        # only the record count is needed; the table itself is not re-saved
        track_config.num_records = len(track_config.read_table(self.dataset_dir))
    except FileNotFoundError:
        # optional file: absence is not an error for the caller
        return None
    return track_config

Process the track table.

Returns: the track table configuration or None on failure.

def process_user_artist_matrix( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]:
def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]:
    """Process the user-artist-count matrix.

    The user-item matrix is stored in a matlab file in CSR compatible format,
    together with the user and item indices. The matrix is converted
    to a dataframe and the indices for the indirection arrays are flattened.

    Returns:
        the matrix configuration or None on failure.
    """
    mat_path = os.path.join(self.dataset_dir, 'LFM-1b_LEs.mat')
    try:
        # load matrix as described in the paper
        csr_matrix, idx_users, idx_artists = _load_lfm_1b_mat(mat_path)
    except FileNotFoundError:
        return None

    matrix_name = 'user-artist-count'
    file_prefix = TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name

    # flatten and save the user indirection array
    user_ids = [entry[0] for entry in idx_users]
    user_index_config = DatasetIndexConfig(
        file_prefix + '_user_indices.hdf5',
        'user_id',
        len(user_ids)
    )
    user_index_config.save_indices(self.dataset_dir, user_ids)

    # flatten and save the artist indirection array
    artist_ids = [entry[0] for entry in idx_artists]
    artist_index_config = DatasetIndexConfig(
        file_prefix + '_item_indices.hdf5',
        'artist_id',
        len(artist_ids)
    )
    artist_index_config.save_indices(self.dataset_dir, artist_ids)

    # convert the sparse matrix to (user, artist, count) triples via COO
    coo = pd.DataFrame.sparse.from_spmatrix(csr_matrix).sparse.to_coo()
    matrix_frame = pd.DataFrame()
    matrix_frame['user_id'] = coo.row
    matrix_frame['artist_id'] = coo.col
    matrix_frame['matrix_count'] = coo.data

    # create matrix table configuration
    matrix_table_config = create_dataset_table_config(
        file_prefix + '_matrix.tsv.bz2',
        ['user_id', 'artist_id'],
        ['matrix_count'],
        compression='bz2',
        foreign_keys=['user_id', 'artist_id'],
        num_records=len(matrix_frame)
    )

    # store the resulting matrix
    matrix_table_config.save_table(matrix_frame, self.dataset_dir)

    # implicit ratings: raw play counts bounded by the observed min/max
    rating_config = RatingMatrixConfig(
        matrix_frame['matrix_count'].min(),
        matrix_frame['matrix_count'].max(),
        DATASET_RATINGS_IMPLICIT
    )
    return DatasetMatrixConfig(
        matrix_table_config,
        rating_config,
        user_index_config,
        artist_index_config
    )

Process the user-artist-count matrix.

The user-item matrix is stored in a matlab file in CSR compatible format, together with the user and item indices. The matrix is converted to a dataframe and the indices for the indirection arrays are flattened.

Returns: the matrix configuration or None on failure.

def process_user_additional_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
def process_user_additional_table(self) -> Optional[DatasetTableConfig]:
    """Process the user additional table.

    Returns:
        the user additional table configuration or None on failure.
    """
    columns = [
        'user_novelty artist avg month',
        'user_novelty artist avg 6months',
        'user_novelty artist avg year',
        'user_mainstreaminess avg month',
        'user_mainstreaminess avg 6months',
        'user_mainstreaminess avg year',
        'user_mainstreaminess global',
        'user_count LEs',
        'user_count distinct tracks',
        'user_count distinct artists',
        'user_count LEs per week'
    ]

    # relative listening events per weekday (1-7) and per hour of day (0-23)
    columns += ['user_relative LE per weekday' + str(day) for day in range(1, 8)]
    columns += ['user_relative LE per hour' + str(hour) for hour in range(24)]

    additional_config = create_dataset_table_config(
        'LFM-1b_users_additional.txt',
        ['user_id'],
        columns,
        header=True
    )

    try:
        additional_config.num_records = \
            len(additional_config.read_table(self.dataset_dir))
    except FileNotFoundError:
        # optional file: absence is not an error for the caller
        return None
    return additional_config

Process the user additional table.

Returns: the user additional table configuration or None on failure.

def process_user_genre_allmusic_no_pc( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
def process_user_genre_allmusic_no_pc(self) -> Optional[DatasetTableConfig]:
    """Process the user allmusic genre table.

    Returns:
        the user allmusic genre table configuration or None on failure.
    """
    # one unweighted (noPC) column per allmusic genre
    columns = ['noPC_' + genre_name for genre_name in ALL_MUSIC_GENRES]

    no_pc_config = create_dataset_table_config(
        'LFM-1b_UGP_noPC_allmusic.txt',
        ['user_id'],
        columns,
        header=True
    )
    try:
        no_pc_config.num_records = len(no_pc_config.read_table(self.dataset_dir))
        return no_pc_config
    except FileNotFoundError:
        # optional file: absence is not an error for the caller
        return None

Process the user allmusic genre table.

Returns: the user allmusic genre table configuration or None on failure.

def process_user_genre_allmusic_weighted_pc( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
def process_user_genre_allmusic_weighted_pc(self) -> Optional[DatasetTableConfig]:
    """Process the user allmusic genre table with weighted play count.

    Returns:
        the user allmusic genre table configuration or None on failure.
    """
    # one weighted-play-count column per allmusic genre
    columns = ['weightedPC_' + genre_name for genre_name in ALL_MUSIC_GENRES]

    # BUG FIX: this method previously read 'LFM-1b_UGP_noPC_allmusic.txt'
    # (a copy-paste from the noPC processor); the weighted play counts are
    # stored in 'LFM-1b_UGP_weightedPC_allmusic.txt' per the dataset's files.
    table_config = create_dataset_table_config(
        'LFM-1b_UGP_weightedPC_allmusic.txt',
        ['user_id'],
        columns,
        header=True
    )
    try:
        table_config.num_records = len(table_config.read_table(self.dataset_dir))
        return table_config
    except FileNotFoundError:
        # optional file: absence is not an error for the caller
        return None

Process the user allmusic genre table with weighted play count.

Returns: the user allmusic genre table configuration or None on failure.