src.fairreckitlib.data.set.processor.dataset_processor

def create_listening_events_config( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

58    def create_listening_events_config(self) -> Optional[DatasetTableConfig]:
59        """Create the listening event table configuration.
60
61        Returns:
62            the configuration of the listening event table.
63        """
64        return create_dataset_table_config(
65            'LFM-1b_LEs.txt',
66            ['user_id', 'artist_id', 'album_id', 'track_id'],
67            ['timestamp']
68        )

Create the listening event table configuration.

Returns: the configuration of the listening event table.

def create_user_table_config(self) -> src.fairreckitlib.data.set.dataset_config.DatasetTableConfig: View Source

70    def create_user_table_config(self) -> DatasetTableConfig:
71        """Create the user table configuration.
72
73        Returns:
74            the configuration of the user table.
75        """
76        return create_dataset_table_config(
77            'LFM-1b_users.txt',
78            ['user_id'],
79            ['user_country', 'user_age', 'user_gender', 'user_plays', 'user_registered'],
80            header=True
81        )

Create the user table configuration.

Returns: the configuration of the user table.

def get_matrix_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]]]]: View Source

83    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
84        """Get matrix configuration processors.
85
86        Returns:
87            a list containing the user-artist-count matrix processor.
88        """
89        return [('user-artist-count', self.process_user_artist_matrix)]

Get matrix configuration processors.

Returns: a list containing the user-artist-count matrix processor.

def get_table_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]]]]: View Source

 91    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 92        """Get table configuration processors.
 93
 94        Returns:
 95            a list containing the album, allmusic genre, artist, track and user table processors.
 96        """
 97        return DatasetProcessorLFM.get_table_configs(self) + [
 98            ('album', self.process_album_table),
 99            ('allmusic genre', self.process_genres_allmusic),
100            ('artist', self.process_artist_table),
101            ('track', self.process_track_table),
102            ('user additional', self.process_user_additional_table),
103            ('user allmusic noPC', self.process_user_genre_allmusic_no_pc),
104            ('user allmusic weightedPC', self.process_user_genre_allmusic_weighted_pc),
105        ]

Get table configuration processors.

Returns: a list containing the album, allmusic genre, artist, track and user table processors.

def load_artist_gender_json(self) -> Optional[pandas.core.frame.DataFrame]: View Source

107    def load_artist_gender_json(self) -> Optional[pd.DataFrame]:
108        """Load the artist gender json file.
109
110        Returns:
111            the loaded artist id/gender table or None on failure.
112        """
113        try:
114            gender_table = pd.read_json(
115                os.path.join(self.dataset_dir, 'lfm-gender.json'),
116                orient='index'
117            )
118            gender_table.reset_index(inplace=True)
119            gender_table.rename(columns={'index': 'artist_id', 0: 'artist_gender'}, inplace=True)
120            return gender_table
121        except FileNotFoundError:
122            return None

Load the artist gender json file.

Returns: the loaded artist id/gender table or None on failure.

def load_artist_genres_allmusic(self) -> Optional[pandas.core.frame.DataFrame]: View Source

124    def load_artist_genres_allmusic(self) -> Optional[pd.DataFrame]:
125        """Load the artist allmusic genres file.
126
127        Returns:
128            the loaded artist name/genre table or None on failure.
129        """
130        try:
131            genres = pd.read_csv(
132                os.path.join(self.dataset_dir, 'LFM-1b_artist_genres_allmusic.txt'),
133                sep='\t',
134                names=['artist_name'] + [str(i) for i in range(0, len(ALL_MUSIC_GENRES))]
135            )
136        except FileNotFoundError:
137            return None
138
139        # remove duplicate rows where artist name is the same
140        genres.drop_duplicates(subset='artist_name', inplace=True)
141        # extract and drop artist name column
142        artist_genres = pd.DataFrame(genres['artist_name'])
143        genres.drop('artist_name', inplace=True, axis=1)
144
145        # map allmusic genre id to genre name
146        for col in genres:
147            genres[col] = genres[col].map(lambda i: ALL_MUSIC_GENRES[int(i)], na_action='ignore')
148
149        # add genres column
150        artist_genres['artist_genres'] = genres.apply(lambda x: x.str.cat(sep='|'), axis=1)
151
152        return artist_genres

Load the artist allmusic genres file.

Returns: the loaded artist name/genre table or None on failure.

def process_album_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

154    def process_album_table(self) -> Optional[DatasetTableConfig]:
155        """Process the album table.
156
157        Returns:
158            the album table configuration or None on failure.
159        """
160        album_table_config = create_dataset_table_config(
161            'LFM-1b_albums.txt',
162            ['album_id'],
163            ['album_name'],
164            foreign_keys=['artist_id']
165        )
166
167        try:
168            num_records = len(album_table_config.read_table(self.dataset_dir))
169            album_table_config.num_records = num_records
170            return album_table_config
171        except FileNotFoundError:
172            return None

Process the album table.

Returns: the album table configuration or None on failure.

def process_artist_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

174    def process_artist_table(self) -> Optional[DatasetTableConfig]:
175        """Process the artist table.
176
177        Extends the table with artist gender and genres information when available.
178
179        Returns:
180            the artist table configuration or None on failure.
181        """
182        artist_table_config = create_dataset_table_config(
183            'LFM-1b_artists.txt',
184            ['artist_id'],
185            ['artist_name']
186        )
187
188        try:
189            artist_table = artist_table_config.read_table(self.dataset_dir)
190        except FileNotFoundError:
191            artist_table = pd.DataFrame()
192            artist_table_config.columns.pop()
193
194        # add artist gender when available
195        gender_table = self.load_artist_gender_json()
196        if gender_table is not None:
197            # replace artist table when missing
198            if len(artist_table) == 0:
199                artist_table = gender_table
200            # merge artist table with gender
201            else:
202                artist_table = pd.merge(artist_table, gender_table, how='left', on='artist_id')
203            artist_table_config.columns += ['artist_gender']
204
205        # no need to continue if the previous failed
206        if len(artist_table) == 0:
207            return None
208
209        if 'artist_name' in artist_table_config.columns:
210            # attempt to load artist name / genre table
211            artist_genres = self.load_artist_genres_allmusic()
212            if artist_genres is not None:
213                # merge artist table with genres
214                artist_table = pd.merge(artist_table, artist_genres, how='left', on='artist_name')
215                artist_table_config.columns += ['artist_genres']
216
217        artist_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_artists.tsv.bz2'
218        artist_table_config.file.options.compression = 'bz2'
219        artist_table_config.num_records = len(artist_table)
220
221        # store generated artist table
222        artist_table_config.save_table(artist_table, self.dataset_dir)
223
224        return artist_table_config

Process the artist table.

Extends the table with artist gender and genres information when available.

Returns: the artist table configuration or None on failure.

def process_genres_allmusic( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

226    def process_genres_allmusic(self) -> Optional[DatasetTableConfig]:
227        """Process the allmusic genres table.
228
229        Returns:
230            the allmusic genres table configuration or None on failure.
231        """
232        genres_allmusic_table_config = create_dataset_table_config(
233            'genres_allmusic.txt',
234            [], # row number is the primary key
235            ['allmusic_genre']
236        )
237        try:
238            genres_allmusic_table = genres_allmusic_table_config.read_table(self.dataset_dir)
239        except FileNotFoundError:
240            return None
241
242        # reset index and rename to primary key
243        genres_allmusic_table.reset_index(inplace=True)
244        genres_allmusic_table.rename(columns={0: 'allmusic_id'}, inplace=True)
245
246        genres_allmusic_table_config.primary_key = ['allmusic_id']
247        genres_allmusic_table_config.file.name = \
248            TABLE_FILE_PREFIX + self.dataset_name + '_genres_allmusic.tsv.bz2'
249        genres_allmusic_table_config.file.options.compression = 'bz2'
250        genres_allmusic_table_config.num_records = len(genres_allmusic_table)
251
252        # store generated allmusic genre table
253        genres_allmusic_table_config.save_table(genres_allmusic_table, self.dataset_dir)
254
255        return genres_allmusic_table_config

Process the allmusic genres table.

Returns: the allmusic genres table configuration or None on failure.

def process_track_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

258    def process_track_table(self) -> Optional[DatasetTableConfig]:
259        """Process the track table.
260
261        Returns:
262            the track table configuration or None on failure.
263        """
264        track_table_config = create_dataset_table_config(
265            'LFM-1b_tracks.txt',
266            ['track_id'],
267            ['track_name'],
268            foreign_keys=['artist_id']
269        )
270
271        try:
272            num_records = len(track_table_config.read_table(self.dataset_dir))
273            track_table_config.num_records = num_records
274            return track_table_config
275        except FileNotFoundError:
276            return None

Process the track table.

Returns: the track table configuration or None on failure.

def process_user_artist_matrix( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]: View Source

278    def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]:
279        """Process the user-artist-count matrix.
280
281        The user-item matrix is stored in a matlab file in CSR compatible format,
282        together with the user and item indices. The matrix is converted
283        to a dataframe and the indices for the indirection arrays are flattened.
284
285        Returns:
286            the matrix configuration or None on failure.
287        """
288        try:
289            mat_file = os.path.join(self.dataset_dir, 'LFM-1b_LEs.mat')
290            # load matrix as described in the paper
291            csr_matrix, idx_users, idx_artists = _load_lfm_1b_mat(mat_file)
292        except FileNotFoundError:
293            return None
294
295        matrix_name = 'user-artist-count'
296
297        # create and save user indirection array
298        user_list = list(map(lambda i: i[0], idx_users))
299        user_index_config = DatasetIndexConfig(
300            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_user_indices.hdf5',
301            'user_id',
302            len(user_list)
303        )
304        user_index_config.save_indices(self.dataset_dir, user_list)
305
306        # create and save artist indirection array
307        artist_list = list(map(lambda i: i[0], idx_artists))
308        artist_index_config = DatasetIndexConfig(
309            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_item_indices.hdf5',
310            'artist_id',
311            len(artist_list)
312        )
313        artist_index_config.save_indices(self.dataset_dir, artist_list)
314
315        # convert csr to dataframe
316        coo_matrix = pd.DataFrame.sparse.from_spmatrix(csr_matrix).sparse.to_coo()
317        user_artist_matrix = pd.DataFrame()
318        user_artist_matrix['user_id'] = coo_matrix.row
319        user_artist_matrix['artist_id'] = coo_matrix.col
320        user_artist_matrix['matrix_count'] = coo_matrix.data
321
322        # create matrix table configuration
323        user_artist_table_config = create_dataset_table_config(
324            TABLE_FILE_PREFIX + self.dataset_name + '_' + matrix_name + '_matrix.tsv.bz2',
325            ['user_id', 'artist_id'],
326            ['matrix_count'],
327            compression='bz2',
328            foreign_keys=['user_id', 'artist_id'],
329            num_records=len(user_artist_matrix)
330        )
331
332        # store the resulting matrix
333        user_artist_table_config.save_table(user_artist_matrix, self.dataset_dir)
334
335        return DatasetMatrixConfig(
336            user_artist_table_config,
337            RatingMatrixConfig(
338                user_artist_matrix['matrix_count'].min(),
339                user_artist_matrix['matrix_count'].max(),
340                DATASET_RATINGS_IMPLICIT
341            ),
342            user_index_config,
343            artist_index_config
344        )

Process the user-artist-count matrix.

The user-item matrix is stored in a matlab file in CSR compatible format, together with the user and item indices. The matrix is converted to a dataframe and the indices for the indirection arrays are flattened.

Returns: the matrix configuration or None on failure.

def process_user_additional_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

346    def process_user_additional_table(self) -> Optional[DatasetTableConfig]:
347        """Process the user additional table.
348
349        Returns:
350            the user additional table configuration or None on failure.
351        """
352        columns = [
353            'user_novelty artist avg month',
354            'user_novelty artist avg 6months',
355            'user_novelty artist avg year',
356            'user_mainstreaminess avg month',
357            'user_mainstreaminess avg 6months',
358            'user_mainstreaminess avg year',
359            'user_mainstreaminess global',
360            'user_count LEs',
361            'user_count distinct tracks',
362            'user_count distinct artists',
363            'user_count LEs per week'
364        ]
365
366        for i in range(1, 8):
367            columns += ['user_relative LE per weekday' + str(i)]
368        for i in range(0, 24):
369            columns += ['user_relative LE per hour' + str(i)]
370
371        user_additional_table_config = create_dataset_table_config(
372            'LFM-1b_users_additional.txt',
373            ['user_id'],
374            columns,
375            header=True
376        )
377
378        try:
379            num_records = len(user_additional_table_config.read_table(self.dataset_dir))
380            user_additional_table_config.num_records = num_records
381            return user_additional_table_config
382        except FileNotFoundError:
383            return None

Process the user additional table.

Returns: the user additional table configuration or None on failure.

def process_user_genre_allmusic_no_pc( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

385    def process_user_genre_allmusic_no_pc(self) -> Optional[DatasetTableConfig]:
386        """Process the user allmusic genre table.
387
388        Returns:
389            the user allmusic genre table configuration or None on failure.
390        """
391        columns = []
392        for genre_name in ALL_MUSIC_GENRES:
393            columns += ['noPC_' + genre_name]
394
395        user_genre_allmusic_no_pc_config = create_dataset_table_config(
396            'LFM-1b_UGP_noPC_allmusic.txt',
397            ['user_id'],
398            columns,
399            header=True
400        )
401        try:
402            num_records = len(user_genre_allmusic_no_pc_config.read_table(self.dataset_dir))
403            user_genre_allmusic_no_pc_config.num_records = num_records
404            return user_genre_allmusic_no_pc_config
405        except FileNotFoundError:
406            return None

Process the user allmusic genre table.

Returns: the user allmusic genre table configuration or None on failure.

def process_user_genre_allmusic_weighted_pc( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

408    def process_user_genre_allmusic_weighted_pc(self) -> Optional[DatasetTableConfig]:
409        """Process the user allmusic genre table with weighted play count.
410
411        Returns:
412            the user allmusic genre table configuration or None on failure.
413        """
414        columns = []
415        for genre_name in ALL_MUSIC_GENRES:
416            columns += ['weightedPC_' + genre_name]
417
418        user_genre_allmusic_weighted_pc_config = create_dataset_table_config(
419            'LFM-1b_UGP_noPC_allmusic.txt',
420            ['user_id'],
421            columns,
422            header=True
423        )
424        try:
425            num_records = len(user_genre_allmusic_weighted_pc_config.read_table(self.dataset_dir))
426            user_genre_allmusic_weighted_pc_config.num_records = num_records
427            return user_genre_allmusic_weighted_pc_config
428        except FileNotFoundError:
429            return None

Process the user allmusic genre table with weighted play count.

Returns: the user allmusic genre table configuration or None on failure.

src.fairreckitlib.data.set.processor.dataset_processor_lfm1b

Inherited Members