src.fairreckitlib.data.set.processor.dataset_processor

def create_listening_events_config( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

42    def create_listening_events_config(self) -> Optional[DatasetTableConfig]:
43        """Create the listening event table configuration.
44
45        Returns:
46            the configuration of the listening event table.
47        """
48        return create_dataset_table_config(
49            'listening-events.tsv.bz2',
50            ['user_id', 'track_id', 'album_id'],
51            ['timestamp'],
52            compression='bz2',
53            header=True
54        )

Create the listening event table configuration.

Returns: the configuration of the listening event table.

def create_user_table_config(self) -> src.fairreckitlib.data.set.dataset_config.DatasetTableConfig: View Source

56    def create_user_table_config(self) -> DatasetTableConfig:
57        """Create the user table configuration.
58
59        Returns:
60            the configuration of the user table.
61        """
62        return create_dataset_table_config(
63            'users.tsv.bz2',
64            ['user_id'],
65            ['user_country', 'user_age', 'user_gender', 'user_creation time'],
66            header=True,
67            compression='bz2'
68        )

Create the user table configuration.

Returns: the configuration of the user table.

def get_matrix_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]]]]: View Source

70    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
71        """Get matrix configuration processors.
72
73        Returns:
74            a list containing the user-artist-count and user-track-count matrix processors.
75        """
76        return [
77            ('user-artist-count', self.process_user_artist_matrix),
78            ('user-track-count', self.process_user_track_matrix)
79        ]

Get matrix configuration processors.

Returns: a list containing the user-artist-count and user-track-count matrix processors.

def get_table_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]]]]: View Source

81    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
82        """Get table configuration processors.
83
84        Returns:
85            a list containing the album, artist, spotify, track and user table processors.
86        """
87        return DatasetProcessorLFM.get_table_configs(self) + [
88            ('album', self.process_album_table),
89            ('artist', self.process_artist_table),
90            ('spotify', self.process_spotify_table),
91            ('track', self.process_track_table)
92        ]

Get table configuration processors.

Returns: a list containing the album, artist, spotify, track and user table processors.

def process_album_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

 94    def process_album_table(self) -> Optional[DatasetTableConfig]:
 95        r"""Process the album table.
 96
 97        The original file does not load correctly with pandas when splitting on
 98        newlines \n and \t tabs.
 99
100        Returns:
101            the album table configuration or None on failure.
102        """
103        try:
104            file_name, num_records = self.process_corrupt_table('albums')
105        except FileNotFoundError:
106            return None
107
108        return create_dataset_table_config(
109            file_name,
110            ['album_id'],
111            ['album_name', 'artist_name'],
112            compression='bz2',
113            num_records=num_records
114        )

Process the album table.

The original file does not load correctly with pandas when splitting on newlines (\n) and tabs (\t).

Returns: the album table configuration or None on failure.

def process_artist_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

116    def process_artist_table(self) -> Optional[DatasetTableConfig]:
117        """Process the artist table.
118
119        Returns:
120            the artist table configuration or None on failure.
121        """
122        artist_table_config =  create_dataset_table_config(
123            'artists.tsv.bz2',
124            ['artist_id'],
125            ['artist_name'],
126            header=True,
127            compression='bz2'
128        )
129
130        try:
131            artist_table = artist_table_config.read_table(self.dataset_dir)
132            artist_table_config.num_records = len(artist_table)
133            return artist_table_config
134        except FileNotFoundError:
135            return None

Process the artist table.

Returns: the artist table configuration or None on failure.

def process_corrupt_table(self, table_name: str) -> Tuple[str, int]: View Source

137    def process_corrupt_table(self, table_name: str) -> Tuple[str, int]:
138        """Process a corrupt table that does not load correctly with pandas.
139
140        Loading with the 'python-fwf' engine does not have issues, however the
141        row values need to be manually split.
142        """
143        table_iterator = pd.read_table(
144            os.path.join(self.dataset_dir, table_name + '.tsv.bz2'),
145            header=0,
146            encoding='utf-8',
147            engine='python-fwf',
148            names=['fwf'],
149            iterator=True,
150            chunksize=1000000
151        )
152
153        file_name = TABLE_FILE_PREFIX + self.dataset_name + '_' + table_name + '.tsv.bz2'
154        file_path = os.path.join(self.dataset_dir, file_name)
155        # remove existing file when present
156        if os.path.isfile(file_path):
157            os.remove(file_path)
158
159        num_records = 0
160        # process in chunks as splitting manually uses a lot of memory
161        for _, dataframe in enumerate(table_iterator):
162            dataframe = dataframe['fwf'].str.split('\t', expand=True)
163            dataframe.to_csv(
164                file_path,
165                mode='a',
166                sep='\t',
167                index=False,
168                header=False,
169                compression='bz2'
170            )
171            num_records += len(dataframe)
172
173        return file_name, num_records

Process a corrupt table that does not load correctly with pandas.

Loading with the 'python-fwf' engine does not have this issue; however, the row values need to be split manually.

def process_spotify_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

175    def process_spotify_table(self) -> Optional[DatasetTableConfig]:
176        """Process the spotify table.
177
178        Returns:
179            the spotify table configuration or None on failure.
180        """
181        spotify_table_config =  create_dataset_table_config(
182            'spotify-uris.tsv.bz2',
183            ['track_id'],
184            ['track_spotify-uri'],
185            header=True,
186            compression='bz2'
187        )
188
189        try:
190            spotify_table = spotify_table_config.read_table(self.dataset_dir)
191            spotify_table_config.num_records = len(spotify_table)
192            return spotify_table_config
193        except FileNotFoundError:
194            return None

Process the spotify table.

Returns: the spotify table configuration or None on failure.

def process_track_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]: View Source

196    def process_track_table(self) -> Optional[DatasetTableConfig]:
197        r"""Process the track table.
198
199        The original file does not load correctly with pandas when splitting on
200        newlines \n and \t tabs.
201
202        Returns:
203            the track table configuration or None on failure.
204        """
205        try:
206            file_name, num_records = self.process_corrupt_table('tracks')
207        except FileNotFoundError:
208            return None
209
210        return create_dataset_table_config(
211            file_name,
212            ['track_id'],
213            ['artist_name', 'track_name'],
214            compression='bz2',
215            num_records=num_records
216        )

Process the track table.

The original file does not load correctly with pandas when splitting on newlines (\n) and tabs (\t).

Returns: the track table configuration or None on failure.

def process_user_artist_matrix( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]: View Source

218    def process_user_artist_matrix(self) -> Optional[DatasetMatrixConfig]:
219        """Process the user-artist-count matrix.
220
221        Returns:
222            the matrix configuration or None on failure.
223        """
224        return self.process_matrix(create_dataset_table_config(
225            'user_artist_playcount.tsv',
226            ['user_id', 'artist_id'],
227            ['matrix_count'],
228            foreign_keys=['user_id', 'artist_id']
229        ))

Process the user-artist-count matrix.

Returns: the matrix configuration or None on failure.

def process_user_track_matrix( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]: View Source

231    def process_user_track_matrix(self) -> Optional[DatasetMatrixConfig]:
232        """Process the user-track-count matrix.
233
234        Returns:
235            the matrix configuration or None on failure.
236        """
237        return self.process_matrix(create_dataset_table_config(
238            'listening-counts.tsv.bz2',
239            ['user_id', 'track_id'],
240            ['matrix_count'],
241            foreign_keys=['user_id', 'track_id'],
242            compression='bz2',
243            header=True
244        ))

Process the user-track-count matrix.

Returns: the matrix configuration or None on failure.

src.fairreckitlib.data.set.processor.dataset_processor_lfm2b

Inherited Members