src.fairreckitlib.data.set.processor.dataset_processor_ml100k

This module contains the class to process the MovieLens-100K dataset.

Classes:

DatasetProcessorML100K: data processor implementation for the ML-100K dataset.

This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)

  1"""This modules contains the class to process the MovieLens-100K dataset.
  2
  3Classes:
  4
  5    DatasetProcessorML100K: data processor implementation for the ML-100K dataset.
  6
  7This program has been developed by students from the bachelor Computer Science at
  8Utrecht University within the Software Project course.
  9© Copyright Utrecht University (Department of Information and Computing Sciences)
 10"""
 11
 12from typing import Callable, List, Optional, Tuple
 13
 14import numpy as np
 15
 16from ..dataset_config import DatasetTableConfig
 17from ..dataset_config import create_dataset_table_config
 18from ..dataset_constants import TABLE_FILE_PREFIX
 19from .dataset_processor_ml import DatasetProcessorML
 20
 21MOVIE_GENRES = [
 22    'Unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary',
 23    'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
 24    'Thriller', 'War', 'Western'
 25]
 26
 27
 28class DatasetProcessorML100K(DatasetProcessorML):
 29    """DatasetProcessor for the MovieLens-100K dataset.
 30
 31    The dataset can be downloaded from the link below.
 32    https://files.grouplens.org/datasets/movielens/ml-100k.zip
 33
 34    The processor handles the following files:
 35
 36    u.data (required)
 37    u.user (optional)
 38    u.item (optional)
 39    """
 40
 41    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
 42        """Create the user-movie matrix configuration.
 43
 44        Returns:
 45            the table configuration of the ML-100K matrix.
 46        """
 47        return create_dataset_table_config(
 48            'u.data',
 49            ['user_id', 'movie_id'],
 50            ['matrix_rating', 'matrix_timestamp'],
 51            foreign_keys=['user_id', 'movie_id']
 52        )
 53
 54    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 55        """Get table configuration processors.
 56
 57        Returns:
 58            a list containing the user and movie table processors.
 59        """
 60        return [
 61            ('movie', self.process_movie_table),
 62            ('user', self.process_user_table)
 63        ]
 64
 65    def process_movie_table(self) -> Optional[DatasetTableConfig]:
 66        """Process the movie table.
 67
 68        Removes an empty release date column that is included in the movie title.
 69        Simplifies the binary genre columns by concatenating the names using pipes.
 70
 71        Returns:
 72            the movie table configuration or None on failure.
 73        """
 74        movie_columns = [
 75            'movie_title',
 76            'movie_release date',
 77            'empty', # this column does not contain any data
 78            'movie_imdb url'
 79        ]
 80
 81        # create original table definition
 82        movie_table_config = create_dataset_table_config(
 83            'u.item',
 84            ['movie_id'],
 85            movie_columns + MOVIE_GENRES,
 86            sep='|',
 87            encoding='ISO-8859-1'
 88        )
 89
 90        try:
 91            # read the original table without binary genres
 92            movie_table = movie_table_config.read_table(
 93                self.dataset_dir,
 94                columns=movie_table_config.primary_key + movie_columns
 95            )
 96            # read the binary genres table
 97            genres_table = movie_table_config.read_table(
 98                self.dataset_dir,
 99                columns=MOVIE_GENRES
100            )
101        except FileNotFoundError:
102            return None
103
104        # drop and remove the empty column
105        movie_columns.remove('empty')
106        movie_table.drop('empty', axis=1, inplace=True)
107        movie_table = movie_table[movie_table_config.primary_key + movie_columns]
108
109        # replace 0 with NaN and 1 with the corresponding genre
110        for column_name in genres_table:
111            genres_table[column_name].replace({1:column_name, 0: np.nan}, inplace=True)
112
113        # collapse genres into one column and add it to the original table
114        genre_column = 'movie_genres'
115        movie_table[genre_column] = genres_table.apply(lambda x: x.str.cat(sep='|'), axis=1)
116
117        # update movie table definition
118        movie_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_movies.tsv.bz2'
119        movie_table_config.file.options.compression = 'bz2'
120        movie_table_config.file.options.sep = None
121        movie_table_config.columns = movie_columns + [genre_column]
122        movie_table_config.num_records = len(movie_table)
123
124        # store the generated movie table
125        movie_table_config.save_table(movie_table, self.dataset_dir)
126
127        return movie_table_config
128
129    def process_user_table(self) -> Optional[DatasetTableConfig]:
130        """Process the user table.
131
132        Changes the contents of the gender and occupation columns to be more user-friendly.
133
134        Returns:
135            the user table configuration or None on failure.
136        """
137        user_table_config = create_dataset_table_config(
138            'u.user',
139            ['user_id'],
140            ['user_age', 'user_gender', 'user_occupation', 'user_zip code'],
141            sep='|'
142        )
143
144        try:
145            user_table = user_table_config.read_table(self.dataset_dir)
146            user_table_config.num_records=len(user_table)
147        except FileNotFoundError:
148            return None
149
150        # convert gender and occupation to more user-friendly names
151        user_table['user_gender'].replace({'M': 'Male', 'F': 'Female'}, inplace=True)
152        user_table['user_occupation'] = user_table['user_occupation'].str.capitalize()
153
154        # update user table configuration
155        user_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_users.tsv.bz2'
156        user_table_config.file.options.compression = 'bz2'
157        user_table_config.file.options.sep = None
158
159        # store the generated user table
160        user_table_config.save_table(user_table, self.dataset_dir)
161
162        return user_table_config
 29class DatasetProcessorML100K(DatasetProcessorML):
 30    """DatasetProcessor for the MovieLens-100K dataset.
 31
 32    The dataset can be downloaded from the link below.
 33    https://files.grouplens.org/datasets/movielens/ml-100k.zip
 34
 35    The processor handles the following files:
 36
 37    u.data (required)
 38    u.user (optional)
 39    u.item (optional)
 40    """
 41
 42    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
 43        """Create the user-movie matrix configuration.
 44
 45        Returns:
 46            the table configuration of the ML-100K matrix.
 47        """
 48        return create_dataset_table_config(
 49            'u.data',
 50            ['user_id', 'movie_id'],
 51            ['matrix_rating', 'matrix_timestamp'],
 52            foreign_keys=['user_id', 'movie_id']
 53        )
 54
 55    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 56        """Get table configuration processors.
 57
 58        Returns:
 59            a list containing the user and movie table processors.
 60        """
 61        return [
 62            ('movie', self.process_movie_table),
 63            ('user', self.process_user_table)
 64        ]
 65
 66    def process_movie_table(self) -> Optional[DatasetTableConfig]:
 67        """Process the movie table.
 68
 69        Removes an empty release date column that is included in the movie title.
 70        Simplifies the binary genre columns by concatenating the names using pipes.
 71
 72        Returns:
 73            the movie table configuration or None on failure.
 74        """
 75        movie_columns = [
 76            'movie_title',
 77            'movie_release date',
 78            'empty', # this column does not contain any data
 79            'movie_imdb url'
 80        ]
 81
 82        # create original table definition
 83        movie_table_config = create_dataset_table_config(
 84            'u.item',
 85            ['movie_id'],
 86            movie_columns + MOVIE_GENRES,
 87            sep='|',
 88            encoding='ISO-8859-1'
 89        )
 90
 91        try:
 92            # read the original table without binary genres
 93            movie_table = movie_table_config.read_table(
 94                self.dataset_dir,
 95                columns=movie_table_config.primary_key + movie_columns
 96            )
 97            # read the binary genres table
 98            genres_table = movie_table_config.read_table(
 99                self.dataset_dir,
100                columns=MOVIE_GENRES
101            )
102        except FileNotFoundError:
103            return None
104
105        # drop and remove the empty column
106        movie_columns.remove('empty')
107        movie_table.drop('empty', axis=1, inplace=True)
108        movie_table = movie_table[movie_table_config.primary_key + movie_columns]
109
110        # replace 0 with NaN and 1 with the corresponding genre
111        for column_name in genres_table:
112            genres_table[column_name].replace({1:column_name, 0: np.nan}, inplace=True)
113
114        # collapse genres into one column and add it to the original table
115        genre_column = 'movie_genres'
116        movie_table[genre_column] = genres_table.apply(lambda x: x.str.cat(sep='|'), axis=1)
117
118        # update movie table definition
119        movie_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_movies.tsv.bz2'
120        movie_table_config.file.options.compression = 'bz2'
121        movie_table_config.file.options.sep = None
122        movie_table_config.columns = movie_columns + [genre_column]
123        movie_table_config.num_records = len(movie_table)
124
125        # store the generated movie table
126        movie_table_config.save_table(movie_table, self.dataset_dir)
127
128        return movie_table_config
129
130    def process_user_table(self) -> Optional[DatasetTableConfig]:
131        """Process the user table.
132
133        Changes the contents of the gender and occupation columns to be more user-friendly.
134
135        Returns:
136            the user table configuration or None on failure.
137        """
138        user_table_config = create_dataset_table_config(
139            'u.user',
140            ['user_id'],
141            ['user_age', 'user_gender', 'user_occupation', 'user_zip code'],
142            sep='|'
143        )
144
145        try:
146            user_table = user_table_config.read_table(self.dataset_dir)
147            user_table_config.num_records=len(user_table)
148        except FileNotFoundError:
149            return None
150
151        # convert gender and occupation to more user-friendly names
152        user_table['user_gender'].replace({'M': 'Male', 'F': 'Female'}, inplace=True)
153        user_table['user_occupation'] = user_table['user_occupation'].str.capitalize()
154
155        # update user table configuration
156        user_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_users.tsv.bz2'
157        user_table_config.file.options.compression = 'bz2'
158        user_table_config.file.options.sep = None
159
160        # store the generated user table
161        user_table_config.save_table(user_table, self.dataset_dir)
162
163        return user_table_config

DatasetProcessor for the MovieLens-100K dataset.

The dataset can be downloaded from the link below. https://files.grouplens.org/datasets/movielens/ml-100k.zip

The processor handles the following files:

u.data (required), u.user (optional), u.item (optional)

def create_user_movie_matrix_config(self) -> src.fairreckitlib.data.set.dataset_config.DatasetTableConfig:
42    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
43        """Create the user-movie matrix configuration.
44
45        Returns:
46            the table configuration of the ML-100K matrix.
47        """
48        return create_dataset_table_config(
49            'u.data',
50            ['user_id', 'movie_id'],
51            ['matrix_rating', 'matrix_timestamp'],
52            foreign_keys=['user_id', 'movie_id']
53        )

Create the user-movie matrix configuration.

Returns: the table configuration of the ML-100K matrix.

def get_table_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]]]]:
55    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
56        """Get table configuration processors.
57
58        Returns:
59            a list containing the user and movie table processors.
60        """
61        return [
62            ('movie', self.process_movie_table),
63            ('user', self.process_user_table)
64        ]

Get table configuration processors.

Returns: a list containing the user and movie table processors.

def process_movie_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
 66    def process_movie_table(self) -> Optional[DatasetTableConfig]:
 67        """Process the movie table.
 68
 69        Removes an empty release date column that is included in the movie title.
 70        Simplifies the binary genre columns by concatenating the names using pipes.
 71
 72        Returns:
 73            the movie table configuration or None on failure.
 74        """
 75        movie_columns = [
 76            'movie_title',
 77            'movie_release date',
 78            'empty', # this column does not contain any data
 79            'movie_imdb url'
 80        ]
 81
 82        # create original table definition
 83        movie_table_config = create_dataset_table_config(
 84            'u.item',
 85            ['movie_id'],
 86            movie_columns + MOVIE_GENRES,
 87            sep='|',
 88            encoding='ISO-8859-1'
 89        )
 90
 91        try:
 92            # read the original table without binary genres
 93            movie_table = movie_table_config.read_table(
 94                self.dataset_dir,
 95                columns=movie_table_config.primary_key + movie_columns
 96            )
 97            # read the binary genres table
 98            genres_table = movie_table_config.read_table(
 99                self.dataset_dir,
100                columns=MOVIE_GENRES
101            )
102        except FileNotFoundError:
103            return None
104
105        # drop and remove the empty column
106        movie_columns.remove('empty')
107        movie_table.drop('empty', axis=1, inplace=True)
108        movie_table = movie_table[movie_table_config.primary_key + movie_columns]
109
110        # replace 0 with NaN and 1 with the corresponding genre
111        for column_name in genres_table:
112            genres_table[column_name].replace({1:column_name, 0: np.nan}, inplace=True)
113
114        # collapse genres into one column and add it to the original table
115        genre_column = 'movie_genres'
116        movie_table[genre_column] = genres_table.apply(lambda x: x.str.cat(sep='|'), axis=1)
117
118        # update movie table definition
119        movie_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_movies.tsv.bz2'
120        movie_table_config.file.options.compression = 'bz2'
121        movie_table_config.file.options.sep = None
122        movie_table_config.columns = movie_columns + [genre_column]
123        movie_table_config.num_records = len(movie_table)
124
125        # store the generated movie table
126        movie_table_config.save_table(movie_table, self.dataset_dir)
127
128        return movie_table_config

Process the movie table.

Removes an empty release date column that is included in the movie title. Simplifies the binary genre columns by concatenating the names using pipes.

Returns: the movie table configuration or None on failure.

def process_user_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
130    def process_user_table(self) -> Optional[DatasetTableConfig]:
131        """Process the user table.
132
133        Changes the contents of the gender and occupation columns to be more user-friendly.
134
135        Returns:
136            the user table configuration or None on failure.
137        """
138        user_table_config = create_dataset_table_config(
139            'u.user',
140            ['user_id'],
141            ['user_age', 'user_gender', 'user_occupation', 'user_zip code'],
142            sep='|'
143        )
144
145        try:
146            user_table = user_table_config.read_table(self.dataset_dir)
147            user_table_config.num_records=len(user_table)
148        except FileNotFoundError:
149            return None
150
151        # convert gender and occupation to more user-friendly names
152        user_table['user_gender'].replace({'M': 'Male', 'F': 'Female'}, inplace=True)
153        user_table['user_occupation'] = user_table['user_occupation'].str.capitalize()
154
155        # update user table configuration
156        user_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_users.tsv.bz2'
157        user_table_config.file.options.compression = 'bz2'
158        user_table_config.file.options.sep = None
159
160        # store the generated user table
161        user_table_config.save_table(user_table, self.dataset_dir)
162
163        return user_table_config

Process the user table.

Changes the contents of the gender and occupation columns to be more user-friendly.

Returns: the user table configuration or None on failure.