src.fairreckitlib.data.set.processor.dataset_processor_ml25m

This module contains the class to process the MovieLens-25M dataset.

Classes:

DatasetProcessorML25M: data processor implementation for the ML-25M dataset.

This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)

  1"""This module contains the class to process the MovieLens-25M dataset.
  2
  3Classes:
  4
  5    DatasetProcessorML25M: data processor implementation for the ML-25M dataset.
  6
  7This program has been developed by students from the bachelor Computer Science at
  8Utrecht University within the Software Project course.
  9© Copyright Utrecht University (Department of Information and Computing Sciences)
 10"""
 11
 12from typing import Callable, List, Optional, Tuple
 13
 14import pandas as pd
 15
 16from ..dataset_config import DatasetTableConfig
 17from ..dataset_config import create_dataset_table_config
 18from ..dataset_constants import TABLE_FILE_PREFIX
 19from .dataset_processor_ml import DatasetProcessorML
 20
 21
 22class DatasetProcessorML25M(DatasetProcessorML):
 23    """DatasetProcessor for the MovieLens-25M dataset.
 24
 25    The dataset can be downloaded from the link below.
 26    https://files.grouplens.org/datasets/movielens/ml-25m.zip
 27
 28    The processor handles the following files:
 29
 30    genome-scores.csv (optional)
 31    genome-tags.csv (optional)
 32    links.csv (optional)
 33    movies.csv (optional)
 34    ratings.csv (required)
 35    tags.csv (optional)
 36    """
 37
 38    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
 39        """Create the user-movie matrix configuration.
 40
 41        Returns:
 42            the table configuration of the ML-25M matrix.
 43        """
 44        return create_dataset_table_config(
 45            'ratings.csv',
 46            ['user_id', 'movie_id'],
 47            ['matrix_rating', 'matrix_timestamp'],
 48            foreign_keys=['user_id', 'movie_id'],
 49            header=True,
 50            sep=','
 51        )
 52
 53    def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 54        """Get event table configuration processors.
 55
 56        Returns:
 57            a list containing the tag event table processor.
 58        """
 59        return [('tag', self.process_tag_table)]
 60
 61    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 62        """Get table configuration processors.
 63
 64        Returns:
 65            a list containing the genome score, genome tag, movie and tag table processors.
 66        """
 67        return [
 68            ('genome score', self.process_genome_score_table),
 69            ('genome tag', self.process_genome_tag_table),
 70            ('movie', self.process_movie_table)
 71        ]
 72
 73    def process_genome_score_table(self) -> Optional[DatasetTableConfig]:
 74        """Process the genome score table.
 75
 76        Returns:
 77            the genome score table configuration or None on failure.
 78        """
 79        genome_score_table_config = create_dataset_table_config(
 80            'genome-scores.csv',
 81            ['movie_id', 'tag_id'],
 82            ['movie-tag_relevance'],
 83            foreign_keys=['movie_id', 'tag_id'],
 84            header=True,
 85            sep=','
 86        )
 87
 88        try:
 89            genome_score_table = genome_score_table_config.read_table(self.dataset_dir)
 90            genome_score_table_config.num_records = len(genome_score_table)
 91            return genome_score_table_config
 92        except FileNotFoundError:
 93            return None
 94
 95    def process_genome_tag_table(self) -> Optional[DatasetTableConfig]:
 96        """Process the genome tag table.
 97
 98        Returns:
 99            the genome tag table configuration or None on failure.
100        """
101        genome_tag_table_config = create_dataset_table_config(
102            'genome-tags.csv',
103            ['tag_id'],
104            ['tag_name'],
105            header=True,
106            sep=','
107        )
108
109        try:
110            genome_tag_table = genome_tag_table_config.read_table(self.dataset_dir)
111            genome_tag_table_config.num_records = len(genome_tag_table)
112            return genome_tag_table_config
113        except FileNotFoundError:
114            return None
115
116    def process_movie_table(self) -> Optional[DatasetTableConfig]:
117        """Process the movie table.
118
119        The movie and link tables are joined together for simplification.
120
121        Returns:
122            the movie table configuration or None on failure.
123        """
124        link_table_config = create_dataset_table_config(
125            'links.csv',
126            ['movie_id'],
127            ['movie_imdbID', 'movie_tmdbID'],
128            header=True,
129            sep=','
130        )
131
132        try:
133            link_table = link_table_config.read_table(self.dataset_dir)
134            link_table_config.num_records = len(link_table)
135            # replace NaN and cast back to original int
136            link_table['movie_tmdbID'] = link_table['movie_tmdbID'].fillna(-1.0).astype(int)
137        except FileNotFoundError:
138            link_table = None
139
140        movie_table_config = create_dataset_table_config(
141            'movies.csv',
142            ['movie_id'],
143            ['movie_title', 'movie_genres'],
144            header=True,
145            sep=','
146        )
147
148        try:
149            movie_table = movie_table_config.read_table(self.dataset_dir)
150            movie_table_config.num_records = len(movie_table)
151        except FileNotFoundError:
152            return link_table_config if link_table is not None else None
153
154        if link_table is not None:
155            # merge movie and link tables
156            movie_table = pd.merge(movie_table, link_table, how='left', on='movie_id')
157
158            # update movie table configuration
159            movie_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_movies.tsv.bz2'
160            movie_table_config.file.options.sep = None
161            movie_table_config.file.options.compression = 'bz2'
162            movie_table_config.file.options.header = False
163            movie_table_config.columns += link_table_config.columns
164
165            # store the extended movie table
166            movie_table_config.save_table(movie_table, self.dataset_dir)
167
168        return movie_table_config
169
170    def process_tag_table(self) -> Optional[DatasetTableConfig]:
171        """Process the tag (event) table.
172
173        Returns:
174            the tag table configuration or None on failure.
175        """
176        tag_table_config = create_dataset_table_config(
177            'tags.csv',
178            ['user_id', 'movie_id'],
179            ['tag_name','tag_timestamp'],
180            foreign_keys=['user_id', 'movie_id'],
181            header=True,
182            sep=','
183        )
184
185        try:
186            tag_table = tag_table_config.read_table(self.dataset_dir)
187            tag_table_config.num_records = len(tag_table)
188            return tag_table_config
189        except FileNotFoundError:
190            return None
 23class DatasetProcessorML25M(DatasetProcessorML):
 24    """DatasetProcessor for the MovieLens-25M dataset.
 25
 26    The dataset can be downloaded from the link below.
 27    https://files.grouplens.org/datasets/movielens/ml-25m.zip
 28
 29    The processor handles the following files:
 30
 31    genome-scores.csv (optional)
 32    genome-tags.csv (optional)
 33    links.csv (optional)
 34    movies.csv (optional)
 35    ratings.csv (required)
 36    tags.csv (optional)
 37    """
 38
 39    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
 40        """Create the user-movie matrix configuration.
 41
 42        Returns:
 43            the table configuration of the ML-25M matrix.
 44        """
 45        return create_dataset_table_config(
 46            'ratings.csv',
 47            ['user_id', 'movie_id'],
 48            ['matrix_rating', 'matrix_timestamp'],
 49            foreign_keys=['user_id', 'movie_id'],
 50            header=True,
 51            sep=','
 52        )
 53
 54    def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 55        """Get event table configuration processors.
 56
 57        Returns:
 58            a list containing the tag event table processor.
 59        """
 60        return [('tag', self.process_tag_table)]
 61
 62    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 63        """Get table configuration processors.
 64
 65        Returns:
 66            a list containing the genome score, genome tag, movie and tag table processors.
 67        """
 68        return [
 69            ('genome score', self.process_genome_score_table),
 70            ('genome tag', self.process_genome_tag_table),
 71            ('movie', self.process_movie_table)
 72        ]
 73
 74    def process_genome_score_table(self) -> Optional[DatasetTableConfig]:
 75        """Process the genome score table.
 76
 77        Returns:
 78            the genome score table configuration or None on failure.
 79        """
 80        genome_score_table_config = create_dataset_table_config(
 81            'genome-scores.csv',
 82            ['movie_id', 'tag_id'],
 83            ['movie-tag_relevance'],
 84            foreign_keys=['movie_id', 'tag_id'],
 85            header=True,
 86            sep=','
 87        )
 88
 89        try:
 90            genome_score_table = genome_score_table_config.read_table(self.dataset_dir)
 91            genome_score_table_config.num_records = len(genome_score_table)
 92            return genome_score_table_config
 93        except FileNotFoundError:
 94            return None
 95
 96    def process_genome_tag_table(self) -> Optional[DatasetTableConfig]:
 97        """Process the genome tag table.
 98
 99        Returns:
100            the genome tag table configuration or None on failure.
101        """
102        genome_tag_table_config = create_dataset_table_config(
103            'genome-tags.csv',
104            ['tag_id'],
105            ['tag_name'],
106            header=True,
107            sep=','
108        )
109
110        try:
111            genome_tag_table = genome_tag_table_config.read_table(self.dataset_dir)
112            genome_tag_table_config.num_records = len(genome_tag_table)
113            return genome_tag_table_config
114        except FileNotFoundError:
115            return None
116
117    def process_movie_table(self) -> Optional[DatasetTableConfig]:
118        """Process the movie table.
119
120        The movie and link tables are joined together for simplification.
121
122        Returns:
123            the movie table configuration or None on failure.
124        """
125        link_table_config = create_dataset_table_config(
126            'links.csv',
127            ['movie_id'],
128            ['movie_imdbID', 'movie_tmdbID'],
129            header=True,
130            sep=','
131        )
132
133        try:
134            link_table = link_table_config.read_table(self.dataset_dir)
135            link_table_config.num_records = len(link_table)
136            # replace NaN and cast back to original int
137            link_table['movie_tmdbID'] = link_table['movie_tmdbID'].fillna(-1.0).astype(int)
138        except FileNotFoundError:
139            link_table = None
140
141        movie_table_config = create_dataset_table_config(
142            'movies.csv',
143            ['movie_id'],
144            ['movie_title', 'movie_genres'],
145            header=True,
146            sep=','
147        )
148
149        try:
150            movie_table = movie_table_config.read_table(self.dataset_dir)
151            movie_table_config.num_records = len(movie_table)
152        except FileNotFoundError:
153            return link_table_config if link_table is not None else None
154
155        if link_table is not None:
156            # merge movie and link tables
157            movie_table = pd.merge(movie_table, link_table, how='left', on='movie_id')
158
159            # update movie table configuration
160            movie_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_movies.tsv.bz2'
161            movie_table_config.file.options.sep = None
162            movie_table_config.file.options.compression = 'bz2'
163            movie_table_config.file.options.header = False
164            movie_table_config.columns += link_table_config.columns
165
166            # store the extended movie table
167            movie_table_config.save_table(movie_table, self.dataset_dir)
168
169        return movie_table_config
170
171    def process_tag_table(self) -> Optional[DatasetTableConfig]:
172        """Process the tag (event) table.
173
174        Returns:
175            the tag table configuration or None on failure.
176        """
177        tag_table_config = create_dataset_table_config(
178            'tags.csv',
179            ['user_id', 'movie_id'],
180            ['tag_name','tag_timestamp'],
181            foreign_keys=['user_id', 'movie_id'],
182            header=True,
183            sep=','
184        )
185
186        try:
187            tag_table = tag_table_config.read_table(self.dataset_dir)
188            tag_table_config.num_records = len(tag_table)
189            return tag_table_config
190        except FileNotFoundError:
191            return None

DatasetProcessor for the MovieLens-25M dataset.

The dataset can be downloaded from the link below. https://files.grouplens.org/datasets/movielens/ml-25m.zip

The processor handles the following files:

genome-scores.csv (optional) genome-tags.csv (optional) links.csv (optional) movies.csv (optional) ratings.csv (required) tags.csv (optional)

def create_user_movie_matrix_config(self) -> src.fairreckitlib.data.set.dataset_config.DatasetTableConfig:
39    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
40        """Create the user-movie matrix configuration.
41
42        Returns:
43            the table configuration of the ML-25M matrix.
44        """
45        return create_dataset_table_config(
46            'ratings.csv',
47            ['user_id', 'movie_id'],
48            ['matrix_rating', 'matrix_timestamp'],
49            foreign_keys=['user_id', 'movie_id'],
50            header=True,
51            sep=','
52        )

Create the user-movie matrix configuration.

Returns: the table configuration of the ML-25M matrix.

def get_event_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]]]]:
54    def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
55        """Get event table configuration processors.
56
57        Returns:
58            a list containing the tag event table processor.
59        """
60        return [('tag', self.process_tag_table)]

Get event table configuration processors.

Returns: a list containing the tag event table processor.

def get_table_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]]]]:
62    def get_table_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
63        """Get table configuration processors.
64
65        Returns:
66            a list containing the genome score, genome tag, movie and tag table processors.
67        """
68        return [
69            ('genome score', self.process_genome_score_table),
70            ('genome tag', self.process_genome_tag_table),
71            ('movie', self.process_movie_table)
72        ]

Get table configuration processors.

Returns: a list containing the genome score, genome tag and movie table processors.

def process_genome_score_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
74    def process_genome_score_table(self) -> Optional[DatasetTableConfig]:
75        """Process the genome score table.
76
77        Returns:
78            the genome score table configuration or None on failure.
79        """
80        genome_score_table_config = create_dataset_table_config(
81            'genome-scores.csv',
82            ['movie_id', 'tag_id'],
83            ['movie-tag_relevance'],
84            foreign_keys=['movie_id', 'tag_id'],
85            header=True,
86            sep=','
87        )
88
89        try:
90            genome_score_table = genome_score_table_config.read_table(self.dataset_dir)
91            genome_score_table_config.num_records = len(genome_score_table)
92            return genome_score_table_config
93        except FileNotFoundError:
94            return None

Process the genome score table.

Returns: the genome score table configuration or None on failure.

def process_genome_tag_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
 96    def process_genome_tag_table(self) -> Optional[DatasetTableConfig]:
 97        """Process the genome tag table.
 98
 99        Returns:
100            the genome tag table configuration or None on failure.
101        """
102        genome_tag_table_config = create_dataset_table_config(
103            'genome-tags.csv',
104            ['tag_id'],
105            ['tag_name'],
106            header=True,
107            sep=','
108        )
109
110        try:
111            genome_tag_table = genome_tag_table_config.read_table(self.dataset_dir)
112            genome_tag_table_config.num_records = len(genome_tag_table)
113            return genome_tag_table_config
114        except FileNotFoundError:
115            return None

Process the genome tag table.

Returns: the genome tag table configuration or None on failure.

def process_movie_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
117    def process_movie_table(self) -> Optional[DatasetTableConfig]:
118        """Process the movie table.
119
120        The movie and link tables are joined together for simplification.
121
122        Returns:
123            the movie table configuration or None on failure.
124        """
125        link_table_config = create_dataset_table_config(
126            'links.csv',
127            ['movie_id'],
128            ['movie_imdbID', 'movie_tmdbID'],
129            header=True,
130            sep=','
131        )
132
133        try:
134            link_table = link_table_config.read_table(self.dataset_dir)
135            link_table_config.num_records = len(link_table)
136            # replace NaN and cast back to original int
137            link_table['movie_tmdbID'] = link_table['movie_tmdbID'].fillna(-1.0).astype(int)
138        except FileNotFoundError:
139            link_table = None
140
141        movie_table_config = create_dataset_table_config(
142            'movies.csv',
143            ['movie_id'],
144            ['movie_title', 'movie_genres'],
145            header=True,
146            sep=','
147        )
148
149        try:
150            movie_table = movie_table_config.read_table(self.dataset_dir)
151            movie_table_config.num_records = len(movie_table)
152        except FileNotFoundError:
153            return link_table_config if link_table is not None else None
154
155        if link_table is not None:
156            # merge movie and link tables
157            movie_table = pd.merge(movie_table, link_table, how='left', on='movie_id')
158
159            # update movie table configuration
160            movie_table_config.file.name = TABLE_FILE_PREFIX + self.dataset_name + '_movies.tsv.bz2'
161            movie_table_config.file.options.sep = None
162            movie_table_config.file.options.compression = 'bz2'
163            movie_table_config.file.options.header = False
164            movie_table_config.columns += link_table_config.columns
165
166            # store the extended movie table
167            movie_table_config.save_table(movie_table, self.dataset_dir)
168
169        return movie_table_config

Process the movie table.

The movie and link tables are joined together for simplification.

Returns: the movie table configuration, the link table configuration when only links.csv is found, or None when neither file is found.

def process_tag_table( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]:
171    def process_tag_table(self) -> Optional[DatasetTableConfig]:
172        """Process the tag (event) table.
173
174        Returns:
175            the tag table configuration or None on failure.
176        """
177        tag_table_config = create_dataset_table_config(
178            'tags.csv',
179            ['user_id', 'movie_id'],
180            ['tag_name','tag_timestamp'],
181            foreign_keys=['user_id', 'movie_id'],
182            header=True,
183            sep=','
184        )
185
186        try:
187            tag_table = tag_table_config.read_table(self.dataset_dir)
188            tag_table_config.num_records = len(tag_table)
189            return tag_table_config
190        except FileNotFoundError:
191            return None

Process the tag (event) table.

Returns: the tag table configuration or None on failure.