src.fairreckitlib.data.set.processor.dataset_processor_ml

This module contains the base processor for MovieLens datasets.

Classes:

DatasetProcessorML: the base class for MovieLens dataset processors.

This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)

  1"""This module contains the base processor for MovieLens datasets.
  2
  3Classes:
  4
  5    DatasetProcessorML: the base class for MovieLens dataset processors.
  6
  7This program has been developed by students from the bachelor Computer Science at
  8Utrecht University within the Software Project course.
  9© Copyright Utrecht University (Department of Information and Computing Sciences)
 10"""
 11
 12from abc import ABCMeta, abstractmethod
 13from typing import Callable, List, Optional, Tuple
 14
 15import numpy as np
 16
 17from ..dataset_config import DATASET_RATINGS_EXPLICIT, RatingMatrixConfig
 18from ..dataset_config import DatasetIndexConfig, DatasetMatrixConfig, DatasetTableConfig
 19from ..dataset_constants import TABLE_FILE_PREFIX
 20from .dataset_processor_base import DatasetProcessorBase
 21
 22
 23class DatasetProcessorML(DatasetProcessorBase, metaclass=ABCMeta):
 24    """DataProcessor base class for MovieLens datasets.
 25
 26    Provides an abstraction for processing the user-movie-rating matrix.
 27    Moreover, it is assumed that the datasets do not have any event tables.
 28
 29    Abstract methods:
 30
 31    create_user_movie_matrix_config
 32    """
 33
 34    @abstractmethod
 35    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
 36        """Create the user-movie matrix configuration.
 37
 38        Returns:
 39            the table configuration of the matrix.
 40        """
 41        raise NotImplementedError()
 42
 43    def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 44        """Get event table configuration processors.
 45
 46        Returns:
 47            an empty list.
 48        """
 49        return []
 50
 51    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
 52        """Get matrix configuration processors.
 53
 54        Returns:
 55            a list with the user-movie-rating matrix processor.
 56        """
 57        return [('user-movie-rating', self.process_user_movie_matrix)]
 58
 59    def process_user_movie_matrix(self) -> Optional[DatasetMatrixConfig]:
 60        """Process the user-movie-rating matrix.
 61
 62        Returns:
 63            the matrix configuration or None on failure.
 64        """
 65        user_movie_matrix_table_config = self.create_user_movie_matrix_config()
 66
 67        # extract column names from configuration
 68        user_id = user_movie_matrix_table_config.primary_key[0]
 69        item_id = user_movie_matrix_table_config.primary_key[1]
 70        rating_column = user_movie_matrix_table_config.columns[0]
 71
 72        try:
 73            user_movie_matrix = user_movie_matrix_table_config.read_table(self.dataset_dir)
 74            user_movie_matrix_table_config.num_records = len(user_movie_matrix)
 75        except FileNotFoundError:
 76            return None
 77
 78        if user_movie_matrix[rating_column].dtype == np.int64:
 79            # convert int ratings
 80            user_movie_matrix[rating_column] = user_movie_matrix[rating_column].astype(float)
 81
 82            # update matrix configuration
 83            user_movie_matrix_table_config.file.name = \
 84                TABLE_FILE_PREFIX + self.dataset_name + '_user-movie-rating_matrix.tsv.bz2'
 85            user_movie_matrix_table_config.file.options.sep = None
 86            user_movie_matrix_table_config.file.options.compression = 'bz2'
 87            user_movie_matrix_table_config.file.options.header = False
 88
 89            # store resulting matrix
 90            user_movie_matrix_table_config.save_table(user_movie_matrix, self.dataset_dir)
 91
 92        return DatasetMatrixConfig(
 93            user_movie_matrix_table_config,
 94            RatingMatrixConfig(
 95                float(user_movie_matrix[rating_column].min()),
 96                float(user_movie_matrix[rating_column].max()),
 97                DATASET_RATINGS_EXPLICIT
 98            ),
 99            DatasetIndexConfig(
100                None,
101                user_id,
102                len(user_movie_matrix[user_id].unique())
103            ),
104            DatasetIndexConfig(
105                None,
106                item_id,
107                len(user_movie_matrix[item_id].unique())
108            )
109        )
 24class DatasetProcessorML(DatasetProcessorBase, metaclass=ABCMeta):
 25    """DataProcessor base class for MovieLens datasets.
 26
 27    Provides an abstraction for processing the user-movie-rating matrix.
 28    Moreover, it is assumed that the datasets do not have any event tables.
 29
 30    Abstract methods:
 31
 32    create_user_movie_matrix_config
 33    """
 34
 35    @abstractmethod
 36    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
 37        """Create the user-movie matrix configuration.
 38
 39        Returns:
 40            the table configuration of the matrix.
 41        """
 42        raise NotImplementedError()
 43
 44    def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
 45        """Get event table configuration processors.
 46
 47        Returns:
 48            an empty list.
 49        """
 50        return []
 51
 52    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
 53        """Get matrix configuration processors.
 54
 55        Returns:
 56            a list with the user-movie-rating matrix processor.
 57        """
 58        return [('user-movie-rating', self.process_user_movie_matrix)]
 59
 60    def process_user_movie_matrix(self) -> Optional[DatasetMatrixConfig]:
 61        """Process the user-movie-rating matrix.
 62
 63        Returns:
 64            the matrix configuration or None on failure.
 65        """
 66        user_movie_matrix_table_config = self.create_user_movie_matrix_config()
 67
 68        # extract column names from configuration
 69        user_id = user_movie_matrix_table_config.primary_key[0]
 70        item_id = user_movie_matrix_table_config.primary_key[1]
 71        rating_column = user_movie_matrix_table_config.columns[0]
 72
 73        try:
 74            user_movie_matrix = user_movie_matrix_table_config.read_table(self.dataset_dir)
 75            user_movie_matrix_table_config.num_records = len(user_movie_matrix)
 76        except FileNotFoundError:
 77            return None
 78
 79        if user_movie_matrix[rating_column].dtype == np.int64:
 80            # convert int ratings
 81            user_movie_matrix[rating_column] = user_movie_matrix[rating_column].astype(float)
 82
 83            # update matrix configuration
 84            user_movie_matrix_table_config.file.name = \
 85                TABLE_FILE_PREFIX + self.dataset_name + '_user-movie-rating_matrix.tsv.bz2'
 86            user_movie_matrix_table_config.file.options.sep = None
 87            user_movie_matrix_table_config.file.options.compression = 'bz2'
 88            user_movie_matrix_table_config.file.options.header = False
 89
 90            # store resulting matrix
 91            user_movie_matrix_table_config.save_table(user_movie_matrix, self.dataset_dir)
 92
 93        return DatasetMatrixConfig(
 94            user_movie_matrix_table_config,
 95            RatingMatrixConfig(
 96                float(user_movie_matrix[rating_column].min()),
 97                float(user_movie_matrix[rating_column].max()),
 98                DATASET_RATINGS_EXPLICIT
 99            ),
100            DatasetIndexConfig(
101                None,
102                user_id,
103                len(user_movie_matrix[user_id].unique())
104            ),
105            DatasetIndexConfig(
106                None,
107                item_id,
108                len(user_movie_matrix[item_id].unique())
109            )
110        )

DataProcessor base class for MovieLens datasets.

Provides an abstraction for processing the user-movie-rating matrix. Moreover, it is assumed that the datasets do not have any event tables.

Abstract methods:

create_user_movie_matrix_config

@abstractmethod
def create_user_movie_matrix_config(self) -> src.fairreckitlib.data.set.dataset_config.DatasetTableConfig:
35    @abstractmethod
36    def create_user_movie_matrix_config(self) -> DatasetTableConfig:
37        """Create the user-movie matrix configuration.
38
39        Returns:
40            the table configuration of the matrix.
41        """
42        raise NotImplementedError()

Create the user-movie matrix configuration.

Returns: the table configuration of the matrix.

def get_event_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetTableConfig]]]]:
44    def get_event_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetTableConfig]]]]:
45        """Get event table configuration processors.
46
47        Returns:
48            an empty list.
49        """
50        return []

Get event table configuration processors.

Returns: an empty list.

def get_matrix_configs( self) -> List[Tuple[str, Callable[[], Optional[src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]]]]:
52    def get_matrix_configs(self) -> List[Tuple[str, Callable[[], Optional[DatasetMatrixConfig]]]]:
53        """Get matrix configuration processors.
54
55        Returns:
56            a list with the user-movie-rating matrix processor.
57        """
58        return [('user-movie-rating', self.process_user_movie_matrix)]

Get matrix configuration processors.

Returns: a list with the user-movie-rating matrix processor.

def process_user_movie_matrix( self) -> Optional[src.fairreckitlib.data.set.dataset_config.DatasetMatrixConfig]:
 60    def process_user_movie_matrix(self) -> Optional[DatasetMatrixConfig]:
 61        """Process the user-movie-rating matrix.
 62
 63        Returns:
 64            the matrix configuration or None on failure.
 65        """
 66        user_movie_matrix_table_config = self.create_user_movie_matrix_config()
 67
 68        # extract column names from configuration
 69        user_id = user_movie_matrix_table_config.primary_key[0]
 70        item_id = user_movie_matrix_table_config.primary_key[1]
 71        rating_column = user_movie_matrix_table_config.columns[0]
 72
 73        try:
 74            user_movie_matrix = user_movie_matrix_table_config.read_table(self.dataset_dir)
 75            user_movie_matrix_table_config.num_records = len(user_movie_matrix)
 76        except FileNotFoundError:
 77            return None
 78
 79        if user_movie_matrix[rating_column].dtype == np.int64:
 80            # convert int ratings
 81            user_movie_matrix[rating_column] = user_movie_matrix[rating_column].astype(float)
 82
 83            # update matrix configuration
 84            user_movie_matrix_table_config.file.name = \
 85                TABLE_FILE_PREFIX + self.dataset_name + '_user-movie-rating_matrix.tsv.bz2'
 86            user_movie_matrix_table_config.file.options.sep = None
 87            user_movie_matrix_table_config.file.options.compression = 'bz2'
 88            user_movie_matrix_table_config.file.options.header = False
 89
 90            # store resulting matrix
 91            user_movie_matrix_table_config.save_table(user_movie_matrix, self.dataset_dir)
 92
 93        return DatasetMatrixConfig(
 94            user_movie_matrix_table_config,
 95            RatingMatrixConfig(
 96                float(user_movie_matrix[rating_column].min()),
 97                float(user_movie_matrix[rating_column].max()),
 98                DATASET_RATINGS_EXPLICIT
 99            ),
100            DatasetIndexConfig(
101                None,
102                user_id,
103                len(user_movie_matrix[user_id].unique())
104            ),
105            DatasetIndexConfig(
106                None,
107                item_id,
108                len(user_movie_matrix[item_id].unique())
109            )
110        )

Process the user-movie-rating matrix.

Returns: the matrix configuration or None on failure.