src.fairreckitlib.data.filter.count_filter

Module to filter on count threshold. Can be used to filter on countries with many users.

Classes:

CountFilter: Filter the dataframe on a column, such as country.
    Show only those above a certain threshold.

Functions:

create_count_filter: Create an instance of CountFilter.

This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences)

 1"""Module to filter on count threshold. Can be used to filter on countries with many users.
 2
 3Classes:
 4
 5    CountFilter: Filter the dataframe on a column, such as country.
 6        Show only those above a certain threshold.
 7
 8Functions:
 9
10    create_count_filter: Create an instance of CountFilter.
11
12
13This program has been developed by students from the bachelor Computer Science at
14Utrecht University within the Software Project course.
15© Copyright Utrecht University (Department of Information and Computing Sciences)
16"""
17
18from typing import Any, Dict
19import pandas as pd
20from .base_filter import DataFilter
21from .filter_constants import FILTER_COUNT
22
class CountFilter(DataFilter):
    """Filter a dataframe on a column, keeping rows whose value occurs often enough.

    A row is kept when the value it holds in the chosen column occurs at least
    `threshold` times in that column.

    Public method:
        filter
    """

    def get_type(self) -> str:
        """Get the type of the filter.

        Returns:
            The type name of the filter (the FILTER_COUNT constant).
        """
        return FILTER_COUNT

    def filter(self, dataframe: pd.DataFrame,
               column_name: str='', threshold: int=1) -> pd.DataFrame:
        """Keep the rows whose value in column_name occurs at least threshold times.

        Missing values (NaN/None) are counted together as a single group.

        Args:
            dataframe: Dataframe to be filtered.
            column_name: Name of the column whose values are counted.
            threshold:
                Minimum number of occurrences; rows whose value occurs at least
                this often are included in the resulting dataframe.

        Returns:
            A filtered dataframe with a fresh 0..n-1 index. When column_name is not
            a column of the dataframe, the result of self.__empty_df__(dataframe)
            is returned instead (helper defined outside this class; presumably an
            empty dataframe).
        """
        if column_name not in dataframe.columns:
            return self.__empty_df__(dataframe)
        column = dataframe[column_name]
        # Map every row to the number of times its value occurs. value_counts()
        # skips missing values, so those rows get NaN here and compare as False.
        occurrences = column.map(column.value_counts())
        keep = occurrences >= threshold
        # Missing values form one group of their own, exactly like
        # value_counts(dropna=False) would count them.
        missing = column.isna()
        if missing.sum() >= threshold:
            keep |= missing
        return dataframe[keep].reset_index(drop=True)

    def _filter(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Private filter used in run(). Requires configuration file.

        The column name is the filter name without its last
        len('_' + FILTER_COUNT) characters, and the threshold is read from
        self.params['threshold'].
        """
        suffix = '_' + FILTER_COUNT
        column_name = self.get_name()[:-len(suffix)]
        return self.filter(dataframe, column_name, self.params['threshold'])
64
def create_count_filter(name: str, params: Dict[str, Any], **kwargs) -> CountFilter:
    """Build a CountFilter from a name and its configuration.

    Args:
        name: Name of the filter.
        params: Configuration of the filter.
        **kwargs: Extra keyword arguments forwarded to the CountFilter
            constructor (contains dataset and matrix_name).

    Returns:
        The newly created CountFilter instance.
    """
    count_filter = CountFilter(name, params, **kwargs)
    return count_filter
class CountFilter(DataFilter):
    """Filter a dataframe on a column, keeping rows whose value occurs often enough.

    A row is kept when the value it holds in the chosen column occurs at least
    `threshold` times in that column.

    Public method:
        filter
    """

    def get_type(self) -> str:
        """Get the type of the filter.

        Returns:
            The type name of the filter (the FILTER_COUNT constant).
        """
        return FILTER_COUNT

    def filter(self, dataframe: pd.DataFrame,
               column_name: str='', threshold: int=1) -> pd.DataFrame:
        """Keep the rows whose value in column_name occurs at least threshold times.

        Missing values (NaN/None) are counted together as a single group.

        Args:
            dataframe: Dataframe to be filtered.
            column_name: Name of the column whose values are counted.
            threshold:
                Minimum number of occurrences; rows whose value occurs at least
                this often are included in the resulting dataframe.

        Returns:
            A filtered dataframe with a fresh 0..n-1 index. When column_name is not
            a column of the dataframe, the result of self.__empty_df__(dataframe)
            is returned instead (helper defined outside this class; presumably an
            empty dataframe).
        """
        if column_name not in dataframe.columns:
            return self.__empty_df__(dataframe)
        column = dataframe[column_name]
        # Map every row to the number of times its value occurs. value_counts()
        # skips missing values, so those rows get NaN here and compare as False.
        occurrences = column.map(column.value_counts())
        keep = occurrences >= threshold
        # Missing values form one group of their own, exactly like
        # value_counts(dropna=False) would count them.
        missing = column.isna()
        if missing.sum() >= threshold:
            keep |= missing
        return dataframe[keep].reset_index(drop=True)

    def _filter(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Private filter used in run(). Requires configuration file.

        The column name is the filter name without its last
        len('_' + FILTER_COUNT) characters, and the threshold is read from
        self.params['threshold'].
        """
        suffix = '_' + FILTER_COUNT
        column_name = self.get_name()[:-len(suffix)]
        return self.filter(dataframe, column_name, self.params['threshold'])

Filter the dataframe on a column, keeping only the rows whose value occurs at least a given threshold number of times.

Public method: filter

def get_type(self) -> str:
31    def get_type(self) -> str:
32        """Get the type of the filter.
33
34        Returns:
35            The type name of the filter.
36        """
37        return FILTER_COUNT

Get the type of the filter.

Returns: The type name of the filter.

def filter( self, dataframe: pandas.core.frame.DataFrame, column_name: str = '', threshold: int = 1) -> pandas.core.frame.DataFrame:
39    def filter(self, dataframe: pd.DataFrame,
40               column_name: str='', threshold: int=1) -> pd.DataFrame:
41        """Filter out the values in column_name which count is below threshold.
42
43        Args:
44            dataframe: Dataframe to be filtered.
45            column_name: Name of the column.
46            threshold:
47                Values above or equal to the threshold will be included in the resulting dataframe.
48
49        Returns:
50            A filtered dataframe.
51        """
52        if column_name not in dataframe.columns:
53            return self.__empty_df__(dataframe)
54        value_counts = dataframe[column_name].value_counts(dropna=False)
55        key_dict = (value_counts >= threshold).to_dict()
56        df_filter = dataframe[column_name].replace(key_dict)
57        return dataframe[df_filter].reset_index(drop=True)

Filter out the rows whose value in column_name occurs fewer than threshold times.

Args: dataframe: Dataframe to be filtered. column_name: Name of the column. threshold: Values above or equal to the threshold will be included in the resulting dataframe.

Returns: A filtered dataframe.

def create_count_filter( name: str, params: Dict[str, Any], **kwargs) -> src.fairreckitlib.data.filter.count_filter.CountFilter:
def create_count_filter(name: str, params: Dict[str, Any], **kwargs) -> CountFilter:
    """Build a CountFilter from a name and its configuration.

    Args:
        name: Name of the filter.
        params: Configuration of the filter.
        **kwargs: Extra keyword arguments forwarded to the CountFilter
            constructor (contains dataset and matrix_name).

    Returns:
        The newly created CountFilter instance.
    """
    count_filter = CountFilter(name, params, **kwargs)
    return count_filter

Create an instance of the class CountFilter.

Args: name: Name of the filter. params: Configuration file. **kwargs: Contains dataset and matrix_name.

Returns: An instance of the CountFilter class.