Source code for libmultilabel.linear.preprocessor
from __future__ import annotations
import logging
from collections import defaultdict
from scipy import sparse
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
__all__ = ["Preprocessor"]
class Preprocessor:
    """Preprocessor is used to preprocess input data in LibSVM or LibMultiLabel formats.

    The same Preprocessor has to be used for both training and test datasets;
    see save_pipeline and load_pipeline for more details.
    """

    def __init__(
        self,
        include_test_labels: bool = False,
        remove_no_label_data: bool = False,
        tfidf_params: dict[str, str] | None = None,
    ):
        """Initializes the preprocessor.

        Args:
            include_test_labels (bool, optional): Whether to include labels in the test dataset. Defaults to False.
            remove_no_label_data (bool, optional): Whether to remove training instances that have no labels.
                Defaults to False.
            tfidf_params (dict[str, str], optional): A set of parameters for sklearn.TfidfVectorizer. If None or
                empty, default parameters will be used.
        """
        self.include_test_labels = include_test_labels
        self.remove_no_label_data = remove_no_label_data
        # A mutable default argument ({}) would be shared across every instance;
        # use None as the default and substitute a fresh dict here instead.
        self.tfidf_params = {} if tfidf_params is None else tfidf_params
        # Populated by fit(); all remain None/False until then.
        self.data_format = None
        self.vectorizer = None
        self.binarizer = None
        self.label_mapping = None
        self.is_fitted = False

    def fit(self, dataset: dict[str, dict[str, sparse.csr_matrix | list[list[int]] | list[str]]]) -> Preprocessor:
        """Fit the preprocessor according to the training and test datasets, and pre-defined labels if given.

        Args:
            dataset (dict[str, dict[str, sparse.csr_matrix | list[list[int]] | list[str]]]):
                The training and test datasets along with possibly pre-defined labels with keys 'train', 'test', and
                "labels" respectively. The dataset must have keys 'x' for input features, and 'y' for actual labels. It
                also contains 'data_format' to indicate the data format used.

        Returns:
            Preprocessor: An instance of the fitted preprocessor.

        Raises:
            AttributeError: If this preprocessor has already been fitted.
        """
        if self.is_fitted:
            raise AttributeError("Preprocessor has been fitted. An instance of Preprocessor can only be fitted once.")
        self.data_format = dataset["data_format"]
        # learn vocabulary and idf from training dataset
        if self.data_format in {"txt", "dataframe"}:
            self.vectorizer = TfidfVectorizer(**self.tfidf_params)
            self.vectorizer.fit(dataset["train"]["x"])
        # learn label mapping from training and test datasets
        self.binarizer = MultiLabelBinarizer(classes=dataset.get("classes"), sparse_output=True)
        if not self.include_test_labels:
            self.binarizer.fit(dataset["train"]["y"])
        else:
            self.binarizer.fit(dataset["train"]["y"] + dataset["test"]["y"])
        self.label_mapping = self.binarizer.classes_
        # Only mark as fitted after everything above succeeded, so a failed
        # fit() call can be retried instead of leaving a half-fitted instance.
        self.is_fitted = True
        return self

    def transform(self, dataset: dict[str, dict[str, sparse.csr_matrix | list[list[int]] | list[str]]]):
        """Convert x and y in the training and test datasets according to the fitted preprocessor.

        Args:
            dataset (dict[str, dict[str, sparse.csr_matrix | list[list[int]] | list[str]]]):
                The training and test datasets along with labels with keys 'train', 'test', and labels respectively.
                The dataset has keys 'x' for input features and 'y' for labels. It also contains 'data_format' to indicate
                the data format used.

        Returns:
            dict[str, dict[str, sparse.csr_matrix]]: The transformed dataset.

        Raises:
            AttributeError: If this preprocessor has not been fitted yet.
        """
        if not self.is_fitted:
            raise AttributeError("Preprocessor has not been fitted.")
        # "tf" indicates transformed
        dataset_tf = defaultdict(dict)
        dataset_tf["data_format"] = dataset["data_format"]
        if "classes" in dataset:
            dataset_tf["classes"] = dataset["classes"]
        # transform a collection of raw text to a matrix of TF-IDF features
        if {self.data_format, dataset["data_format"]}.issubset({"txt", "dataframe"}):
            try:
                if "train" in dataset:
                    dataset_tf["train"]["x"] = self.vectorizer.transform(dataset["train"]["x"])
                if "test" in dataset:
                    dataset_tf["test"]["x"] = self.vectorizer.transform(dataset["test"]["x"])
            except AttributeError:
                # Suppress the uninformative inner AttributeError chain.
                raise AttributeError("Tfidf vectorizer has not been fitted.") from None
        else:
            # Features are already numeric (e.g. LibSVM format); pass through unchanged.
            if "train" in dataset:
                dataset_tf["train"]["x"] = dataset["train"]["x"]
            if "test" in dataset:
                dataset_tf["test"]["x"] = dataset["test"]["x"]
        # transform a collection of raw labels to a binary matrix
        if "train" in dataset:
            dataset_tf["train"]["y"] = self.binarizer.transform(dataset["train"]["y"]).astype("d")
        if "test" in dataset:
            dataset_tf["test"]["y"] = self.binarizer.transform(dataset["test"]["y"]).astype("d")
        # remove data points with no labels
        if "train" in dataset_tf:
            num_labels = dataset_tf["train"]["y"].getnnz(axis=1)
            num_no_label_data = np.count_nonzero(num_labels == 0)
            if num_no_label_data > 0:
                if self.remove_no_label_data:
                    logging.info(
                        "Remove %d instances that have no labels in the dataset.",
                        num_no_label_data,
                        extra={"collect": True},
                    )
                    dataset_tf["train"]["x"] = dataset_tf["train"]["x"][num_labels > 0]
                    dataset_tf["train"]["y"] = dataset_tf["train"]["y"][num_labels > 0]
                else:
                    logging.info(
                        "Keep %d instances that have no labels in the dataset.",
                        num_no_label_data,
                        extra={"collect": True},
                    )
        return dict(dataset_tf)

    def fit_transform(self, dataset: dict[str, dict[str, sparse.csr_matrix | list[list[int]] | list[str]]]):
        """Fit the preprocessor according to the training and test datasets, and pre-defined labels if given.
        Then convert x and y in the training and test datasets according to the fitted preprocessor.

        Args:
            dataset (dict[str, dict[str, sparse.csr_matrix | list[list[int]] | list[str]]]):
                The training and test datasets along with labels with keys 'train', 'test', and labels respectively.
                The dataset has keys 'x' for input features and 'y' for labels. It also contains 'data_format' to
                indicate the data format used.

        Returns:
            dict[str, dict[str, sparse.csr_matrix]]: The transformed dataset.
        """
        return self.fit(dataset).transform(dataset)