Source code for libmultilabel.linear.preprocessor
from __future__ import annotations
import logging
from collections import defaultdict
from scipy import sparse
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
__all__ = ["Preprocessor"]
class Preprocessor:
    """Preprocessor is used to preprocess input data in LibSVM or LibMultiLabel formats.

    The same Preprocessor has to be used for both training and test datasets;
    see save_pipeline and load_pipeline for more details.
    """

    def __init__(
        self,
        include_test_labels: bool = False,
        remove_no_label_data: bool = False,
        tfidf_params: dict[str, str] | None = None,
    ):
        """Initializes the preprocessor.

        Args:
            include_test_labels (bool, optional): Whether to include labels in the test dataset. Defaults to False.
            remove_no_label_data (bool, optional): Whether to remove training instances that have no labels.
                Defaults to False.
            tfidf_params (dict[str, str], optional): A set of parameters for sklearn.TfidfVectorizer. If None or
                empty, default parameters will be used.
        """
        self.include_test_labels = include_test_labels
        self.remove_no_label_data = remove_no_label_data
        # A mutable default argument ({}) would be shared across every instance;
        # use None as the default and substitute a fresh dict here instead.
        self.tfidf_params = {} if tfidf_params is None else tfidf_params
        # Populated by fit(); all remain None/False until then.
        self.data_format = None
        self.vectorizer = None
        self.binarizer = None
        self.label_mapping = None
        self.is_fitted = False

    def fit(self, dataset: dict[str, dict[str, sparse.csr_matrix | list[list[int]] | list[str]]]) -> Preprocessor:
        """Fit the preprocessor according to the training and test datasets, and pre-defined labels if given.

        Args:
            dataset (dict[str, dict[str, sparse.csr_matrix | list[list[int]] | list[str]]]):
                The training and test datasets along with possibly pre-defined labels with keys 'train', 'test', and
                "labels" respectively. The dataset must have keys 'x' for input features, and 'y' for actual labels. It
                also contains 'data_format' to indicate the data format used.

        Returns:
            Preprocessor: An instance of the fitted preprocessor.

        Raises:
            AttributeError: If this preprocessor has already been fitted.
        """
        if self.is_fitted:
            raise AttributeError("Preprocessor has been fitted. An instance of Preprocessor can only be fitted once.")
        self.data_format = dataset["data_format"]
        # learn vocabulary and idf from training dataset
        if self.data_format in {"txt", "dataframe"}:
            self.vectorizer = TfidfVectorizer(**self.tfidf_params)
            self.vectorizer.fit(dataset["train"]["x"])
        # learn label mapping from training and test datasets
        self.binarizer = MultiLabelBinarizer(classes=dataset.get("classes"), sparse_output=True)
        if not self.include_test_labels:
            self.binarizer.fit(dataset["train"]["y"])
        else:
            self.binarizer.fit(dataset["train"]["y"] + dataset["test"]["y"])
        self.label_mapping = self.binarizer.classes_
        # Only mark as fitted after everything above succeeded, so a failed
        # fit() call can be retried instead of leaving a half-fitted instance.
        self.is_fitted = True
        return self

    def transform(self, dataset: dict[str, dict[str, sparse.csr_matrix | list[list[int]] | list[str]]]):
        """Convert x and y in the training and test datasets according to the fitted preprocessor.

        Args:
            dataset (dict[str, dict[str, sparse.csr_matrix | list[list[int]] | list[str]]]):
                The training and test datasets along with labels with keys 'train', 'test', and labels respectively.
                The dataset has keys 'x' for input features and 'y' for labels. It also contains 'data_format' to indicate
                the data format used.

        Returns:
            dict[str, dict[str, sparse.csr_matrix]]: The transformed dataset.

        Raises:
            AttributeError: If this preprocessor has not been fitted yet.
        """
        if not self.is_fitted:
            raise AttributeError("Preprocessor has not been fitted.")
        # "tf" indicates transformed
        dataset_tf = defaultdict(dict)
        dataset_tf["data_format"] = dataset["data_format"]
        if "classes" in dataset:
            dataset_tf["classes"] = dataset["classes"]
        # transform a collection of raw text to a matrix of TF-IDF features
        if {self.data_format, dataset["data_format"]}.issubset({"txt", "dataframe"}):
            try:
                if "train" in dataset:
                    dataset_tf["train"]["x"] = self.vectorizer.transform(dataset["train"]["x"])
                if "test" in dataset:
                    dataset_tf["test"]["x"] = self.vectorizer.transform(dataset["test"]["x"])
            except AttributeError:
                # Suppress the uninformative inner AttributeError chain.
                raise AttributeError("Tfidf vectorizer has not been fitted.") from None
        else:
            # Features are already numeric (e.g. LibSVM format); pass through unchanged.
            if "train" in dataset:
                dataset_tf["train"]["x"] = dataset["train"]["x"]
            if "test" in dataset:
                dataset_tf["test"]["x"] = dataset["test"]["x"]
        # transform a collection of raw labels to a binary matrix
        if "train" in dataset:
            dataset_tf["train"]["y"] = self.binarizer.transform(dataset["train"]["y"]).astype("d")
        if "test" in dataset:
            dataset_tf["test"]["y"] = self.binarizer.transform(dataset["test"]["y"]).astype("d")
        # remove data points with no labels
        if "train" in dataset_tf:
            num_labels = dataset_tf["train"]["y"].getnnz(axis=1)
            num_no_label_data = np.count_nonzero(num_labels == 0)
            if num_no_label_data > 0:
                if self.remove_no_label_data:
                    logging.info(
                        "Remove %d instances that have no labels in the dataset.",
                        num_no_label_data,
                        extra={"collect": True},
                    )
                    dataset_tf["train"]["x"] = dataset_tf["train"]["x"][num_labels > 0]
                    dataset_tf["train"]["y"] = dataset_tf["train"]["y"][num_labels > 0]
                else:
                    logging.info(
                        "Keep %d instances that have no labels in the dataset.",
                        num_no_label_data,
                        extra={"collect": True},
                    )
        return dict(dataset_tf)

    def fit_transform(self, dataset: dict[str, dict[str, sparse.csr_matrix | list[list[int]] | list[str]]]):
        """Fit the preprocessor according to the training and test datasets, and pre-defined labels if given.
        Then convert x and y in the training and test datasets according to the fitted preprocessor.

        Args:
            dataset (dict[str, dict[str, sparse.csr_matrix | list[list[int]] | list[str]]]):
                The training and test datasets along with labels with keys 'train', 'test', and labels respectively.
                The dataset has keys 'x' for input features and 'y' for labels. It also contains 'data_format' to
                indicate the data format used.

        Returns:
            dict[str, dict[str, sparse.csr_matrix]]: The transformed dataset.
        """
        return self.fit(dataset).transform(dataset)