Source code for libmultilabel.linear.utils

from __future__ import annotations

import os
import pathlib
import pickle
import re
from typing import Any

import numpy as np
import scipy.sparse as sparse
import sklearn.base
import sklearn.model_selection
import sklearn.pipeline
import sklearn.utils

import libmultilabel.linear as linear

from .preprocessor import Preprocessor

__all__ = ["save_pipeline", "load_pipeline", "MultiLabelEstimator", "GridSearchCV"]


LINEAR_TECHNIQUES = {
    "1vsrest": linear.train_1vsrest,
    "thresholding": linear.train_thresholding,
    "cost_sensitive": linear.train_cost_sensitive,
    "cost_sensitive_micro": linear.train_cost_sensitive_micro,
    "binary_and_multiclass": linear.train_binary_and_multiclass,
    "tree": linear.train_tree,
}
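
# Each training function above shares the calling convention used in
# `MultiLabelEstimator.fit` below: label matrix first, then feature matrix,
# then a liblinear option string. A minimal dispatch sketch, where `x_train`
# and `y_train` are assumed CSR matrices prepared elsewhere:
#
#     train = LINEAR_TECHNIQUES["1vsrest"]
#     model = train(y_train, x_train, "")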


def save_pipeline(checkpoint_dir: str, preprocessor: Preprocessor, model):
    """Saves preprocessor and model to checkpoint_dir/linear_pipeline.pickle.

    Args:
        checkpoint_dir (str): The directory to save to.
        preprocessor (Preprocessor): A Preprocessor.
        model: A model returned from one of the training functions.
    """
    pathlib.Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, "linear_pipeline.pickle")
    with open(checkpoint_path, "wb") as f:
        pickle.dump(
            {
                "preprocessor": preprocessor,
                "model": model,
            },
            f,
            protocol=pickle.HIGHEST_PROTOCOL,
        )

def load_pipeline(checkpoint_path: str) -> tuple[Preprocessor, Any]:
    """Loads preprocessor and model from checkpoint_path.

    Args:
        checkpoint_path (str): The path to a previously saved pipeline.

    Returns:
        tuple[Preprocessor, Any]: A tuple of the preprocessor and model.
    """
    with open(checkpoint_path, "rb") as f:
        pipeline = pickle.load(f)
    return (pipeline["preprocessor"], pipeline["model"])
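
# A round-trip sketch of the two functions above. `preprocessor` and `model`
# are placeholders for objects produced elsewhere (e.g. a fitted Preprocessor
# and the result of a training function); note that save_pipeline takes a
# directory, while load_pipeline takes the pickle's full path:
#
#     save_pipeline("checkpoints", preprocessor, model)
#     preprocessor, model = load_pipeline(
#         os.path.join("checkpoints", "linear_pipeline.pickle")
#     )
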
class MultiLabelEstimator(sklearn.base.BaseEstimator):
    """Customized sklearn estimator for the multi-label classifier.

    Args:
        options (str, optional): The option string passed to liblinear. Defaults to ''.
        linear_technique (str, optional): Multi-label technique defined in
            `utils.LINEAR_TECHNIQUES`. Defaults to '1vsrest'.
        scoring_metric (str, optional): The scoring metric. Defaults to 'P@1'.
    """
    def __init__(self, options: str = "", linear_technique: str = "1vsrest", scoring_metric: str = "P@1"):
        super().__init__()
        self.options = options
        self.linear_technique = linear_technique
        self.scoring_metric = scoring_metric
        self._is_fitted = False
    def fit(self, X: sparse.csr_matrix, y: sparse.csr_matrix):
        X, y = sklearn.utils.validation.check_X_y(X, y, accept_sparse=True, multi_output=True)
        self._is_fitted = True
        self.model = LINEAR_TECHNIQUES[self.linear_technique](y, X, self.options)
        return self
    def predict(self, X: sparse.csr_matrix) -> np.ndarray:
        sklearn.utils.validation.check_is_fitted(self, attributes=["_is_fitted"])
        preds = linear.predict_values(self.model, X)
        return preds
    def score(self, X: sparse.csr_matrix, y: sparse.csr_matrix) -> float:
        metrics = linear.get_metrics(
            [self.scoring_metric],
            y.shape[1],
        )
        preds = self.predict(X)
        metrics.update(preds, y.toarray())
        metric_dict = metrics.compute()
        return metric_dict[self.scoring_metric]
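
# A minimal fit/predict sketch for the estimator above. `x_train`, `y_train`,
# and `x_test` are assumed CSR matrices prepared elsewhere; note that `predict`
# returns liblinear decision values (one column per label), not binarized labels:
#
#     est = MultiLabelEstimator(options="", linear_technique="tree")
#     est.fit(x_train, y_train)
#     decision_values = est.predict(x_test)
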
class GridSearchCV(sklearn.model_selection.GridSearchCV):
    """A customized ``sklearn.model_selection.GridSearchCV`` class for liblinear.
    The usage is similar to sklearn's, except that the parameter ``scoring`` is
    unavailable. Instead, specify ``scoring_metric`` in ``MultiLabelEstimator``
    in the Pipeline.

    Args:
        estimator (estimator object): An estimator for grid search.
        param_grid (dict): Search space for a grid search, i.e., a dictionary
            mapping parameter names to lists of candidate values.
        n_jobs (int, optional): Number of CPU cores run in parallel. Defaults to None.
    """

    _required_parameters = ["estimator", "param_grid"]
    def __init__(self, estimator, param_grid: dict, n_jobs=None, **kwargs):
        if n_jobs is not None and n_jobs > 1:
            param_grid = self._set_singlecore_options(estimator, param_grid)
        if "scoring" in kwargs:
            raise ValueError(
                "Please specify the validation metric with `MultiLabelEstimator.scoring_metric` "
                "in the Pipeline instead of using the parameter `scoring`."
            )
        super().__init__(estimator=estimator, n_jobs=n_jobs, param_grid=param_grid, **kwargs)
    def _set_singlecore_options(self, estimator, param_grid: dict):
        """Sets the liblinear option to `-m 1`. The grid search option `n_jobs`
        runs multiple processes in parallel. Using multithreaded liblinear in
        conjunction with grid search oversubscribes the CPU and significantly
        degrades performance.
        """
        params = estimator.get_params()
        for name, transform in params.items():
            if isinstance(transform, MultiLabelEstimator):
                regex = r"-m \d+"
                key = f"{name}__options"
                # Strip any user-supplied `-m` flag, then force single-threaded liblinear.
                param_grid[key] = [f"{re.sub(regex, '', v)} -m 1" for v in param_grid[key]]
        return param_grid
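
# A usage sketch of the customized GridSearchCV above. sklearn.pipeline is
# imported at the top for pipelines like this one; the step name "clf" and the
# candidate liblinear option strings are illustrative only:
#
#     pipeline = sklearn.pipeline.Pipeline(
#         [("clf", MultiLabelEstimator(scoring_metric="P@1"))]
#     )
#     search = GridSearchCV(
#         pipeline,
#         param_grid={"clf__options": ["-s 2 -c 1", "-s 2 -c 0.5"]},
#         n_jobs=4,
#     )
#     search.fit(x_train, y_train)
#
# Because n_jobs > 1, `_set_singlecore_options` rewrites each candidate in
# "clf__options" to end with "-m 1", e.g. "-s 2 -c 1" becomes "-s 2 -c 1 -m 1".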