Calculate sklearn.roc_auc_score for multi-class
You can't use roc_auc as a single summary metric for multiclass models. If you want, you can calculate per-class roc_auc, as in:
roc = {}
for label in multi_class_series.unique():
    # Fit a one-vs-rest classifier for this label
    selected_classifier.fit(train_set_dataframe, train_class == label)
    predictions_proba = selected_classifier.predict_proba(test_set_dataframe)
    # Score the positive-class probabilities against the binarized test labels
    roc[label] = roc_auc_score(test_class == label, predictions_proba[:, 1])
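The snippet above reuses names from the question (multi_class_series, selected_classifier, and so on). Here is a self-contained sketch of the same one-vs-rest idea on synthetic data; the estimator and dataset are illustrative assumptions, not from the original:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_classes=3, n_informative=5,
                           random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

roc = {}
for label in np.unique(y):
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train == label)       # this class vs. the rest
    proba = clf.predict_proba(X_test)[:, 1]  # probability of the positive class
    roc[label] = roc_auc_score(y_test == label, proba)

print(roc)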
However, it's more usual to use sklearn.metrics.confusion_matrix to evaluate the performance of a multiclass model.
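For instance (a minimal sketch, assuming a classifier clf fitted on the full multiclass target, plus held-out X_test/y_test):

from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes;
# the diagonal holds the correctly classified counts.
print(confusion_matrix(y_test, clf.predict(X_test)))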
The average option of roc_auc_score is only defined for multilabel problems.
You can take a look at the following example from the scikit-learn documentation to define your own micro- or macro-averaged scores for multiclass problems:
http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#multiclass-settings
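In the spirit of that example, one workaround is to binarize the true labels so that the problem looks multilabel to roc_auc_score. This is only a sketch, assuming y_test holds the integer class labels and y_score = model.predict_proba(X_test):

import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

# One-hot encode the true labels so the `average` option applies
y_test_bin = label_binarize(y_test, classes=np.unique(y_test))
print(roc_auc_score(y_test_bin, y_score, average='micro'))
print(roc_auc_score(y_test_bin, y_score, average='macro'))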
Edit: there is an issue on the scikit-learn tracker to implement ROC AUC for multiclass problems: https://github.com/scikit-learn/scikit-learn/issues/3298
As mentioned here, to the best of my knowledge there is not yet a way to easily compute ROC AUC for multiclass settings natively in sklearn.
However, if you are familiar with classification_report, you may like this simple implementation that returns the same output as classification_report, but as a pandas.DataFrame, which I personally find very handy:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer


def class_report(y_true, y_pred, y_score=None, average='micro'):
    if y_true.shape != y_pred.shape:
        print("Error! y_true %s is not the same shape as y_pred %s" % (
            y_true.shape, y_pred.shape))
        return

    lb = LabelBinarizer()
    if len(y_true.shape) == 1:
        lb.fit(y_true)

    # Value counts of predictions
    labels, cnt = np.unique(y_pred, return_counts=True)
    n_classes = len(labels)
    pred_cnt = pd.Series(cnt, index=labels)

    metrics_summary = precision_recall_fscore_support(
        y_true=y_true, y_pred=y_pred, labels=labels)

    avg = list(precision_recall_fscore_support(
        y_true=y_true, y_pred=y_pred, average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index,
        columns=labels)

    support = class_report_df.loc['support']
    total = support.sum()
    class_report_df['avg / total'] = avg[:-1] + [total]

    class_report_df = class_report_df.T
    class_report_df['pred'] = pred_cnt
    class_report_df.loc['avg / total', 'pred'] = total

    if y_score is not None:
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        # Per-class ROC curves, one-vs-rest
        for label_it, label in enumerate(labels):
            fpr[label], tpr[label], _ = roc_curve(
                (y_true == label).astype(int),
                y_score[:, label_it])
            roc_auc[label] = auc(fpr[label], tpr[label])

        if average == 'micro':
            if n_classes <= 2:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                    lb.transform(y_true).ravel(),
                    y_score[:, 1].ravel())
            else:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                    lb.transform(y_true).ravel(),
                    y_score.ravel())

            roc_auc["avg / total"] = auc(
                fpr["avg / total"], tpr["avg / total"])

        elif average == 'macro':
            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([fpr[i] for i in labels]))

            # Then interpolate all ROC curves at these points
            mean_tpr = np.zeros_like(all_fpr)
            for i in labels:
                mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

            # Finally average it and compute AUC
            mean_tpr /= n_classes

            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr

            roc_auc["avg / total"] = auc(fpr["macro"], tpr["macro"])

        class_report_df['AUC'] = pd.Series(roc_auc)

    return class_report_df
Here is an example:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=5000, n_features=10,
                           n_informative=5, n_redundant=0,
                           n_classes=10, random_state=0,
                           shuffle=False)

X_train, X_test, y_train, y_test = train_test_split(X, y)
model = RandomForestClassifier(max_depth=2, random_state=0)
model.fit(X_train, y_train)
Regular classification_report:
sk_report = classification_report(
    digits=6,
    y_true=y_test,
    y_pred=model.predict(X_test))
print(sk_report)
Out:
             precision    recall  f1-score   support

          0   0.262774  0.553846  0.356436       130
          1   0.405405  0.333333  0.365854       135
          2   0.367347  0.150000  0.213018       120
          3   0.350993  0.424000  0.384058       125
          4   0.379310  0.447154  0.410448       123
          5   0.525000  0.182609  0.270968       115
          6   0.362573  0.488189  0.416107       127
          7   0.330189  0.299145  0.313901       117
          8   0.328571  0.407080  0.363636       113
          9   0.571429  0.248276  0.346154       145

avg / total   0.390833  0.354400  0.345438      1250
Custom classification_report:
report_with_auc = class_report(
    y_true=y_test,
    y_pred=model.predict(X_test),
    y_score=model.predict_proba(X_test))
print(report_with_auc)
Out:
             precision    recall  f1-score  support    pred       AUC
0             0.262774  0.553846  0.356436    130.0   274.0  0.766477
1             0.405405  0.333333  0.365854    135.0   111.0  0.773974
2             0.367347  0.150000  0.213018    120.0    49.0  0.817341
3             0.350993  0.424000  0.384058    125.0   151.0  0.803364
4             0.379310  0.447154  0.410448    123.0   145.0  0.802436
5             0.525000  0.182609  0.270968    115.0    40.0  0.680870
6             0.362573  0.488189  0.416107    127.0   171.0  0.855768
7             0.330189  0.299145  0.313901    117.0   106.0  0.766526
8             0.328571  0.407080  0.363636    113.0   140.0  0.754812
9             0.571429  0.248276  0.346154    145.0    63.0  0.769100
avg / total   0.390833  0.354400  0.345438   1250.0  1250.0  0.776071
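If you only need the averaged AUC as a single number, you can read it straight off the returned DataFrame, using the 'avg / total' row produced above:

print(report_with_auc.loc['avg / total', 'AUC'])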