Training different scikit-learn classifiers on multiple CPUs for each iteration

Training different scikit-learn classifiers on multiple CPUs for each iteration


This is less of an answer and more of a rough sketch of an answer to your first question,

How would I be able to train the classifiers using different threads for every iteration of for num_samples, num_feats, num_feats_to_remove in product(_samples, _feats, _feats_to_rm)

I assume by this you mean that for each iteration of for name, clf, clf2 in zip(names, classifiers, classifiers2): you want clf and clf2 trained on different processors.

Here is some working code as a starting point (it is poorly implemented, but the general idea is there):

"""Train two copies of each scikit-learn classifier in parallel.

For every (num_samples, num_feats, num_feats_to_remove) grid point, each
classifier is fitted twice — once on all features and once with the last
`num_feats_to_remove` feature columns dropped — with the two fits dispatched
to separate worker processes.
"""
from itertools import product
import multiprocessing

import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
# sklearn.cross_validation was removed in scikit-learn 0.20; the replacement
# module is sklearn.model_selection.
from sklearn.model_selection import train_test_split

names = [
    "Linear SVM", "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes",
    "Linear Discriminant Analysis", "Quadratic Discriminant Analysis",
]


def mp_worker(task):
    """Fit one classifier in a worker process and return its tagged test score.

    Parameters
    ----------
    task : tuple
        (name, clf, X_train, y_train, X_test, y_test, num_feats_to_remove).
        Passed as a single tuple because ``Pool.map`` sends one argument per
        task (Python 3 removed tuple-parameter unpacking, PEP 3113).
        ``num_feats_to_remove`` is ``False`` for the all-features fit, or an
        int giving how many trailing feature columns to drop.

    Returns
    -------
    tuple
        ('score1', score) for the all-features fit, or ('score2', score)
        for the reduced-features fit.
    """
    name, clf, X_train, y_train, X_test, y_test, num_feats_to_remove = task
    if num_feats_to_remove is False:
        clf.fit(X_train, y_train)
        return ('score1', clf.score(X_test, y_test))
    # Drop the last `num_feats_to_remove` feature columns for the second fit.
    clf.fit(X_train[:, :-num_feats_to_remove], y_train)
    return ('score2', clf.score(X_test[:, :-num_feats_to_remove], y_test))


def _make_classifiers():
    """Return fresh, unfitted classifier instances, one per entry in `names`."""
    return [
        SVC(kernel="linear", C=0.025),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
    ]


def griddy_mcsearchface(num_samples, num_feats, num_feats_to_remove):
    """Yield paired scores for every classifier at one grid point.

    Parameters
    ----------
    num_samples : int
        Number of samples for the synthetic dataset.
    num_feats : int
        Number of features for the synthetic dataset.
    num_feats_to_remove : int
        How many trailing feature columns the second fit drops.

    Yields
    ------
    tuple
        (num_samples, num_feats, num_feats_to_remove, name, score1, score2)
        where score1 uses all features and score2 the reduced set.
    """
    # Two independent sets so the parallel fits never share an estimator.
    classifiers = _make_classifiers()
    classifiers2 = _make_classifiers()

    X, y = make_classification(n_samples=num_samples, n_features=num_feats,
                               n_redundant=0, n_informative=2,
                               random_state=1, n_clusters_per_class=1)
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

    # One pool, reused for every classifier pair and properly closed on exit.
    # 2 workers = one process per SET of classifiers.
    with multiprocessing.Pool(2) as pool:
        for name, clf, clf2 in zip(names, classifiers, classifiers2):
            # NOTE: the second task must use clf2 — sending clf twice would
            # leave classifiers2 unused and fit the same object in both tasks.
            tasks = (
                (name, clf, X_train, y_train, X_test, y_test, False),
                (name, clf2, X_train, y_train, X_test, y_test, num_feats_to_remove),
            )
            # Splits the two classification tasks across two separate processes.
            results = dict(pool.map(mp_worker, tasks))
            yield (num_samples, num_feats, num_feats_to_remove, name,
                   results['score1'], results['score2'])


if __name__ == '__main__':
    _samples = [100, 200]
    _feats = [10, 20]
    _feats_to_rm = [5, 10]
    for num_samples, num_feats, num_feats_to_remove in product(_samples, _feats, _feats_to_rm):
        # Skip grid points that would drop every feature (or more).
        if num_feats <= num_feats_to_remove:
            continue
        for row in griddy_mcsearchface(num_samples, num_feats, num_feats_to_remove):
            print(row)

If I misunderstood your question, the general principle in the code above can be modified to suit your needs. The code above draws on the accepted answer here.