Training different scikit-learn classifiers on multiple CPUs for each iteration
This is less of an answer and more of a rough sketch of an answer to your first question,
How would I be able to train the classifiers using different threads
for every iteration of for num_samples, num_feats, num_feats_to_remove in product(_samples, _feats, _feats_to_rm)
I assume by this you mean that for each iteration of for name, clf, clf2 in zip(names, classifiers, classifiers2):
you want clf
and clf2
trained on different processors.
Here is some working code as a starting point (it is poorly implemented, but the general idea is there):
from itertools import product
import multiprocessing

import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in model_selection.
from sklearn.model_selection import train_test_split

names = ["Linear SVM", "Decision Tree", "Random Forest", "AdaBoost",
         "Naive Bayes", "Linear Discriminant Analysis",
         "Quadratic Discriminant Analysis"]


def mp_worker(args):
    """Fit one classifier in a worker process and return a tagged score.

    ``args`` is a 7-tuple ``(name, clf, X_train, y_train, X_test, y_test,
    num_feats_to_remove)``.  When ``num_feats_to_remove`` is ``False`` the
    classifier is trained on the full feature set and the result is tagged
    ``'score1'``; otherwise the last ``num_feats_to_remove`` feature columns
    are dropped before fitting and the result is tagged ``'score2'``.
    """
    # Python 3 removed tuple-parameter unpacking (PEP 3113), so the
    # original ``def mp_worker((name, clf, ...))`` signature is a
    # SyntaxError; unpack the single tuple argument here instead.
    name, clf, X_train, y_train, X_test, y_test, num_feats_to_remove = args
    if num_feats_to_remove is False:
        clf.fit(X_train, y_train)
        return ('score1', clf.score(X_test, y_test))
    # BUG FIX: the original body referenced the module-level name
    # ``num_feats_to_remove`` while its parameter was spelled
    # ``num_features_to_remove`` — a NameError inside a worker process.
    clf.fit(X_train[:, :-num_feats_to_remove], y_train)
    return ('score2', clf.score(X_test[:, :-num_feats_to_remove], y_test))


def griddy_mcsearchface(num_samples, num_feats, num_feats_to_remove):
    """Train each classifier pair on two processors and yield the scores.

    For every classifier name this yields a tuple
    ``(num_samples, num_feats, num_feats_to_remove, name, score1, score2)``
    where ``score1`` is the accuracy on the full feature set and ``score2``
    the accuracy after removing the last ``num_feats_to_remove`` features.
    """
    classifiers = [
        SVC(kernel="linear", C=0.025),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis()]
    # A second, independent set of estimators so the reduced-feature fit
    # does not reuse (and refit) the objects trained on all features.
    classifiers2 = [
        SVC(kernel="linear", C=0.025),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis()]

    X, y = make_classification(n_samples=num_samples, n_features=num_feats,
                               n_redundant=0, n_informative=2,
                               random_state=1, n_clusters_per_class=1)
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

    for name, clf, clf2 in zip(names, classifiers, classifiers2):
        # Pool size 2: one process per SET of classifiers.  The ``with``
        # block closes and joins the pool; the original leaked a fresh,
        # never-closed pool on every iteration.
        with multiprocessing.Pool(2) as pool:
            # BUG FIX: the second task now trains ``clf2`` — the original
            # passed ``clf`` twice, so ``classifiers2`` was never used.
            data = ((name, clf, X_train, y_train, X_test, y_test, False),
                    (name, clf2, X_train, y_train, X_test, y_test,
                     num_feats_to_remove))
            # Splits the two classification tasks across two processors.
            res = pool.map(mp_worker, data)
        score1 = score2 = None
        for tag, value in res:  # parse the tagged results
            if tag == 'score1':
                score1 = value
            else:
                score2 = value
        yield (num_samples, num_feats, num_feats_to_remove, name,
               score1, score2)


if __name__ == '__main__':
    _samples = [100, 200]
    _feats = [10, 20]
    _feats_to_rm = [5, 10]
    for num_samples, num_feats, num_feats_to_remove in product(
            _samples, _feats, _feats_to_rm):
        # Skip impossible combinations (cannot remove >= all features).
        if num_feats <= num_feats_to_remove:
            continue
        for result in griddy_mcsearchface(num_samples, num_feats,
                                          num_feats_to_remove):
            print(result)
If I misunderstood your question, then the general principle in the code above can be modified to suit your needs. I draw from the accepted answer here in the code above.