Python resampling
-
Upload
pydata -
Category
Data & Analytics
-
view
128 -
download
1
Transcript of Python resampling
In[2]: fromsklearn.datasetsimportmake_classificationfromsklearn.decompositionimportPCAimportmatplotlib.pyplotaspltfromsklearn.linear_modelimportLogisticRegressionfromsklearn.metricsimportclassification_reportfromsklearn.grid_searchimportGridSearchCVfromsklearn.cross_validationimportKFold,train_test_splitimportnumpyasnpfromcollectionsimportCounterfromimblearn.under_samplingimportRandomUnderSampler,NearMiss,CondensedNearestNeighbourfromimblearn.under_samplingimportEditedNearestNeighbours,RepeatedEditedNearestNeighbours,TomekLinksfromimblearn.over_samplingimportRandomOverSampler,SMOTEfromimblearn.combineimportSMOTEENN,SMOTETomekfromimblearn.ensembleimportBalanceCascade,EasyEnsemblefromsklearn.ensembleimportAdaBoostClassifierimportwarningsfromitableimportPrettyTable,TableStyle,CellStyleimportpandasaspdwarnings.filterwarnings('ignore')%pylabinlinepylab.rcParams['figure.figsize']=(12,6)plt.style.use('fivethirtyeight')
Populatingtheinteractivenamespacefromnumpyandmatplotlib
# In[4]: generate data with two classes — an imbalanced dataset (10%
# minority) — and project it to 2-D with PCA so it can be plotted.
X, y = make_classification(class_sep=1.2, weights=[0.1, 0.9], n_informative=3,
                           n_redundant=1, n_features=5, n_clusters_per_class=1,
                           n_samples=10000, flip_y=0, random_state=10)
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Relabel classes: '1' (majority, 90%) -> 'L', '0' (minority, 10%) -> 'S'.
y = y.astype('str')
y[y == '1'] = 'L'
y[y == '0'] = 'S'

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)

# Per-class views of the training data, used by the scatter plots below.
X_1, X_2 = X_train[y_train == 'S'], X_train[y_train == 'L']

# In[5]: scatter plot of the data.
# NOTE: zip(*X)[0] indexing requires Python 2, where zip returns a list.
plt.scatter(zip(*X_1)[0], zip(*X_1)[1], color='#1abc9c')
plt.scatter(zip(*X_2)[0], zip(*X_2)[1], color='#e67e22')

x_coords = zip(*X_1)[0] + zip(*X_2)[0]
y_coords = zip(*X_1)[1] + zip(*X_2)[1]
plt.axis([min(x_coords), max(x_coords), min(y_coords), max(y_coords)])

plt.title("Original Dataset")
plt.show()
In[6]: #FitaLogisticRegressionmodelclf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(X_train.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')
clf.fit(X_train,y_train)
coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_
x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]
In[7]: plt.scatter(zip(*X_1)[0],zip(*X_1)[1],color='#1abc9c')plt.scatter(zip(*X_2)[0],zip(*X_2)[1],color='#e67e22')
x_coords=zip(*X_1)[0]+zip(*X_2)[0]y_coords=zip(*X_1)[1]+zip(*X_2)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])
plt.title("OriginalDataset-FittedLogisticRegression")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()
In[8]: printclassification_report(y_test,clf.predict(X_test))
precisionrecallf1-scoresupport
L0.900.980.942680S0.390.120.19320
avg/total0.850.890.863000
In[10]: #LogisticRegressionwithbalancedclassweightsclf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2'],'class_weight':['balanced']}
cv=KFold(X_train.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')
clf.fit(X_train,y_train)
coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_
In[11]: x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]
plt.scatter(zip(*X_1)[0],zip(*X_1)[1],color='#1abc9c')plt.scatter(zip(*X_2)[0],zip(*X_2)[1],color='#e67e22')
x_coords=zip(*X_1)[0]+zip(*X_2)[0]y_coords=zip(*X_1)[1]+zip(*X_2)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])
plt.title("OriginalDataset-FittedLogisticRegression")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()
In[13]:
In[14]:
printclassification_report(y_test,clf.predict(X_test))
precision_recall_comparison
precisionrecallf1-scoresupport
L0.980.790.882680S0.340.890.49320
avg/total0.920.800.843000
Out[14]:
In[15]: #Randomundersamplingofmajorityclassus=RandomUnderSampler(ratio=0.5,random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)
print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))
clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')
clf.fit(X_train_res,y_train_res)
coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_
x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]
X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']
DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':1360,'S':680})
In[16]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')
x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])
plt.title("RandomUndersamplingofmajorityclass")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()
In[18]:
In[19]:
printclassification_report(y_test,clf.predict(X_test))
precision_recall_comparison
precisionrecallf1-scoresupport
L0.970.830.902680S0.360.820.50320
avg/total0.910.830.853000
Out[19]:
In[20]: #NearMiss1us=NearMiss(ratio=0.5,size_ngh=3,version=1,random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)
print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))
clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')
clf.fit(X_train_res,y_train_res)
coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_
x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]
X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']
DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':1360,'S':680})
In[21]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')
x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])
plt.title("NearMiss1")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()
In[23]:
In[24]:
printclassification_report(y_test,clf.predict(X_test))
precision_recall_comparison
precisionrecallf1-scoresupport
L0.920.950.942680S0.440.320.37320
avg/total0.870.880.883000
Out[24]:
In[25]: #NearMiss2us=NearMiss(ratio=0.5,size_ngh=3,version=2,random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)
print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))
clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')
clf.fit(X_train_res,y_train_res)
coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_
x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]
X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']
DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':1360,'S':680})
In[26]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')
x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])
plt.title("NearMiss2")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()
In[28]:
In[29]:
printclassification_report(y_test,clf.predict(X_test))
precision_recall_comparison
precisionrecallf1-scoresupport
L0.950.890.922680S0.390.600.47320
avg/total0.890.860.873000
Out[29]:
In[30]: #NearMiss3us=NearMiss(ratio=0.5,size_ngh=3,ver3_samp_ngh=3,version=3,random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)
print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))
clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')
clf.fit(X_train_res,y_train_res)
coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_
x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]
X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']
DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':964,'S':680})
In[31]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')
x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])
plt.title("NearMiss3")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()
In[33]:
In[34]:
printclassification_report(y_test,clf.predict(X_test))
precision_recall_comparison
precisionrecallf1-scoresupport
L0.910.970.942680S0.430.200.28320
avg/total0.860.890.873000
Out[34]:
In[35]: #CondensedNearestNeighborus=CondensedNearestNeighbour(random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)
print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))
clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')
clf.fit(X_train_res,y_train_res)
coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_
x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]
X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']
DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':882,'S':680})
In[36]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')
x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])
plt.title("CondensedNearestNeighbor")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()
In[38]:
In[39]:
printclassification_report(y_test,clf.predict(X_test))
precision_recall_comparison
precisionrecallf1-scoresupport
L0.930.940.932680S0.440.390.42320
avg/total0.880.880.883000
Out[39]:
In[40]: #EditedNearestNeighbor(ENN)us=EditedNearestNeighbours(size_ngh=5,random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)
print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))
clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')
clf.fit(X_train_res,y_train_res)
coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_
x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]
X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']
DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':5120,'S':680})
In[41]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')
x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])
plt.title("EditedNearestNeighbor")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()
In[43]:
In[44]:
printclassification_report(y_test,clf.predict(X_test))
precision_recall_comparison
precisionrecallf1-scoresupport
L0.950.900.922680S0.420.590.49320
avg/total0.890.870.883000
Out[44]:
In[45]: #RepeatedEditedNearestNeighborus=RepeatedEditedNearestNeighbours(size_ngh=5,random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)
print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))
clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')
clf.fit(X_train_res,y_train_res)
coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_
x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]
X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']
DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':4796,'S':680})
In[46]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')
x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])
plt.title("RepeatedEditedNearestNeighbor")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()
In[48]:
In[49]:
printclassification_report(y_test,clf.predict(X_test))
precision_recall_comparison
precisionrecallf1-scoresupport
L0.970.840.902680S0.380.800.51320
avg/total0.910.840.863000
Out[49]:
In[50]: #Tomeklinkremovalus=TomekLinks(random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)
print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))
clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')
clf.fit(X_train_res,y_train_res)
coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_
x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]
X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']
DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':6051,'S':680})
In[51]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')
x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])
plt.title("TomekLinkRemoval")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()
In[53]:
In[54]:
printclassification_report(y_test,clf.predict(X_test))
precision_recall_comparison
precisionrecallf1-scoresupport
L0.910.970.942680S0.440.210.29320
avg/total0.860.890.873000
Out[54]:
In[55]: #Randomoversamplingofminorityclassos=RandomOverSampler(ratio=0.5,random_state=1)X_train_res,y_train_res=os.fit_sample(X_train,y_train)
print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))
clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')
clf.fit(X_train_res,y_train_res)
coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_
x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]
X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']
DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':6320,'S':3160})
In[56]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')
x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])
plt.title("RandomOversamplingofminorityclass")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()
In[58]:
In[59]:
printclassification_report(y_test,clf.predict(X_test))
precision_recall_comparison
precisionrecallf1-scoresupport
L0.970.850.912680S0.380.760.51320
avg/total0.910.840.863000
Out[59]:
In[60]: #SyntheticMinorityOversamplingTechnique(SMOTE)os=SMOTE(ratio=0.5,k=5,random_state=1)X_train_res,y_train_res=os.fit_sample(X_train,y_train)
print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))
clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')
clf.fit(X_train_res,y_train_res)
coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_
x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]
X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']
DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':6320,'S':3160})
In[61]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')
x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])
plt.title("SMOTE")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()
In[63]:
In[64]:
printclassification_report(y_test,clf.predict(X_test))
precision_recall_comparison
precisionrecallf1-scoresupport
L0.970.850.912680S0.380.770.51320
avg/total0.910.840.863000
Out[64]:
In[65]: #SMOTE+Tomeklinkremovalos_us=SMOTETomek(ratio=0.5,k=5,random_state=1)X_train_res,y_train_res=os_us.fit_sample(X_train,y_train)
print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))
clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')
clf.fit(X_train_res,y_train_res)
coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_
x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]
X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']
DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':6050,'S':3160})
In[66]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')
x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])
plt.title("SMOTE+TomekLinkRemoval")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()
In[68]:
In[69]:
printclassification_report(y_test,clf.predict(X_test))
precision_recall_comparison
precisionrecallf1-scoresupport
L0.970.840.902680S0.380.800.51320
avg/total0.910.840.863000
Out[69]:
In[70]: #SMOTE+ENNos_us=SMOTEENN(ratio=0.5,k=5,size_ngh=5,random_state=1)X_train_res,y_train_res=os_us.fit_sample(X_train,y_train)
print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))
clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')
clf.fit(X_train_res,y_train_res)
coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_
x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]
X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']
DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':4894,'S':3160})
In[71]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')
x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])
plt.title("SMOTE+ENN")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()
In[73]:
In[74]:
printclassification_report(y_test,clf.predict(X_test))
precision_recall_comparison
precisionrecallf1-scoresupport
L0.990.780.872680S0.330.920.49320
avg/total0.920.790.833000
Out[74]:
In[75]: #EasyEnsembleens=EasyEnsemble()X_train_res,y_train_res=ens.fit_sample(X_train,y_train)
y_pred_proba=np.zeros(len(y_test))foridxinrange(len(y_train_res)):clf_base=AdaBoostClassifier()grid={'n_estimators':[10,50,100]}
cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')clf.fit(X_train_res[idx],y_train_res[idx])y_pred_proba+=zip(*clf.predict_proba(X_test))[0]y_pred_proba=y_pred_proba/len(y_train_res)y_pred=(y_pred_proba>0.5).astype(int)y_pred=y_pred.astype('str')y_pred[y_pred=='1']='L'y_pred[y_pred=='0']='S'
In[77]:
In[78]:
printclassification_report(y_test,y_pred)
precision_recall_comparison
precisionrecallf1-scoresupport
L0.980.800.882680S0.350.900.50320
avg/total0.920.810.843000
Out[78]:
In[79]: #BalanceCascade
#NotethisisnotatrueimplementationofBalancaCacadesincethelibrary#doesnotreturnthefittedclassifiersusedingeneratingthesubsetsens=BalanceCascade(classifier='adaboost',random_state=1)X_train_res,y_train_res=ens.fit_sample(X_train,y_train)
y_pred_proba=np.zeros(len(y_test))foridxinrange(len(y_train_res)):#imblearnusesAdaBoostClassiiferwithdefaulthyperparamstoselectsubsets#avariantimplementationwithgridsearchforeachsubsampleisshowninthenextcellclf=AdaBoostClassifier(random_state=1)clf.fit(X_train_res[idx],y_train_res[idx])y_pred_proba+=zip(*clf.predict_proba(X_test))[0]y_pred_proba=y_pred_proba/len(y_train_res)y_pred=(y_pred_proba>0.5).astype(int)y_pred=y_pred.astype('str')y_pred[y_pred=='1']='L'y_pred[y_pred=='0']='S'
In[81]:
In[82]:
printclassification_report(y_test,y_pred)
precision_recall_comparison
precisionrecallf1-scoresupport
L0.990.810.892680S0.360.910.51320
avg/total0.920.820.853000
Out[82]:
In[83]: #NotethisisnotatrueimplementationofBalancaCacadesincethelibrary#doesnotreturnthefittedclassifiersusedingeneratingthesubsetsens=BalanceCascade(classifier='adaboost',random_state=1)X_train_res,y_train_res=ens.fit_sample(X_train,y_train)
y_pred_proba=np.zeros(len(y_test))foridxinrange(len(y_train_res)):clf_base=AdaBoostClassifier(random_state=1)grid={'n_estimators':[10,50,100]}cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')clf.fit(X_train_res[idx],y_train_res[idx])y_pred_proba+=zip(*clf.predict_proba(X_test))[0]y_pred_proba=y_pred_proba/len(y_train_res)y_pred=(y_pred_proba>0.5).astype(int)y_pred=y_pred.astype('str')y_pred[y_pred=='1']='L'y_pred[y_pred=='0']='S'
In[85]:
In[86]:
printclassification_report(y_test,y_pred)
precision_recall_comparison
precisionrecallf1-scoresupport
L0.990.800.882680S0.350.900.50320
avg/total0.920.810.843000
Out[86]: