Problem: Predict whether there is rainfall at a location based on (processed) infrared satellite image information.
The dataset is courtesy of UC Irvine's Center for Hydrometeorology and Remote Sensing, including Dr. Soroosh Sorooshian, Dr. Xiaogang Gao, Dr. Kuo-lin Hsu, Dan Braithwaite, Yumeng Tau, and Negar Karbalee.
# Plot inline
%matplotlib inline
import numpy as np
from scipy import stats
import mltools as ml
import matplotlib.pyplot as plt
import pandas as pd
import project_utils as pjkt
# import sklearn # To be modified based on the exact learning algorithms required
np.random.seed(0) # Set seed to reproduce outputs
# Load raw features/targets; delimiter=None lets genfromtxt infer whitespace
X_train = np.genfromtxt('data/X_train.txt', delimiter = None)
X_test = np.genfromtxt('data/X_test.txt', delimiter = None)
Y_train = np.genfromtxt('data/Y_train.txt', delimiter = None)
# Verify data import - check number of rows (200k in train and test), columns (14)
print X_train.shape, Y_train.shape, X_test.shape
# Summary statistics of X_train (via project helper)
pjkt.print_summary_statistics(X_train)
# Summary statistics of Y_train
pjkt.print_summary_statistics(Y_train)
# Class balance: count rows for each unique label value
y_unique = np.unique(Y_train)
print '\nUnique Y values = ', np.unique(Y_train)
for i in range(len(np.unique(Y_train))):
print "Number of rows with Y =", int(y_unique[i]), "is", np.sum(Y_train[:,] == y_unique[i])
Note: for some features, the mean/median ordering (mean < median vs. mean > median) does not agree with the sign of the skew reported above — worth re-checking those features' distributions.
# Standardize features; the second return value (transform parameters) is discarded
X_train_scaled, _ = ml.transforms.rescale(X_train)
pjkt.print_summary_statistics(X_train_scaled) # Mean 0 and Variance 1
# Sanity checks: count None entries and NaNs after scaling
print np.sum(np.equal(X_train_scaled, None))
print np.sum(np.isnan(X_train_scaled))
Result: the scaled training data contains no missing (None) or NaN values.
# Class-conditional feature distributions and pairwise feature plots
pjkt.bar_plots(X_train_scaled, Y_train)
pjkt.pair_plots(X_train_scaled, Y_train, 0, 1)
pjkt.pair_plots(X_train_scaled, Y_train, 0, 2)
# Toggle to view output
# Convert numpy array into a pandas dataframe
temp = pd.DataFrame(X_train)
# Can change this to 0:14 to get entire dataframe; takes a lot of time
# NOTE(review): pd.tools.plotting.scatter_matrix is the pre-0.20 pandas
# location; newer pandas exposes pandas.plotting.scatter_matrix
axes = pd.tools.plotting.scatter_matrix(temp)
plt.tight_layout()
# Sequential (non-shuffled) split: first 10000 rows for training
xtr_sub, xte_sub, ytr_sub, yte_sub = pjkt.data_seq_split(X_train, Y_train, 10000)
print xtr_sub.shape, xte_sub.shape, ytr_sub.shape, yte_sub.shape
Alternatively, we could use ml.splitData(X, Y, train_fraction) to split the data, and ml.crossValidate() for cross-validation.
# Smoke-test the error-plotting helper with random data
e1 = np.random.normal(size=10)
e2 = np.random.normal(size=10)
k = np.arange(10) + 1
pjkt.plot_errors(e1, e2, k)
def plot_errors_semilog(errTrain, errValidate, x_axis, semi = 'semilogx'):
    """Plot training vs. validation error on a semi-log scale.

    errTrain / errValidate are sequences of error values matched to x_axis.
    semi selects 'semilogx' (log x-axis); any other value uses a log y-axis.
    Training error is drawn in red, validation error in green.
    """
    fig, ax = plt.subplots()
    ax.set_ylabel('Error')
    # Pick the semi-log plotting method once, then draw both curves with it
    draw = ax.semilogx if semi == 'semilogx' else ax.semilogy
    draw(np.asarray(x_axis), errTrain, 'r-', marker = 'o')
    draw(np.asarray(x_axis), errValidate, 'g-', marker = 'o')
    plt.title('Train (red) vs. Validation (green) error')
    plt.show()
# Compare log-x vs log-y views (uses the project_utils version of the helper,
# not the local def above)
pjkt.plot_errors_semilog(e1, e2, k, 'semilogx')
from random import randint
e1 = [randint(10, 200) for i in range(10)]
e2 = [randint(10, 200) for i in range(10)]
pjkt.plot_errors_semilog(e1, e2, k, 'semilogy')
# IPython magic: list variables in the interactive namespace
whos
from sklearn import svm
X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 150000, 200000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
## learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.7}, probability=True) - Submitted to Kaggle: 0.64796
learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.7})#, probability=True)
# Performs better on training and validation - not on kaggle 0.64470
learner.fit(xtr_sub, ytr_sub)
ytr_pred = learner.predict(xtr_sub)
yval_pred = learner.predict(xval_sub)
#ytest_pred_soft = learner.predict_proba(X_test_scaled)
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub), '\n'
print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
print len(ytr_pred[ytr_pred==1]), len(ytr_pred[ytr_pred==0])
print len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
print len(yval_pred[yval_pred==1]), len(yval_pred[yval_pred==0])
# Save the predictions in the format required by Kaggle - weights 0.5, 0.7
#np.savetxt('Yhat_svm_test.txt', np.vstack( (np.arange(len(ytest_pred_soft)) ,
# ytest_pred_soft[:, 1]) ).T,
# '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# Kaggle score - 0.64796
# Weights 0.5, 0.6
np.savetxt('Yhat_svm_test2.txt', np.vstack( (np.arange(len(ytest_pred_soft)) ,
ytest_pred_soft[:, 1]) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
To do - vary:
a) Amount of data used for training and validation
b) Features, the selection method, and their transforms
c) Kernel
d) Regularization parameters
e) Plot errors as a function of the above
Then upload the predictions to Kaggle.
# ROC curve for the weighted SVC on the training split, from decision-function
# scores (note: this refits the learner)
y_score = learner.fit(xtr_sub, ytr_sub).decision_function(xtr_sub)
from sklearn import metrics
fpr, tpr, _ = metrics.roc_curve(ytr_sub, y_score)
# NOTE(review): AUC here uses hard predictions (ytr_pred) while the curve uses
# soft scores -- the two numbers are not directly comparable
print metrics.roc_auc_score(ytr_sub, ytr_pred)
plt.plot(fpr, tpr)
# Diagonal chance line for reference
plt.plot(fpr, fpr)
#plt.show()
# SVC with 'balanced' class weights (inverse to class frequencies)
learner = svm.SVC(class_weight = 'balanced')
learner.fit(xtr_sub, ytr_sub)
ytr_pred = learner.predict(xtr_sub)
yval_pred = learner.predict(xval_sub)
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
print len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
# Ratio of positives to negatives in the training split
print float(len(ytr_sub[ytr_sub==1])/float((len(ytr_sub[ytr_sub==0]))))
# Default SVC on a small hand-picked feature subset
featlist = [0, 6, 12, 13]
learner = svm.SVC()
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
# feature 7 has a lot of zeros
# Default SVC on all features except index 6 (feature 7, 1-indexed)
featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]
learner = svm.SVC()
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
# Balanced subsampling: equal numbers of positive and negative examples
xtr_sub, ytr_sub = pjkt.balanced_subsample(X_train_scaled[:30000,], Y_train[:30000,], 1.0)
xval_sub, yval_sub = pjkt.balanced_subsample(X_train_scaled[30000:50000,], Y_train[30000:50000,], 1.0)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
learner = svm.SVC()
learner.fit(xtr_sub, ytr_sub)
ytr_pred = learner.predict(xtr_sub)
yval_pred = learner.predict(xval_sub)
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
print len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
# Linear SVM (liblinear) on a fresh 150k/50k sequential split
np.random.seed(0)
X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 150000, 50000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
learner = svm.LinearSVC(dual=False) # Balanced weighting not working out
learner.fit(xtr_sub, ytr_sub)
ytr_pred = learner.predict(xtr_sub)
yval_pred = learner.predict(xval_sub)
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
# LinearSVC on a ranked feature subset, smaller 20k/20k split
featlist = [0, 13, 8, 2, 12, 3, 9, 1]
#featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]
X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 20000, 40000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
learner = svm.LinearSVC(dual=False)
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
# Same linear SVM on balanced subsamples, all features except index 6
featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]
xtr_sub, ytr_sub = pjkt.balanced_subsample(X_train_scaled[:40000,], Y_train[:40000,], 1.0)
xval_sub, yval_sub = pjkt.balanced_subsample(X_train_scaled[40000:50000,], Y_train[40000:50000,], 1.0)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
learner = svm.LinearSVC(dual=False)
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
# Single-hidden-layer MLP (220 units, lbfgs solver) on a feature subset
from sklearn.neural_network import MLPClassifier
featlist = [0, 13, 8, 2, 12, 3, 9, 1]
#featlist = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 40000, 70000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
learner = MLPClassifier(hidden_layer_sizes=(220,), learning_rate='invscaling', solver='lbfgs')
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
# NOTE(review): predict_proba is called on all test columns although the model
# was fit on the 8-column featlist -- confirm the column counts match
ytest_pred_soft = learner.predict_proba(X_test_scaled)
# Kaggle for - 20k training, 10k validation
# learner = MLPClassifier(hidden_layer_sizes=(220,), learning_rate='invscaling', solver='lbfgs')
# Kaggle submission: ID plus P(rain) per test row
np.savetxt('Yhat_nn_test1.txt', np.vstack( (np.arange(len(ytest_pred_soft)) ,
ytest_pred_soft[:, 1]) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# ~0.60 kaggle score
# Same MLP on balanced subsamples, all features except index 6
featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]
xtr_sub, ytr_sub = pjkt.balanced_subsample(X_train_scaled[:30000,], Y_train[:30000,], 1.0)
xval_sub, yval_sub = pjkt.balanced_subsample(X_train_scaled[30000:50000,], Y_train[30000:50000,], 1.0)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
learner = MLPClassifier(hidden_layer_sizes=(220,), learning_rate='invscaling', solver='lbfgs')
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
# K-means clustering (K=2), treating the cluster id as a class label
X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 40000, 70000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
num_clusters = [2]
#initial = ['random', 'farthest', 'k++'] # Different initialization methods
initial = ['k++']
np.random.seed(0)
for i, k in enumerate(num_clusters):
for j in range(len(initial)):
# z: cluster assignments, c: centroids, sumd: sum of squared distances
z, c, sumd = ml.cluster.kmeans(X = xtr_sub, K = k, init = initial[j])
print str(k), " clusters with initialization:", initial[j]
print "Sum of squared Euclidean distances:", str(sumd)
print "Ratio of 1s and 0s (training) = ",len(z[z==1]), len(z[z==0])
print "Training error: ", float(np.sum(z != ytr_sub))/len(ytr_sub)
num_clusters = [2]
initial = ['k++']
np.random.seed(0)
featlist = [0, 13, 8, 2, 12]
for i, k in enumerate(num_clusters):
for j in range(len(initial)):
#z, T, soft, ll = ml.cluster.gmmEM(X = xtr_sub[:,featlist], K = k, init = initial[j])
#zval, T, soft, llval = ml.cluster.gmmEM(X = xval_sub[:,featlist], K = k, init = initial[j])
ztest, T, soft, lltest = ml.cluster.gmmEM(X = X_test_scaled, K = k, init = initial[j])
print str(k), " clusters with initialization: ", initial[j]
print "Log likelihood (training): ", str(ll), "\n"
print "Log likelihood (validation): ", str(llval), "\n"
z = 1 - z
zval = 1 - zval
ztest = 1 - ztest
#zval[zval==1] = 0
#zval[zval==0] = 1
print "Ratio of 1s and 0s = ",len(z[z==1]), len(z[z==0])
print "Ratio of 1s and 0s in original training= ",len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
print "Training error: ", float(np.sum(z != ytr_sub))/len(ytr_sub)
print "Ratio of 1s and 0s (validation) = ",len(zval[zval==1]), len(zval[zval==0])
print "Ratio of 1s and 0s in original validation= ",len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
print "Validation error: ", float(np.sum(zval != yval_sub))/len(yval_sub)
# Append the clusters to the training data
# (the cluster assignment becomes a 15th feature column)
check = np.reshape(z, newshape=(len(z), 1))
print check.shape
xtr_new = np.append(xtr_sub, check, 1)
print xtr_new.shape
check = np.reshape(zval, newshape=(len(zval), 1))
print check.shape
xval_new = np.append(xval_sub, check, 1)
print xval_new.shape
# Learn a svm and nn on the new training data
np.random.seed(0)
# Indices 0-14: original features (minus index 6) plus the cluster column (14)
featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14]
#featlist = [0, 13, 8, 2, 14]
# Doesn't have probabilities learner = svm.LinearSVC(dual=False)
# Too Slow don't run again - learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.6}, probability=True)
learner = MLPClassifier(hidden_layer_sizes=(220,), alpha=0.001)
#learner = MLPClassifier(hidden_layer_sizes=(220,), early_stopping=True)
learner.fit(xtr_new[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_new[:,featlist])
yval_pred = learner.predict(xval_new[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
# Learn a svm and nn on the new training data
# Same MLP, but using all 15 columns (feature 6 plus the cluster column)
np.random.seed(0)
featlist = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
#featlist = [0, 13, 8, 2, 14]
# Doesn't have probabilities learner = svm.LinearSVC(dual=False)
# Too Slow don't run again - learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.6}, probability=True)
learner2 = MLPClassifier(hidden_layer_sizes=(220,), alpha=0.001)
#learner = MLPClassifier(hidden_layer_sizes=(220,), early_stopping=True)
learner2.fit(xtr_new[:,featlist] , ytr_sub)
ytr_pred = learner2.predict(xtr_new[:,featlist])
yval_pred = learner2.predict(xval_new[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
# Build the matching test matrix: all 14 test features + test cluster column
featlist = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
check = np.reshape(ztest, newshape=(len(ztest), 1))
print check.shape
xtest_new = np.append(X_test_scaled[:, featlist], check, 1)
print xtest_new.shape
ytest_pred_soft = learner2.predict_proba(xtest_new)
# learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.6}, probability=True)
# Kaggle submission: ID plus P(rain) per test row
np.savetxt('Yhat_nncluster_test1.txt', np.vstack( (np.arange(len(ytest_pred_soft)) ,
ytest_pred_soft[:, 1]) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# Kaggle - 0.64715
# Quick per-feature scatter plots of feature value vs. label
for i in range(14):
plt.scatter(xtr_new[:,i], ytr_sub)
plt.title('Feature '+str(i+1))
plt.show()
#plt.scatter((X_train_scaled[:,0]), Y_train)
plt.scatter(np.exp(220 + xtr_new[:,6]), ytr_sub)
# Per-feature correlation with the label
corrList = []
for i in range(14):
print "Feature ", i
print np.corrcoef(xtr_new[:, i], ytr_sub)
# GraphLab Create section: load the same data as SFrames
import graphlab
sf_xtrain = graphlab.SFrame.read_csv('data/X_train.txt', header = False, delimiter=' ')
sf_xtest = graphlab.SFrame.read_csv('data/X_test.txt', header=False, delimiter=' ')
sf_ytrain = graphlab.SFrame.read_csv('data/Y_train.txt', header=False, delimiter=' ')
sf_xtrain.print_rows
graphlab.canvas.set_target('browser')
sf_xtrain.show()
# Append the labels as a 15th column (auto-named X15)
sf_alltrain = sf_xtrain.add_column(graphlab.SArray(Y_train))
sf_alltrain.print_rows
sf_alltrain['X15'] = sf_alltrain['X15'].astype(int)
# 100k / 50k sequential train / validation split
sf_subtrain = sf_alltrain[0:100000]
sf_subval = sf_alltrain[100000:150000]
print sf_subtrain.print_rows()
# GraphLab neural net classifier with default architecture
nn = graphlab.neuralnet_classifier.create(sf_subtrain, 'X15')
pred = nn.classify(sf_subtrain)
pred.print_rows(1000)
sf_valpred = nn.evaluate(sf_subval)
sf_valpred
nn.show()
sf_subtrain = sf_alltrain[0:150000]
sf_subval = sf_alltrain[150000:200000]
logistic = graphlab.logistic_classifier.create(sf_subtrain, 'X15', l2_penalty=0.5)
trainpred_log = logistic.classify(sf_subtrain)
valpred_log = logistic.classify(sf_subval)
sf_traineval_log = logistic.evaluate(sf_subtrain)
sf_valeval_log = logistic.evaluate(sf_subval)
print sf_traineval, sf_valeval
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred_log['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred_log['class'])
logistic.show()
testpred = logistic.classify(sf_xtest)
testpred.show()
temp = testpred
temp['prob2'] = temp[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
temp2 = trainpred_log
temp2['prob2'] = temp2[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
check = graphlab.SArray.to_numpy(temp['prob2'])
check_log = check
check_train = graphlab.SArray.to_numpy(temp2['prob2'])
check_train_log = check_train
# GraphLab random forest with default settings
rf = graphlab.random_forest_classifier.create(sf_subtrain, 'X15')
trainpred = rf.classify(sf_subtrain)
valpred = rf.classify(sf_subval)
sf_traineval = rf.evaluate(sf_subtrain)
sf_valeval = rf.evaluate(sf_subval)
print sf_traineval, sf_valeval
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred['class'])
rf.show()
testpred = rf.classify(sf_xtest)
testpred.show()
# Convert 'probability of the predicted class' into P(class == 1)
temp = testpred
temp['prob2'] = temp[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
temp2 = trainpred
temp2['prob2'] = temp2[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
check = graphlab.SArray.to_numpy(temp['prob2'])
check_rf = check
check_train = graphlab.SArray.to_numpy(temp2['prob2'])
check_train_rf = check_train
# 0.65124 Kaggle - Vanilla Random Forests
# rf = graphlab.random_forest_classifier.create(sf_subtrain, 'X15')
np.savetxt('Yhat_rfgraphlab_test1.txt', np.vstack( (np.arange(len(check)) ,
check) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# Tuned random forest on the 150k/50k split
sf_subtrain = sf_alltrain[0:150000]
sf_subval = sf_alltrain[150000:200000]
#featlist = ['X1', 'X14', 'X9', 'X3', 'X13', 'X4']
rf2 = graphlab.random_forest_classifier.create(sf_subtrain, 'X15',
max_iterations=30, min_child_weight = 10, validation_set=sf_subval
, random_seed = 0)
trainpred = rf2.classify(sf_subtrain)
valpred = rf2.classify(sf_subval)
sf_traineval = rf2.evaluate(sf_subtrain)
sf_valeval = rf2.evaluate(sf_subval)
print sf_traineval, sf_valeval
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred['class'])
rf2.show()
# Compare logistic vs. random-forest hard predictions on the validation set
#print valpred_log, valpred
logclass = graphlab.SArray.to_numpy(valpred_log['class'])
rfclass = graphlab.SArray.to_numpy(valpred['class'])
valclass = graphlab.SArray.to_numpy(sf_subval['X15'])
temp = np.column_stack((logclass, rfclass, valclass))
#bothclass = np.ndarray((len(logclass, )))
# 1 where each model agrees with the truth, else 0
log_vs_val = (temp[:, 0] == temp[:, 2]).astype(int)
rf_vs_val = (temp[:, 1] == temp[:, 2]).astype(int)
temp2 = np.column_stack((temp, log_vs_val, rf_vs_val))
print temp2.shape
print temp2
# Count rows where both models are simultaneously right or wrong
len(np.where(log_vs_val==rf_vs_val)[0])
# Simple OR-combination: predict 1 if either model predicts 1
final_val = np.maximum(logclass, rfclass)
print final_val
print "Validation error: ", float(np.sum(rfclass != valclass))/len(valclass)
# Stack the distinct rows of X_train (duplicate-row check; result not stored)
np.vstack({tuple(row) for row in X_train})
# AdaBoost over depth-7 decision trees (sklearn), 75/25 split via mltools
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc
dt_xtr, dt_xtest, dt_ytr, dt_ytest = ml.splitData(X_train, Y_train, 0.75)
# Base-tree hyperparameters
params={'criterion' : 'entropy',
'max_depth' : 7,
#'min_samples_split' : 350,
#'min_samples_leaf': 50,
'class_weight':'balanced'}
bdt = AdaBoostClassifier(DecisionTreeClassifier(**params),
algorithm="SAMME",
n_estimators=150)
bdt.fit(dt_xtr, dt_ytr)
# Mean accuracy on the held-out split
print bdt.score(dt_xtest,dt_ytest)
fpr = dict()
tpr = dict()
roc_auc = dict()
# ROC / AUC from the boosted model's class-1 probabilities
ypred2=bdt.predict_proba(dt_xtest)
fpr, tpr, _ = roc_curve(dt_ytest, ypred2[:,1])
roc_auc = auc(fpr, tpr)
plt.plot(fpr,tpr,'b-')
print roc_auc
# GraphLab linear SVM with manual class weights
sf_subtrain = sf_alltrain[0:150000]
sf_subval = sf_alltrain[150000:200000]
# NOTE(review): this rebinds the name 'svm', shadowing the sklearn 'svm' module
# imported earlier -- any later svm.SVC(...) call would fail after this point
svm = graphlab.svm_classifier.create(sf_subtrain, 'X15', max_iterations=70, class_weights={0:1, 1:1.5},
validation_set=sf_subval, convergence_threshold = 0.001)
trainpred_svm = svm.classify(sf_subtrain)
valpred_svm = svm.classify(sf_subval)
traineval_svm = svm.evaluate(sf_subtrain)
valeval_svm = svm.evaluate(sf_subval)
svm.show()
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred_svm['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred_svm['class'])
testpred = svm.classify(sf_xtest)
# No prob2 conversion here (the conversion used for the other models is disabled)
#temp = testpred
#temp['prob2'] = temp[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
testpred
# GraphLab gradient-boosted trees with row/column subsampling
bt = graphlab.boosted_trees_classifier.create(sf_subtrain, 'X15', max_depth=8, validation_set=sf_subval,
row_subsample=0.85, column_subsample = 0.6, random_seed=1)
trainpred_bt = bt.classify(sf_subtrain)
valpred_bt = bt.classify(sf_subval)
traineval_bt = bt.evaluate(sf_subtrain)
valeval_bt = bt.evaluate(sf_subval)
print traineval_bt, valeval_bt
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred_bt['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred_bt['class'])
bt.show()
testpred = bt.classify(sf_xtest)
# Convert 'probability of the predicted class' into P(class == 1)
temp = testpred
temp['prob2'] = temp[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
temp2 = trainpred_bt
temp2['prob2'] = temp2[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
check = graphlab.SArray.to_numpy(temp['prob2'])
check_bt = check
check_train = graphlab.SArray.to_numpy(temp2['prob2'])
check_train_bt = check_train
#np.savetxt('Yhat_btgraphlab_test1.txt', np.vstack( (np.arange(len(check)) ,
#            check) ).T,
#            '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
np.savetxt('Yhat_btgraphlab_test2.txt', np.vstack( (np.arange(len(check)) ,
check) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# Kaggle score - 0.64205
testpred.show()
print "Validation error: ", float(np.sum(graphlab.SArray.to_numpy(valpred_bt['class']) != valclass))/len(valclass)
# 5-fold cross-validation of boosted trees via GraphLab
sf_subtrain = sf_alltrain[0:150000]
sf_subval = sf_alltrain[150000:200000]
nFolds = 5; # Initialize number of folds to be 5
# NOTE(review): J is allocated but never filled -- the per-fold error
# bookkeeping below is still commented out
J = np.empty([ nFolds], dtype=float)
# Create an empty (M, 5) float array to store the 5-fold validation
# errors from the different models
folds = graphlab.cross_validation.KFold(sf_alltrain, 5)
for train, valid in folds:
m = graphlab.boosted_trees_classifier.create(train, target='X15', validation_set=None, max_depth=20)
print m.evaluate(valid)
#J[iFold] = float(np.sum(Yvi_pred != Yvi))/len(Yvi)
#Jmean = np.mean(J) # Overall estimated validation performance for each model
#print "Cross Validation error: \n", Jmean
#learner = svm (from scipy), svm (graphlab), bt, rf, logistic
# Persist all trained models to disk for reuse
from sklearn.externals import joblib
import pickle
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
# Save all the learners
joblib.dump(learner, 'svm.pkl')
svm.save('svm')
bt.save('bt')
rf.save('rf')
logistic.save('logistic')
valpred_bt.shape
# Build stacked-model feature matrices from the base models' hard predictions
# Only validation data
logclass = graphlab.SArray.to_numpy(valpred_log['class'])
rfclass = graphlab.SArray.to_numpy(valpred['class'])
btclass = graphlab.SArray.to_numpy(valpred_bt['class'])
svmclass = graphlab.SArray.to_numpy(valpred_svm['class'])
valclass = graphlab.SArray.to_numpy(sf_subval['X15'])
#temp = np.column_stack((logclass, rfclass, btclass, svmclass))
temp = np.column_stack((logclass, rfclass, btclass))
# Only training data
# NOTE(review): these rebind the same names, overwriting the validation-set
# versions built just above
logclass = graphlab.SArray.to_numpy(trainpred_log['class'])
rfclass = graphlab.SArray.to_numpy(trainpred['class'])
btclass = graphlab.SArray.to_numpy(trainpred_bt['class'])
svmclass = graphlab.SArray.to_numpy(trainpred_svm['class'])
valclass = graphlab.SArray.to_numpy(sf_subtrain['X15'])
#temp = np.column_stack((logclass, rfclass, btclass, svmclass))
temp = np.column_stack((logclass, rfclass, btclass))
# gnb = GaussianNB()
# Meta-learner fit on the base models' predictions
gnb = RandomForestClassifier(max_features=None)
# If NB on training data
temptrain, temptest, trainclass, testclass = pjkt.data_seq_split(temp, valclass, 120000, 150000)
# If NB on validation data
#temptrain, temptest, trainclass, testclass = pjkt.data_seq_split(temp, valclass, 35000, 50000)
gnb.fit(temptrain, trainclass)
print temptrain.shape, temptest.shape, temp.shape, trainclass.shape, testclass.shape
gnb_train_pred = gnb.predict(temptrain)
gnb_val_pred = gnb.predict(temptest)
gnb_all_pred = gnb.predict(temp)
print gnb_train_pred, trainclass
print "Training error: ", float(np.sum(gnb_train_pred != trainclass))/len(trainclass)
print "Validation error: ", float(np.sum(gnb_val_pred != testclass))/len(testclass), '\n'
print "All error: ", float(np.sum(gnb_all_pred != valclass))/len(valclass), '\n'
# Per-base-model error rates for comparison
print "SVM error: ", float(np.sum(svmclass != valclass))/len(valclass), '\n'
print "RF error: ", float(np.sum(rfclass != valclass))/len(valclass), '\n'
print "Logistic error: ", float(np.sum(logclass != valclass))/len(valclass), '\n'
print "BT error: ", float(np.sum(btclass != valclass))/len(valclass), '\n'
#print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
#print len(ytr_pred[ytr_pred==1]), len(ytr_pred[ytr_pred==0])
#print len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
#print len(yval_pred[yval_pred==1]), len(yval_pred[yval_pred==0])
Observation: the SVM is not adding anything to the ensemble's predictions, so it is dropped from the stacked models below.
# Majority vote over the three base models' training-set predictions
logtrain = graphlab.SArray.to_numpy(trainpred_log['class'])
rftrain = graphlab.SArray.to_numpy(trainpred['class'])
bttrain = graphlab.SArray.to_numpy(trainpred_bt['class'])
trains = np.column_stack((logtrain, rftrain, bttrain))
alltrainpred = np.empty(len(trainpred), dtype=float)
# Predict 1 when at least 2 of the 3 models predict 1
for i in range(len(alltrainpred)):
if(np.sum(trains[i]) >= 2):
alltrainpred[i] = 1
else:
alltrainpred[i] = 0
alltrainpred
print len(alltrainpred[alltrainpred==1]), len(alltrainpred[alltrainpred==0])
print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
# Soft-probability stack for the meta-learner (training rows)
temp = np.column_stack((check_train_log, check_train_bt, check_train_rf))
print check_train_bt, trainpred_bt
# Stack the three models' soft predictions (logistic, boosted trees, RF) with a
# Gaussian Naive Bayes meta-learner, trained on the first 100k training rows.
np.random.seed(0)
alltrainclass = graphlab.SArray.to_numpy(sf_subtrain['X15'])
temptrain, temptest, trainclass, testclass = pjkt.data_seq_split(temp, alltrainclass, 100000, 150000)
#gnb = RandomForestClassifier(max_features=None, min_samples_split=50, min_samples_leaf=50, max_depth=3)
gnb = GaussianNB()
gnb.fit(temptrain, trainclass)
gnb_train_pred = gnb.predict(temptrain)
gnb_val_pred = gnb.predict(temptest)
gnb_all_pred = gnb.predict(temp)
print "Training error: ", float(np.sum(gnb_train_pred != trainclass))/len(trainclass)
print "Validation error: ", float(np.sum(gnb_val_pred != testclass))/len(testclass)
print "All error: ", float(np.sum(gnb_all_pred != alltrainclass))/len(alltrainclass), '\n'
#y_score = learner.fit(xtr_sub, ytr_sub).decision_function(xtr_sub)
from sklearn import metrics
# BUG FIX: sklearn's roc_curve / roc_auc_score take (y_true, y_score) in that
# order; the arguments were swapped, which computed the ROC of the true labels
# scored by the predictions instead of the predictions against the truth.
fpr, tpr, _ = metrics.roc_curve(alltrainclass, gnb_all_pred)
print metrics.roc_auc_score(alltrainclass, gnb_all_pred)
plt.plot(fpr, tpr)
# Diagonal chance line for reference
plt.plot(fpr, fpr)
#plt.show()
# Final test-set ensemble: stack the three models' test-set P(class==1) columns
finalTest = np.column_stack((check_log, check_bt, check_rf))
finalTest
# Predict on all test data
finalPred = gnb.predict(finalTest)
finalPredSoft = gnb.predict_proba(finalTest)
print len(finalPred[finalPred==1]), len(finalPred[finalPred==0])
# Simple alternative: average the three models' probabilities per row
meanPredSoft = np.mean(finalTest, axis=1)
print meanPredSoft
finalPredSoft
#np.savetxt('Yhat_finalensemble_test1.txt', np.vstack( (np.arange(len(finalPredSoft)) ,
#            finalPredSoft[:, 1]) ).T,
#            '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# 0.61041 Kaggle using Random Forests
# Yet to test
#np.savetxt('Yhat_finalensemble_nb_test2.txt', np.vstack( (np.arange(len(finalPredSoft)) ,
#            finalPredSoft[:, 1]) ).T,
#            '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
#Kaggle score - 0.66042
#np.savetxt('Yhat_finalensemble_rf_test3.txt', np.vstack( (np.arange(len(finalPredSoft)) ,
#            finalPredSoft[:, 1]) ).T,
#            '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
#Kaggle score - 0.64273
# Submit the probability-averaging ensemble
np.savetxt('Yhat_finalensemble_mean_test4.txt', np.vstack( (np.arange(len(meanPredSoft)) ,
meanPredSoft) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',');