CS 273A Machine Learning Project

Team: Priyanka Ravi and Rahul Sridhar

Problem: Predict whether there is rainfall at a location based on (processed) infrared satellite image information.

The dataset is courtesy of UC Irvine's Center for Hydrometeorology and Remote Sensing, including Dr. Soroosh Sorooshian, Dr. Xiaogang Gao, Dr. Kuo-lin Hsu, Dan Braithwaite, Yumeng Tau, and Negar Karbalee.

In [1]:
# Plot inline
%matplotlib inline 
In [2]:
import numpy as np
from scipy import stats 
import mltools as ml
import matplotlib.pyplot as plt
import pandas as pd
import project_utils as pjkt
# import sklearn # To be modified based on the exact learning algorithms required
In [3]:
np.random.seed(0) # Set seed to reproduce outputs

Load the data

In [4]:
X_train = np.genfromtxt('data/X_train.txt', delimiter = None)
X_test = np.genfromtxt('data/X_test.txt', delimiter = None)
Y_train = np.genfromtxt('data/Y_train.txt', delimiter = None)
In [5]:
# Verify data import - check number of rows (200k in train and test), columns (14)
print X_train.shape, Y_train.shape, X_test.shape
(200000, 14) (200000,) (200000, 14)

Function to print summary statistics of the input data

In [6]:
pjkt.print_summary_statistics(X_train)
Size = 200000

Min = [ 193.       190.       214.97     205.42      10.         0.         0.
    0.         0.68146    0.         0.         0.         1.0074  -999.9    ]

Max = [   253.       250.5      252.5      252.5    17130.     12338.      9238.
     35.796     19.899     11.368     21.466     14.745    278.71     782.5  ]

Mean = [  241.7972204    228.22826005   241.79629755   233.64929865  2867.97959
   884.073295     173.553355       3.04719572     6.35196722     1.92523232
     4.29379349     2.80947178    10.36791465     7.8733445 ]

Median = [  243.5      229.5      242.76     233.32    1576.       179.         0.
     2.1555     6.0169     1.4382     3.7652     2.4721     4.4687     0.    ]

Variance = [       82.69497538        90.95784934        35.72575822
        95.26133023  10619471.14179957   3257046.13084505
    740659.83692065         7.42247988         6.33233079
         4.28450846         4.0468611          1.98219294
       166.68008517      1410.80384676]

Skewness = [-1.13265567 -0.99837376 -1.23096732 -0.17701892  1.73369199  3.81828478
  7.50159872  2.61068979  1.1083718   1.53662083  1.58038185  1.21605645
  6.18547305 -7.03908282]

Kurtosis = [   1.57114747    1.21442682    2.14380763   -0.08684285    2.89426892
   17.00858034   64.01137776   10.31285653    1.91229596    2.5101101
    4.07792073    2.15174142   77.17391705  330.71488398]
In [7]:
# Summary statistics of Y_train
pjkt.print_summary_statistics(Y_train)
y_unique = np.unique(Y_train)
print '\nUnique Y values = ', np.unique(Y_train)
for i in range(len(np.unique(Y_train))):
    print "Number of rows with Y =", int(y_unique[i]), "is", np.sum(Y_train[:,] == y_unique[i])
Size = 200000

Min = 0.0

Max = 1.0

Mean = 0.367245

Median = 0.0

Variance = 0.232377271861

Skewness = 0.550788855253

Kurtosis = -1.69663163693

Unique Y values =  [ 0.  1.]
Number of rows with Y = 0 is 126551
Number of rows with Y = 1 is 73449

Note: Mean < Median & Mean > Median relationships not aligning with skew for some features

Rescale the data

In [7]:
X_train_scaled, _ = ml.transforms.rescale(X_train)
In [9]:
pjkt.print_summary_statistics(X_train_scaled) # Mean 0 and Variance 1
Size = 200000

Min = [ -5.3660734   -4.00834898  -4.48818857  -2.89229677  -0.87701852
  -0.48986612  -0.20166229  -1.11847665  -2.25341508  -0.93010874
  -2.13443577  -1.99550489  -0.72503497 -26.83061548]

Max = [  1.23193365   2.33525947   1.79078887   1.93139133   4.37653791
   6.34663512  10.5325368   12.02048577   5.38348454   4.56194336
   8.53626795   8.47753843  20.78490383  20.62339716]

Mean = [ 0.  0.  0. -0.  0.  0. -0.  0.  0.  0.  0.  0. -0. -0.]

Median = [ 0.1872492   0.13334579  0.16123277 -0.03373904 -0.3964654  -0.390682
 -0.20166229 -0.32729792 -0.13315308 -0.23529265 -0.26276272 -0.23962762
 -0.45693395 -0.20961725]

Variance = [ 1.000005  1.000005  1.000005  1.000005  1.000005  1.000005  1.000005
  1.000005  1.000005  1.000005  1.000005  1.000005  1.000005  1.000005]

Skewness = [-1.13265567 -0.99837376 -1.23096732 -0.17701892  1.73369199  3.81828478
  7.50159872  2.61068979  1.1083718   1.53662083  1.58038185  1.21605645
  6.18547305 -7.03908282]

Kurtosis = [   1.57114747    1.21442682    2.14380763   -0.08684285    2.89426892
   17.00858034   64.01137776   10.31285653    1.91229596    2.5101101
    4.07792073    2.15174142   77.17391705  330.71488398]

Checking for missing/NaN values

In [10]:
print np.sum(np.equal(X_train_scaled, None))
print np.sum(np.isnan(X_train_scaled))
0
0

No missing or NaN values

Plotting

Stacked bar chart - each feature vs. target

In [13]:
pjkt.bar_plots(X_train_scaled, Y_train)
Y = 0 is red, Y = 1 is blue

Pairwise plots

In [14]:
pjkt.pair_plots(X_train_scaled, Y_train, 0, 1)
pjkt.pair_plots(X_train_scaled, Y_train, 0, 2)
# Toggle to view output
Y = 0 is red, Y = 1 is green
Y = 0 is red, Y = 1 is green

Pairwise plots with Pandas - experiment (warning - takes a lot of time)

In [43]:
# Convert numpy array into a pandas dataframe
temp = pd.DataFrame(X_train) 
# Can change this to 0:14 to get entire dataframe; takes a lot of time

axes = pd.tools.plotting.scatter_matrix(temp)
plt.tight_layout()

Data splits for training and validation

In [11]:
xtr_sub, xte_sub, ytr_sub, yte_sub = pjkt.data_seq_split(X_train, Y_train, 10000)
print xtr_sub.shape, xte_sub.shape, ytr_sub.shape, yte_sub.shape
(10000, 14) (190000, 14) (10000,) (190000,)

Alternatively, we could use ml.splitData(X, Y, train_fraction). ml.crossValidate() can be used for cross validation.

Error plotting

In [12]:
e1 = np.random.normal(size=10)
e2 = np.random.normal(size=10)
k = np.arange(10) + 1
pjkt.plot_errors(e1, e2, k)
In [256]:
def plot_errors_semilog(errTrain, errValidate, x_axis, semi = 'semilogx'):
    # Plot training vs. validation error
    figure, axis1 = plt.subplots() 
    axis1.set_ylabel('Error')
    if(semi == 'semilogx'):
        axis1.semilogx(np.asarray(x_axis), errTrain, 'r-', marker = 'o')
        axis1.semilogx(np.asarray(x_axis), errValidate, 'g-', marker = 'o')
    else:
        axis1.semilogy(np.asarray(x_axis), errTrain, 'r-', marker = 'o')
        axis1.semilogy(np.asarray(x_axis), errValidate, 'g-', marker = 'o')
    plt.title('Train (red) vs. Validation (green) error')
    plt.show()
In [13]:
pjkt.plot_errors_semilog(e1, e2, k, 'semilogx')

from random import randint
e1 = [randint(10, 200) for i in range(10)]
e2 = [randint(10, 200) for i in range(10)]
pjkt.plot_errors_semilog(e1, e2, k, 'semilogy')

View variables, modules, functions defined so far

In [14]:
whos
Variable         Type              Data/Info
--------------------------------------------
X_test           ndarray           200000x14: 2800000 elems, type `float64`, 22400000 bytes (21 Mb)
X_train          ndarray           200000x14: 2800000 elems, type `float64`, 22400000 bytes (21 Mb)
X_train_scaled   ndarray           200000x14: 2800000 elems, type `float64`, 22400000 bytes (21 Mb)
Y_train          ndarray           200000: 200000 elems, type `float64`, 1600000 bytes (1 Mb)
e1               list              n=10
e2               list              n=10
i                int               9
k                ndarray           10: 10 elems, type `int64`, 80 bytes
ml               module            <module 'mltools' from 'mltools/__init__.pyc'>
np               module            <module 'numpy' from '/Us<...>ages/numpy/__init__.pyc'>
pd               module            <module 'pandas' from '/U<...>ges/pandas/__init__.pyc'>
pjkt             module            <module 'project_utils' from 'project_utils.pyc'>
plt              module            <module 'matplotlib.pyplo<...>s/matplotlib/pyplot.pyc'>
randint          instancemethod    <bound method Random.rand<...>m object at 0x10085d620>>
stats            module            <module 'scipy.stats' fro<...>cipy/stats/__init__.pyc'>
xte_sub          ndarray           190000x14: 2660000 elems, type `float64`, 21280000 bytes (20 Mb)
xtr_sub          ndarray           10000x14: 140000 elems, type `float64`, 1120000 bytes (1 Mb)
y_unique         ndarray           2: 2 elems, type `float64`, 16 bytes
yte_sub          ndarray           190000: 190000 elems, type `float64`, 1520000 bytes (1 Mb)
ytr_sub          ndarray           10000: 10000 elems, type `float64`, 80000 bytes
In [ ]:
 

Model building

Experiment 1 - SVM

In [750]:
from sklearn import svm
In [881]:
X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 150000, 200000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
(150000, 14) (50000, 14) (150000,) (50000,)

Bare-bones SVM with class weighting

In [757]:
## learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.7}, probability=True) - Submitted to Kaggle: 0.64796
learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.7})#, probability=True) 
# Performs better on training and validation - not on kaggle 0.64470
learner.fit(xtr_sub, ytr_sub)
Out[757]:
SVC(C=1.0, cache_size=200, class_weight={0.0: 0.5, 1.0: 0.7}, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
In [760]:
ytr_pred = learner.predict(xtr_sub)
yval_pred = learner.predict(xval_sub)
#ytest_pred_soft = learner.predict_proba(X_test_scaled)
In [762]:
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub), '\n'
print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
print len(ytr_pred[ytr_pred==1]), len(ytr_pred[ytr_pred==0])

print len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
print len(yval_pred[yval_pred==1]), len(yval_pred[yval_pred==0])
Training error:  0.3034
Validation error:  0.3149 

7240 12760
4662 15338
3675 6325
2266 7734
In [11]:
# Save the predictions in the format required by Kaggle - weights 0.5, 0.7
#np.savetxt('Yhat_svm_test.txt', np.vstack( (np.arange(len(ytest_pred_soft)) , 
#                                          ytest_pred_soft[:, 1]) ).T, 
#           '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# Kaggle score - 0.64796

# Weights 0.5, 0.6
np.savetxt('Yhat_svm_test2.txt', np.vstack( (np.arange(len(ytest_pred_soft)) , 
                                          ytest_pred_soft[:, 1]) ).T, 
           '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');

To do - Vary:
a) Amount of data used for training and validation
b) Features, selection method and their transforms
c) Kernel
d) Regularization parameters
e) Plot errors as function of above Upload on Kaggle

Roc curve

In [766]:
y_score = learner.fit(xtr_sub, ytr_sub).decision_function(xtr_sub)

from sklearn import metrics

fpr, tpr, _ = metrics.roc_curve(ytr_sub, y_score)
print metrics.roc_auc_score(ytr_sub, ytr_pred)
plt.plot(fpr, tpr)
plt.plot(fpr, fpr)
#plt.show()
0.633072533296
Out[766]:
[<matplotlib.lines.Line2D at 0x1e4b27ed0>]

Balanced - SVM

In [27]:
learner = svm.SVC(class_weight = 'balanced')
learner.fit(xtr_sub, ytr_sub)
ytr_pred = learner.predict(xtr_sub)
yval_pred = learner.predict(xval_sub)
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
Out[27]:
SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
In [94]:
print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
print len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
print float(len(ytr_sub[ytr_sub==1])/float((len(ytr_sub[ytr_sub==0]))))
7240 12760
3675 6325
0.567398119122

Subset of features

In [34]:
featlist = [0, 6, 12, 13]
learner = svm.SVC()
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
Training error:  0.35305
Validation error:  0.3611
In [38]:
# feature 7 has a lot of zeros
featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]
learner = svm.SVC()
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
Training error:  0.3018
Validation error:  0.3161

Explicit balanced sampling

In [ ]:
xtr_sub, ytr_sub = pjkt.balanced_subsample(X_train_scaled[:30000,], Y_train[:30000,], 1.0)
xval_sub, yval_sub = pjkt.balanced_subsample(X_train_scaled[30000:50000,], Y_train[30000:50000,], 1.0)

print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape

learner = svm.SVC()
learner.fit(xtr_sub, ytr_sub)
ytr_pred = learner.predict(xtr_sub)
yval_pred = learner.predict(xval_sub)
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
In [83]:
print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
print len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
10915 10915
7367 7367

Linear SVC

In [751]:
np.random.seed(0)
X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 150000, 50000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape

learner = svm.LinearSVC(dual=False) # Balanced weighting not working out
learner.fit(xtr_sub, ytr_sub)
ytr_pred = learner.predict(xtr_sub)
yval_pred = learner.predict(xval_sub)
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
(20000, 14) (10000, 14) (20000,) (10000,)
Training error:  0.31685
Validation error:  0.326

Subset of features

In [41]:
featlist = [0, 13, 8, 2, 12, 3, 9, 1]
#featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]

X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 20000, 40000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape

learner = svm.LinearSVC(dual=False)
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
(20000, 14) (20000, 14) (20000,) (20000,)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-41-9a1dcfaa70f3> in <module>()
      7 print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
      8 
----> 9 learner = svm.LinearSVC(dual=False)
     10 learner.fit(xtr_sub[:,featlist] , ytr_sub)
     11 ytr_pred = learner.predict(xtr_sub[:,featlist])

NameError: name 'svm' is not defined

Explicit balanced sampling

In [71]:
featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]
xtr_sub, ytr_sub = pjkt.balanced_subsample(X_train_scaled[:40000,], Y_train[:40000,], 1.0)
xval_sub, yval_sub = pjkt.balanced_subsample(X_train_scaled[40000:50000,], Y_train[40000:50000,], 1.0)

print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape

learner = svm.LinearSVC(dual=False)
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
(29190, 14) (7374, 14) (29190,) (7374,)
Training error:  0.390167865707
Validation error:  0.39042582045
In [ ]:
 
In [ ]:
 

Experiment 2 - Neural Networks

In [202]:
from sklearn.neural_network import MLPClassifier
In [37]:
featlist = [0, 13, 8, 2, 12, 3, 9, 1]
#featlist = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 40000, 70000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape

learner = MLPClassifier(hidden_layer_sizes=(220,), learning_rate='invscaling', solver='lbfgs')
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
(40000, 14) (30000, 14) (40000,) (30000,)
Training error:  0.309525
Validation error:  0.3188
In [33]:
ytest_pred_soft = learner.predict_proba(X_test_scaled)
In [35]:
# Kaggle for - 20k training, 10k validation
# learner = MLPClassifier(hidden_layer_sizes=(220,), learning_rate='invscaling', solver='lbfgs')
np.savetxt('Yhat_nn_test1.txt', np.vstack( (np.arange(len(ytest_pred_soft)) , 
                                          ytest_pred_soft[:, 1]) ).T, 
           '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');

# ~0.60 kaggle score 

Explicit balanced sampling

In [42]:
featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]
xtr_sub, ytr_sub = pjkt.balanced_subsample(X_train_scaled[:30000,], Y_train[:30000,], 1.0)
xval_sub, yval_sub = pjkt.balanced_subsample(X_train_scaled[30000:50000,], Y_train[30000:50000,], 1.0)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape

learner = MLPClassifier(hidden_layer_sizes=(220,), learning_rate='invscaling', solver='lbfgs')
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
(21830, 14) (14734, 14) (21830,) (14734,)
Training error:  0.306825469537
Validation error:  0.358694176734
In [ ]:
 

Miscellaneous

Clustering

In [147]:
X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 40000, 70000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
(40000, 14) (30000, 14) (40000,) (30000,)
In [88]:
num_clusters = [2]
#initial = ['random', 'farthest', 'k++'] # Different initialization methods
initial = ['k++']
np.random.seed(0)

for i, k in enumerate(num_clusters):
    for j in range(len(initial)):
        z, c, sumd = ml.cluster.kmeans(X = xtr_sub, K = k, init = initial[j])
        print str(k), " clusters with initialization:", initial[j]
        print "Sum of squared Euclidean distances:", str(sumd)
        
print "Ratio of 1s and 0s (training) = ",len(z[z==1]), len(z[z==0])
print "Training error: ", float(np.sum(z != ytr_sub))/len(ytr_sub)
2  clusters with initialization: random
Sum of squared Euclidean distances: 400550.831045
In [190]:
num_clusters = [2]
initial = ['k++']
np.random.seed(0)
featlist = [0, 13, 8, 2, 12]

for i, k in enumerate(num_clusters):
    for j in range(len(initial)):
        #z, T, soft, ll = ml.cluster.gmmEM(X = xtr_sub[:,featlist], K = k, init = initial[j])
        #zval, T, soft, llval = ml.cluster.gmmEM(X = xval_sub[:,featlist], K = k, init = initial[j])
        ztest, T, soft, lltest = ml.cluster.gmmEM(X = X_test_scaled, K = k, init = initial[j])
        print str(k), " clusters with initialization: ", initial[j]
        print "Log likelihood (training): ", str(ll), "\n"
        print "Log likelihood (validation): ", str(llval), "\n"
z = 1 - z
zval = 1 - zval
ztest = 1 - ztest
#zval[zval==1] = 0
#zval[zval==0] = 1
        
print "Ratio of 1s and 0s = ",len(z[z==1]), len(z[z==0])
print "Ratio of 1s and 0s in original training= ",len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
print "Training error: ", float(np.sum(z != ytr_sub))/len(ytr_sub)
print "Ratio of 1s and 0s (validation) = ",len(zval[zval==1]), len(zval[zval==0])
print "Ratio of 1s and 0s in original validation= ",len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
print "Validation error: ", float(np.sum(zval != yval_sub))/len(yval_sub)
2  clusters with initialization:  k++
Log likelihood (training):  749677.42686 

Log likelihood (validation):  543676.114293 

Ratio of 1s and 0s =  29574 10426
Ratio of 1s and 0s in original training=  14595 25405
Training error:  0.644475
Ratio of 1s and 0s (validation) =  22089 7911
Ratio of 1s and 0s in original validation=  11018 18982
Validation error:  0.6421
In [182]:
# Append the clusters to the training data
check = np.reshape(z, newshape=(len(z), 1))
print check.shape
xtr_new = np.append(xtr_sub, check, 1)
print xtr_new.shape

check = np.reshape(zval, newshape=(len(zval), 1))
print check.shape
xval_new = np.append(xval_sub, check, 1)
print xval_new.shape
(40000, 1)
(40000, 15)
(30000, 1)
(30000, 15)
In [231]:
# Learn a svm and nn on the new training data
np.random.seed(0)
featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14]
#featlist = [0, 13, 8, 2, 14]
# Doesn't have probabilities learner = svm.LinearSVC(dual=False)
# Too Slow don't run again - learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.6}, probability=True) 
learner = MLPClassifier(hidden_layer_sizes=(220,), alpha=0.001)
#learner = MLPClassifier(hidden_layer_sizes=(220,), early_stopping=True)
learner.fit(xtr_new[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_new[:,featlist])
yval_pred = learner.predict(xval_new[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
Training error:  0.2987
Validation error:  0.310533333333
In [233]:
# Learn a svm and nn on the new training data
np.random.seed(0)
featlist = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
#featlist = [0, 13, 8, 2, 14]
# Doesn't have probabilities learner = svm.LinearSVC(dual=False)
# Too Slow don't run again - learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.6}, probability=True) 
learner2 = MLPClassifier(hidden_layer_sizes=(220,), alpha=0.001)
#learner = MLPClassifier(hidden_layer_sizes=(220,), early_stopping=True)
learner2.fit(xtr_new[:,featlist] , ytr_sub)
ytr_pred = learner2.predict(xtr_new[:,featlist])
yval_pred = learner2.predict(xval_new[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
Training error:  0.298975
Validation error:  0.3093
In [235]:
featlist = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
check = np.reshape(ztest, newshape=(len(ztest), 1))
print check.shape
xtest_new = np.append(X_test_scaled[:, featlist], check, 1)
print xtest_new.shape
ytest_pred_soft = learner2.predict_proba(xtest_new)
(200000, 1)
(200000, 15)
In [236]:
# learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.6}, probability=True) 
np.savetxt('Yhat_nncluster_test1.txt', np.vstack( (np.arange(len(ytest_pred_soft)) , 
                                          ytest_pred_soft[:, 1]) ).T, 
           '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# Kaggle - 0.64715
In [ ]:
 

Feature transformations

In [129]:
for i in range(14):
    plt.scatter(xtr_new[:,i], ytr_sub)
    plt.title('Feature '+str(i+1))
    plt.show()

#plt.scatter((X_train_scaled[:,0]), Y_train)
In [142]:
plt.scatter(np.exp(220 + xtr_new[:,6]), ytr_sub)
Out[142]:
<matplotlib.collections.PathCollection at 0x13161ef90>
In [136]:
corrList = []
for i in range(14):
    print "Feature ", i
    print np.corrcoef(xtr_new[:, i], ytr_sub)
Feature  0
[[ 1.         -0.22952349]
 [-0.22952349  1.        ]]
Feature  1
[[ 1.         -0.09297603]
 [-0.09297603  1.        ]]
Feature  2
[[ 1.         -0.11647034]
 [-0.11647034  1.        ]]
Feature  3
[[ 1.         -0.10206507]
 [-0.10206507  1.        ]]
Feature  4
[[ 1.          0.02198886]
 [ 0.02198886  1.        ]]
Feature  5
[[ 1.          0.05888135]
 [ 0.05888135  1.        ]]
Feature  6
[[ 1.         -0.00451683]
 [-0.00451683  1.        ]]
Feature  7
[[ 1.         -0.03667107]
 [-0.03667107  1.        ]]
Feature  8
[[ 1.          0.08914196]
 [ 0.08914196  1.        ]]
Feature  9
[[ 1.          0.07457092]
 [ 0.07457092  1.        ]]
Feature  10
[[ 1.          0.02064326]
 [ 0.02064326  1.        ]]
Feature  11
[[ 1.          0.08576368]
 [ 0.08576368  1.        ]]
Feature  12
[[ 1.         -0.10108739]
 [-0.10108739  1.        ]]
Feature  13
[[ 1.          0.10679203]
 [ 0.10679203  1.        ]]
In [ ]:
 

Graphlab

In [238]:
import graphlab
In [299]:
sf_xtrain = graphlab.SFrame.read_csv('data/X_train.txt', header = False, delimiter=' ')
sf_xtest = graphlab.SFrame.read_csv('data/X_test.txt', header=False, delimiter=' ')
sf_ytrain = graphlab.SFrame.read_csv('data/Y_train.txt', header=False, delimiter=' ')
Finished parsing file /Users/rahulsridhar/Documents/Courses/CS 273A ML/Project/Code/data/X_train.txt
Parsing completed. Parsed 100 lines in 0.843793 secs.
------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[float,float,float,float,float,float,float,float,float,float,float,float,float,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
Read 149796 lines. Lines per second: 155526
Finished parsing file /Users/rahulsridhar/Documents/Courses/CS 273A ML/Project/Code/data/X_train.txt
Parsing completed. Parsed 200000 lines in 1.04887 secs.
Finished parsing file /Users/rahulsridhar/Documents/Courses/CS 273A ML/Project/Code/data/X_test.txt
Parsing completed. Parsed 100 lines in 0.808939 secs.
------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[float,float,float,float,float,float,float,float,float,float,float,float,float,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
Finished parsing file /Users/rahulsridhar/Documents/Courses/CS 273A ML/Project/Code/data/X_test.txt
Parsing completed. Parsed 200000 lines in 1.27885 secs.
Finished parsing file /Users/rahulsridhar/Documents/Courses/CS 273A ML/Project/Code/data/Y_train.txt
Parsing completed. Parsed 100 lines in 0.044827 secs.
------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
Finished parsing file /Users/rahulsridhar/Documents/Courses/CS 273A ML/Project/Code/data/Y_train.txt
Parsing completed. Parsed 200000 lines in 0.053906 secs.
In [300]:
sf_xtrain.print_rows
Out[300]:
<bound method SFrame.print_rows of Columns:
	X1	float
	X2	float
	X3	float
	X4	float
	X5	float
	X6	float
	X7	float
	X8	float
	X9	float
	X10	float
	X11	float
	X12	float
	X13	float
	X14	float

Rows: 200000

Data:
+-------+-------+--------+--------+---------+--------+--------+---------+--------+
|   X1  |   X2  |   X3   |   X4   |    X5   |   X6   |   X7   |    X8   |   X9   |
+-------+-------+--------+--------+---------+--------+--------+---------+--------+
| 236.0 | 227.0 | 241.12 | 233.41 |  1292.0 | 341.0  |  0.0   |  1.5895 | 6.8949 |
| 235.0 | 227.0 | 241.62 | 232.05 |  8019.0 | 1691.0 |  0.0   |  1.0837 | 6.7184 |
| 238.0 | 224.0 | 237.62 | 230.35 |  3144.0 | 1570.0 |  0.0   |  1.3363 | 8.4899 |
| 222.5 | 206.0 | 228.5  | 217.11 | 11879.0 | 9258.0 | 2733.0 | 0.98712 | 9.9585 |
| 247.0 | 239.0 | 249.78 | 249.78 |  1002.0 |  0.0   |  0.0   |  2.6542 | 2.7207 |
| 240.0 | 230.0 | 243.41 | 234.17 |  3238.0 | 368.0  |  0.0   |  3.1392 | 5.7641 |
| 230.0 | 223.0 | 239.7  | 230.63 |  9476.0 | 3042.0 |  0.0   |  2.8204 | 7.5664 |
| 242.0 | 227.0 | 239.76 | 233.49 |  7440.0 | 2139.0 |  0.0   |  3.3187 | 5.8587 |
| 250.0 | 232.0 | 242.09 | 234.23 |  1006.0 | 140.0  |  0.0   |  2.1183 | 5.5077 |
| 230.0 | 224.0 | 239.37 | 230.42 |  1049.0 | 421.0  |  0.0   |  2.0987 | 8.5274 |
+-------+-------+--------+--------+---------+--------+--------+---------+--------+
+---------+--------+--------+--------+------+
|   X10   |  X11   |  X12   |  X13   | X14  |
+---------+--------+--------+--------+------+
|  1.6344 | 3.8364 | 3.0291 | 2.4833 | 2.1  |
|  2.196  | 2.2024 | 1.7191 | 2.0596 | 3.1  |
|  2.9214 | 3.3413 | 2.1083 | 1.4458 | 0.0  |
|  6.0723 | 3.0144 | 3.5979 | 1.169  | 0.0  |
|   0.0   | 3.1327 | 2.1344 |  20.0  | 0.0  |
| 0.97109 |  3.15  | 1.6813 | 5.2191 | 0.0  |
|  3.1682 | 2.3912 | 2.3238 | 1.6033 | 22.7 |
|  1.454  | 2.6661 | 2.2898 | 3.4905 | 0.0  |
| 0.70562 | 3.3933 | 1.5066 | 5.9754 | 0.0  |
|  2.7638 | 5.6143 | 2.8867 | 7.2383 | 22.9 |
+---------+--------+--------+--------+------+
[200000 rows x 14 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.>
In [301]:
graphlab.canvas.set_target('browser')
In [266]:
sf_xtrain.show()
Canvas is accessible via web browser at the URL: http://localhost:59750/index.html
Opening Canvas in default web browser.
In [302]:
sf_alltrain = sf_xtrain.add_column(graphlab.SArray(Y_train))
In [303]:
sf_alltrain.print_rows
Out[303]:
<bound method SFrame.print_rows of Columns:
	X1	float
	X2	float
	X3	float
	X4	float
	X5	float
	X6	float
	X7	float
	X8	float
	X9	float
	X10	float
	X11	float
	X12	float
	X13	float
	X14	float
	X15	float

Rows: 200000

Data:
+-------+-------+--------+--------+---------+--------+--------+---------+--------+
|   X1  |   X2  |   X3   |   X4   |    X5   |   X6   |   X7   |    X8   |   X9   |
+-------+-------+--------+--------+---------+--------+--------+---------+--------+
| 236.0 | 227.0 | 241.12 | 233.41 |  1292.0 | 341.0  |  0.0   |  1.5895 | 6.8949 |
| 235.0 | 227.0 | 241.62 | 232.05 |  8019.0 | 1691.0 |  0.0   |  1.0837 | 6.7184 |
| 238.0 | 224.0 | 237.62 | 230.35 |  3144.0 | 1570.0 |  0.0   |  1.3363 | 8.4899 |
| 222.5 | 206.0 | 228.5  | 217.11 | 11879.0 | 9258.0 | 2733.0 | 0.98712 | 9.9585 |
| 247.0 | 239.0 | 249.78 | 249.78 |  1002.0 |  0.0   |  0.0   |  2.6542 | 2.7207 |
| 240.0 | 230.0 | 243.41 | 234.17 |  3238.0 | 368.0  |  0.0   |  3.1392 | 5.7641 |
| 230.0 | 223.0 | 239.7  | 230.63 |  9476.0 | 3042.0 |  0.0   |  2.8204 | 7.5664 |
| 242.0 | 227.0 | 239.76 | 233.49 |  7440.0 | 2139.0 |  0.0   |  3.3187 | 5.8587 |
| 250.0 | 232.0 | 242.09 | 234.23 |  1006.0 | 140.0  |  0.0   |  2.1183 | 5.5077 |
| 230.0 | 224.0 | 239.37 | 230.42 |  1049.0 | 421.0  |  0.0   |  2.0987 | 8.5274 |
+-------+-------+--------+--------+---------+--------+--------+---------+--------+
+---------+--------+--------+--------+------+-----+
|   X10   |  X11   |  X12   |  X13   | X14  | X15 |
+---------+--------+--------+--------+------+-----+
|  1.6344 | 3.8364 | 3.0291 | 2.4833 | 2.1  | 1.0 |
|  2.196  | 2.2024 | 1.7191 | 2.0596 | 3.1  | 0.0 |
|  2.9214 | 3.3413 | 2.1083 | 1.4458 | 0.0  | 1.0 |
|  6.0723 | 3.0144 | 3.5979 | 1.169  | 0.0  | 1.0 |
|   0.0   | 3.1327 | 2.1344 |  20.0  | 0.0  | 0.0 |
| 0.97109 |  3.15  | 1.6813 | 5.2191 | 0.0  | 0.0 |
|  3.1682 | 2.3912 | 2.3238 | 1.6033 | 22.7 | 1.0 |
|  1.454  | 2.6661 | 2.2898 | 3.4905 | 0.0  | 0.0 |
| 0.70562 | 3.3933 | 1.5066 | 5.9754 | 0.0  | 0.0 |
|  2.7638 | 5.6143 | 2.8867 | 7.2383 | 22.9 | 0.0 |
+---------+--------+--------+--------+------+-----+
[200000 rows x 15 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.>
In [304]:
sf_alltrain['X15'] = sf_alltrain['X15'].astype(int)
In [307]:
sf_subtrain = sf_alltrain[0:100000]
sf_subval = sf_alltrain[100000:150000]
print sf_subtrain.print_rows()
+-------+-------+--------+--------+---------+--------+--------+---------+--------+
|   X1  |   X2  |   X3   |   X4   |    X5   |   X6   |   X7   |    X8   |   X9   |
+-------+-------+--------+--------+---------+--------+--------+---------+--------+
| 236.0 | 227.0 | 241.12 | 233.41 |  1292.0 | 341.0  |  0.0   |  1.5895 | 6.8949 |
| 235.0 | 227.0 | 241.62 | 232.05 |  8019.0 | 1691.0 |  0.0   |  1.0837 | 6.7184 |
| 238.0 | 224.0 | 237.62 | 230.35 |  3144.0 | 1570.0 |  0.0   |  1.3363 | 8.4899 |
| 222.5 | 206.0 | 228.5  | 217.11 | 11879.0 | 9258.0 | 2733.0 | 0.98712 | 9.9585 |
| 247.0 | 239.0 | 249.78 | 249.78 |  1002.0 |  0.0   |  0.0   |  2.6542 | 2.7207 |
| 240.0 | 230.0 | 243.41 | 234.17 |  3238.0 | 368.0  |  0.0   |  3.1392 | 5.7641 |
| 230.0 | 223.0 | 239.7  | 230.63 |  9476.0 | 3042.0 |  0.0   |  2.8204 | 7.5664 |
| 242.0 | 227.0 | 239.76 | 233.49 |  7440.0 | 2139.0 |  0.0   |  3.3187 | 5.8587 |
| 250.0 | 232.0 | 242.09 | 234.23 |  1006.0 | 140.0  |  0.0   |  2.1183 | 5.5077 |
| 230.0 | 224.0 | 239.37 | 230.42 |  1049.0 | 421.0  |  0.0   |  2.0987 | 8.5274 |
+-------+-------+--------+--------+---------+--------+--------+---------+--------+
+---------+--------+--------+--------+------+-----+
|   X10   |  X11   |  X12   |  X13   | X14  | X15 |
+---------+--------+--------+--------+------+-----+
|  1.6344 | 3.8364 | 3.0291 | 2.4833 | 2.1  |  1  |
|  2.196  | 2.2024 | 1.7191 | 2.0596 | 3.1  |  0  |
|  2.9214 | 3.3413 | 2.1083 | 1.4458 | 0.0  |  1  |
|  6.0723 | 3.0144 | 3.5979 | 1.169  | 0.0  |  1  |
|   0.0   | 3.1327 | 2.1344 |  20.0  | 0.0  |  0  |
| 0.97109 |  3.15  | 1.6813 | 5.2191 | 0.0  |  0  |
|  3.1682 | 2.3912 | 2.3238 | 1.6033 | 22.7 |  1  |
|  1.454  | 2.6661 | 2.2898 | 3.4905 | 0.0  |  0  |
| 0.70562 | 3.3933 | 1.5066 | 5.9754 | 0.0  |  0  |
|  2.7638 | 5.6143 | 2.8867 | 7.2383 | 22.9 |  0  |
+---------+--------+--------+--------+------+-----+
[100000 rows x 15 columns]

None

Neural Network

In [308]:
nn = graphlab.neuralnet_classifier.create(sf_subtrain, 'X15')
Using network:

### network layers ###
layer[0]: FullConnectionLayer
  init_sigma = 0.01
  init_random = gaussian
  init_bias = 0
  num_hidden_units = 10
layer[1]: SigmoidLayer
layer[2]: FullConnectionLayer
  init_sigma = 0.01
  init_random = gaussian
  init_bias = 0
  num_hidden_units = 2
layer[3]: SoftmaxLayer
### end network layers ###

### network parameters ###
learning_rate = 0.001
momentum = 0.9
### end network parameters ###

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

Creating neuralnet using cpu
Training with batch size = 100
+-----------+----------+--------------+-------------------+---------------------+-----------------+
| Iteration | Examples | Elapsed Time | Training-accuracy | Validation-accuracy | Examples/second |
+-----------+----------+--------------+-------------------+---------------------+-----------------+
| 1         | 95000    | 0.218257     | 0.632200          | 0.632049            | 435267.031250   |
| 2         | 95000    | 0.404762     | 0.632568          | 0.632049            | 509557.218750   |
| 3         | 95000    | 0.592937     | 0.632568          | 0.632049            | 505052.437500   |
| 4         | 95000    | 0.783011     | 0.632568          | 0.632049            | 499959.375000   |
| 5         | 95000    | 0.976144     | 0.632568          | 0.632049            | 492043.875000   |
| 6         | 95000    | 1.162144     | 0.632568          | 0.632049            | 510904.125000   |
| 7         | 95000    | 1.347743     | 0.632568          | 0.632049            | 512015.375000   |
| 8         | 95000    | 1.547084     | 0.632568          | 0.632049            | 476708.812500   |
| 9         | 95000    | 1.736059     | 0.632568          | 0.632049            | 502908.031250   |
| 10        | 95000    | 1.924420     | 0.632568          | 0.632049            | 504504.093750   |
+-----------+----------+--------------+-------------------+---------------------+-----------------+
In [322]:
pred = nn.classify(sf_subtrain)
pred.print_rows(1000)
+--------+-------+----------------+
| row_id | class |  probability   |
+--------+-------+----------------+
|   0    |   0   | 0.601708769798 |
|   1    |   0   | 0.601708769798 |
|   2    |   0   | 0.601708769798 |
|   3    |   0   | 0.601708769798 |
|   4    |   0   | 0.601708769798 |
|   5    |   0   | 0.601708769798 |
|   6    |   0   | 0.601708769798 |
|   7    |   0   | 0.601708769798 |
|   8    |   0   | 0.601708769798 |
|   9    |   0   | 0.601708769798 |
|   10   |   0   | 0.601708769798 |
|   11   |   0   | 0.601708769798 |
|   12   |   0   | 0.601708769798 |
|   13   |   0   | 0.601708769798 |
|   14   |   0   | 0.601708769798 |
|   15   |   0   | 0.601708769798 |
|   16   |   0   | 0.601707875729 |
|   17   |   0   | 0.601708769798 |
|   18   |   0   | 0.601708769798 |
|   19   |   0   | 0.601708769798 |
|   20   |   0   | 0.601708769798 |
|   21   |   0   | 0.692799866199 |
|   22   |   0   | 0.601708769798 |
|   23   |   0   | 0.601708769798 |
|   24   |   0   | 0.601708769798 |
|   25   |   0   | 0.601708769798 |
|   26   |   0   | 0.601708769798 |
|   27   |   0   | 0.65694963932  |
|   28   |   0   | 0.601708769798 |
|   29   |   0   | 0.601710140705 |
|   30   |   0   | 0.601708769798 |
|   31   |   0   | 0.601708769798 |
|   32   |   0   | 0.703370630741 |
|   33   |   0   | 0.601708769798 |
|   34   |   0   | 0.601708769798 |
|   35   |   0   | 0.601708769798 |
|   36   |   0   | 0.601708769798 |
|   37   |   0   | 0.601708769798 |
|   38   |   0   | 0.601708769798 |
|   39   |   0   | 0.601708769798 |
|   40   |   0   | 0.601708710194 |
|   41   |   0   | 0.601708769798 |
|   42   |   0   | 0.601708769798 |
|   43   |   0   | 0.601708769798 |
|   44   |   0   | 0.601708769798 |
|   45   |   0   | 0.705749750137 |
|   46   |   0   | 0.601708769798 |
|   47   |   0   | 0.601708769798 |
|   48   |   0   | 0.601708769798 |
|   49   |   0   | 0.601708769798 |
|   50   |   0   | 0.601708769798 |
|   51   |   0   | 0.601708769798 |
|   52   |   0   | 0.601708769798 |
|   53   |   0   | 0.601708769798 |
|   54   |   0   | 0.601708769798 |
|   55   |   0   | 0.601708650589 |
|   56   |   0   | 0.601708769798 |
|   57   |   0   | 0.601708710194 |
|   58   |   0   | 0.601708769798 |
|   59   |   0   | 0.601708769798 |
|   60   |   0   | 0.601708710194 |
|   61   |   0   | 0.601708769798 |
|   62   |   0   | 0.710985302925 |
|   63   |   0   | 0.601708769798 |
|   64   |   0   | 0.601708769798 |
|   65   |   0   | 0.71114051342  |
|   66   |   0   | 0.601708769798 |
|   67   |   0   | 0.711132824421 |
|   68   |   0   | 0.601708769798 |
|   69   |   0   | 0.601708769798 |
|   70   |   0   | 0.601708769798 |
|   71   |   0   | 0.601708769798 |
|   72   |   0   | 0.601708769798 |
|   73   |   0   | 0.601708769798 |
|   74   |   0   | 0.601708769798 |
|   75   |   0   | 0.675784647465 |
|   76   |   0   | 0.601708769798 |
|   77   |   0   | 0.60205757618  |
|   78   |   0   | 0.601708769798 |
|   79   |   0   | 0.711086153984 |
|   80   |   0   | 0.601708769798 |
|   81   |   0   | 0.601708769798 |
|   82   |   0   | 0.601708769798 |
|   83   |   0   | 0.601708769798 |
|   84   |   0   | 0.601708769798 |
|   85   |   0   | 0.710575640202 |
|   86   |   0   | 0.601708769798 |
|   87   |   0   | 0.601708769798 |
|   88   |   0   | 0.601708769798 |
|   89   |   0   | 0.601708769798 |
|   90   |   0   | 0.601708769798 |
|   91   |   0   | 0.601708769798 |
|   92   |   0   | 0.601705908775 |
|   93   |   0   | 0.601708769798 |
|   94   |   0   | 0.601708769798 |
|   95   |   0   | 0.601708769798 |
|   96   |   0   | 0.601862430573 |
|   97   |   0   | 0.711144387722 |
|   98   |   0   | 0.601708769798 |
|   99   |   0   | 0.601708769798 |
|  100   |   0   | 0.601708769798 |
|  101   |   0   | 0.601708769798 |
|  102   |   0   | 0.601708769798 |
|  103   |   0   | 0.601708769798 |
|  104   |   0   | 0.708188951015 |
|  105   |   0   | 0.601708710194 |
|  106   |   0   | 0.601708769798 |
|  107   |   0   | 0.601708769798 |
|  108   |   0   | 0.602389514446 |
|  109   |   0   | 0.709271371365 |
|  110   |   0   | 0.601708769798 |
|  111   |   0   | 0.601708769798 |
|  112   |   0   | 0.601708769798 |
|  113   |   0   | 0.601708769798 |
|  114   |   0   | 0.601708590984 |
|  115   |   0   | 0.601708769798 |
|  116   |   0   | 0.601708769798 |
|  117   |   0   | 0.601708650589 |
|  118   |   0   | 0.601708769798 |
|  119   |   0   | 0.601708769798 |
|  120   |   0   | 0.711139559746 |
|  121   |   0   | 0.711151242256 |
|  122   |   0   | 0.601708769798 |
|  123   |   0   | 0.601708769798 |
|  124   |   0   | 0.601708769798 |
|  125   |   0   | 0.601708769798 |
|  126   |   0   | 0.601708769798 |
|  127   |   0   | 0.601708710194 |
|  128   |   0   | 0.601708769798 |
|  129   |   0   | 0.601708769798 |
|  130   |   0   | 0.601707100868 |
|  131   |   0   | 0.601708769798 |
|  132   |   0   | 0.711142420769 |
|  133   |   0   | 0.626559317112 |
|  134   |   0   | 0.601708769798 |
|  135   |   0   | 0.601708769798 |
|  136   |   0   | 0.601708769798 |
|  137   |   0   | 0.601708769798 |
|  138   |   0   | 0.605115175247 |
|  139   |   0   | 0.601708710194 |
|  140   |   0   | 0.601708769798 |
|  141   |   0   | 0.601708710194 |
|  142   |   0   | 0.601708769798 |
|  143   |   0   | 0.601708769798 |
|  144   |   0   | 0.601708769798 |
|  145   |   0   | 0.601708650589 |
|  146   |   0   | 0.601708769798 |
|  147   |   0   | 0.601708769798 |
|  148   |   0   | 0.601708769798 |
|  149   |   0   | 0.602511882782 |
|  150   |   0   | 0.663577258587 |
|  151   |   0   | 0.601708769798 |
|  152   |   0   | 0.601708710194 |
|  153   |   0   | 0.699935853481 |
|  154   |   0   | 0.711049556732 |
|  155   |   0   | 0.601708769798 |
|  156   |   0   | 0.601706504822 |
|  157   |   0   | 0.601708769798 |
|  158   |   0   | 0.601708769798 |
|  159   |   0   | 0.606579899788 |
|  160   |   0   | 0.601708769798 |
|  161   |   0   | 0.601708769798 |
|  162   |   0   | 0.601708769798 |
|  163   |   0   | 0.601708769798 |
|  164   |   0   | 0.601708710194 |
|  165   |   0   | 0.601708769798 |
|  166   |   0   | 0.601708769798 |
|  167   |   0   | 0.601708769798 |
|  168   |   0   | 0.601708769798 |
|  169   |   0   | 0.601708769798 |
|  170   |   0   | 0.60170841217  |
|  171   |   0   | 0.601708769798 |
|  172   |   0   | 0.601708769798 |
|  173   |   0   | 0.601708769798 |
|  174   |   0   | 0.601708769798 |
|  175   |   0   | 0.60171097517  |
|  176   |   0   | 0.601708769798 |
|  177   |   0   | 0.601708590984 |
|  178   |   0   | 0.601708769798 |
|  179   |   0   | 0.601708054543 |
|  180   |   0   | 0.601708769798 |
|  181   |   0   | 0.601708769798 |
|  182   |   0   | 0.60170841217  |
|  183   |   0   | 0.601708769798 |
|  184   |   0   | 0.644745886326 |
|  185   |   0   | 0.601708769798 |
|  186   |   0   | 0.601708769798 |
|  187   |   0   | 0.71114385128  |
|  188   |   0   | 0.601708769798 |
|  189   |   0   | 0.601708769798 |
|  190   |   0   | 0.601707518101 |
|  191   |   0   | 0.707729458809 |
|  192   |   0   | 0.601708769798 |
|  193   |   0   | 0.601708769798 |
|  194   |   0   | 0.601708769798 |
|  195   |   0   | 0.601708769798 |
|  196   |   0   | 0.601708769798 |
|  197   |   0   | 0.601708769798 |
|  198   |   0   | 0.601708769798 |
|  199   |   0   | 0.601708769798 |
|  200   |   0   | 0.601708769798 |
|  201   |   0   | 0.601708769798 |
|  202   |   0   | 0.601708769798 |
|  203   |   0   | 0.601708769798 |
|  204   |   0   | 0.601708769798 |
|  205   |   0   | 0.601708769798 |
|  206   |   0   | 0.601708769798 |
|  207   |   0   | 0.601708769798 |
|  208   |   0   | 0.601708769798 |
|  209   |   0   | 0.709246098995 |
|  210   |   0   | 0.601708769798 |
|  211   |   0   | 0.601708769798 |
|  212   |   0   | 0.601708769798 |
|  213   |   0   | 0.601708769798 |
|  214   |   0   | 0.601708769798 |
|  215   |   0   | 0.601708769798 |
|  216   |   0   | 0.601708769798 |
|  217   |   0   | 0.710933506489 |
|  218   |   0   | 0.601708769798 |
|  219   |   0   | 0.711134016514 |
|  220   |   0   | 0.601708769798 |
|  221   |   0   | 0.601708769798 |
|  222   |   0   | 0.601708769798 |
|  223   |   0   | 0.601708769798 |
|  224   |   0   | 0.601708769798 |
|  225   |   0   | 0.601708769798 |
|  226   |   0   | 0.601708769798 |
|  227   |   0   | 0.601708710194 |
|  228   |   0   | 0.601708769798 |
|  229   |   0   | 0.601708769798 |
|  230   |   0   | 0.601708769798 |
|  231   |   0   | 0.601708769798 |
|  232   |   0   | 0.601708769798 |
|  233   |   0   | 0.601708769798 |
|  234   |   0   | 0.601708769798 |
|  235   |   0   | 0.601708769798 |
|  236   |   0   | 0.601708769798 |
|  237   |   0   | 0.601708769798 |
|  238   |   0   | 0.601708769798 |
|  239   |   0   | 0.601708769798 |
|  240   |   0   | 0.601708769798 |
|  241   |   0   | 0.601708769798 |
|  242   |   0   | 0.601708769798 |
|  243   |   0   | 0.601708769798 |
|  244   |   0   | 0.601708769798 |
|  245   |   0   | 0.601708769798 |
|  246   |   0   | 0.601708590984 |
|  247   |   0   | 0.601708769798 |
|  248   |   0   | 0.601803898811 |
|  249   |   0   | 0.711138010025 |
|  250   |   0   | 0.601708769798 |
|  251   |   0   | 0.601708769798 |
|  252   |   0   | 0.601708769798 |
|  253   |   0   | 0.711127877235 |
|  254   |   0   | 0.601708769798 |
|  255   |   0   | 0.601708769798 |
|  256   |   0   | 0.601708769798 |
|  257   |   0   | 0.601708769798 |
|  258   |   0   | 0.601708769798 |
|  259   |   0   | 0.601708769798 |
|  260   |   0   | 0.601708769798 |
|  261   |   0   | 0.601708769798 |
|  262   |   0   | 0.601708769798 |
|  263   |   0   | 0.601708769798 |
|  264   |   0   | 0.706897616386 |
|  265   |   0   | 0.601708769798 |
|  266   |   0   | 0.601708769798 |
|  267   |   0   | 0.601708769798 |
|  268   |   0   | 0.601708769798 |
|  269   |   0   | 0.707054376602 |
|  270   |   0   | 0.711016118526 |
|  271   |   0   | 0.601708769798 |
|  272   |   0   | 0.601708769798 |
|  273   |   0   | 0.601708769798 |
|  274   |   0   | 0.601708769798 |
|  275   |   0   | 0.601708769798 |
|  276   |   0   | 0.601708769798 |
|  277   |   0   | 0.601708769798 |
|  278   |   0   | 0.601708769798 |
|  279   |   0   | 0.71108096838  |
|  280   |   0   | 0.601708769798 |
|  281   |   0   | 0.601708769798 |
|  282   |   0   | 0.601708769798 |
|  283   |   0   | 0.601708769798 |
|  284   |   0   | 0.601708769798 |
|  285   |   0   | 0.601708769798 |
|  286   |   0   | 0.601708769798 |
|  287   |   0   | 0.601708769798 |
|  288   |   0   | 0.601708769798 |
|  289   |   0   | 0.711139380932 |
|  290   |   0   | 0.601708769798 |
|  291   |   0   | 0.601708769798 |
|  292   |   0   | 0.601708769798 |
|  293   |   0   | 0.601708769798 |
|  294   |   0   | 0.601708769798 |
|  295   |   0   | 0.601708769798 |
|  296   |   0   | 0.601708710194 |
|  297   |   0   | 0.601708710194 |
|  298   |   0   | 0.601708769798 |
|  299   |   0   | 0.71088296175  |
|  300   |   0   | 0.601708769798 |
|  301   |   0   | 0.601708769798 |
|  302   |   0   | 0.601708769798 |
|  303   |   0   | 0.601707935333 |
|  304   |   0   | 0.601708769798 |
|  305   |   0   | 0.601708769798 |
|  306   |   0   | 0.601708769798 |
|  307   |   0   | 0.601708292961 |
|  308   |   0   | 0.601708769798 |
|  309   |   0   | 0.601711750031 |
|  310   |   0   | 0.601708769798 |
|  311   |   0   | 0.601708769798 |
|  312   |   0   | 0.601708769798 |
|  313   |   0   | 0.601708769798 |
|  314   |   0   | 0.601708769798 |
|  315   |   0   | 0.601708769798 |
|  316   |   0   | 0.616813957691 |
|  317   |   0   | 0.601708769798 |
|  318   |   0   | 0.601708769798 |
|  319   |   0   | 0.601708769798 |
|  320   |   0   | 0.601708710194 |
|  321   |   0   | 0.709868729115 |
|  322   |   0   | 0.601708769798 |
|  323   |   0   | 0.601708769798 |
|  324   |   0   | 0.601708769798 |
|  325   |   0   | 0.601708769798 |
|  326   |   0   | 0.621520102024 |
|  327   |   0   | 0.601708769798 |
|  328   |   0   | 0.601708769798 |
|  329   |   0   | 0.601708710194 |
|  330   |   0   | 0.601708769798 |
|  331   |   0   | 0.601708769798 |
|  332   |   0   | 0.601708769798 |
|  333   |   0   | 0.711146652699 |
|  334   |   0   | 0.601708769798 |
|  335   |   0   | 0.601708769798 |
|  336   |   0   | 0.601708769798 |
|  337   |   0   | 0.601708769798 |
|  338   |   0   | 0.601708710194 |
|  339   |   0   | 0.601708769798 |
|  340   |   0   | 0.601708769798 |
|  341   |   0   | 0.601708769798 |
|  342   |   0   | 0.699524462223 |
|  343   |   0   | 0.601708769798 |
|  344   |   0   | 0.601708769798 |
|  345   |   0   | 0.601708710194 |
|  346   |   0   | 0.601708769798 |
|  347   |   0   | 0.601708769798 |
|  348   |   0   | 0.601708769798 |
|  349   |   0   | 0.601769685745 |
|  350   |   0   | 0.601708769798 |
|  351   |   0   | 0.608652412891 |
|  352   |   0   | 0.612375736237 |
|  353   |   0   | 0.601796865463 |
|  354   |   0   | 0.601708769798 |
|  355   |   0   | 0.601708590984 |
|  356   |   0   | 0.601708769798 |
|  357   |   0   | 0.601708769798 |
|  358   |   0   | 0.601708769798 |
|  359   |   0   | 0.601708769798 |
|  360   |   0   | 0.601708650589 |
|  361   |   0   | 0.601708650589 |
|  362   |   0   | 0.601708769798 |
|  363   |   0   | 0.601708769798 |
|  364   |   0   | 0.601707994938 |
|  365   |   0   | 0.601708769798 |
|  366   |   0   | 0.601708769798 |
|  367   |   0   | 0.601708769798 |
|  368   |   0   | 0.601707756519 |
|  369   |   0   | 0.601708769798 |
|  370   |   0   | 0.601708769798 |
|  371   |   0   | 0.601708769798 |
|  372   |   0   | 0.601708769798 |
|  373   |   0   | 0.601708769798 |
|  374   |   0   | 0.601708769798 |
|  375   |   0   | 0.690859675407 |
|  376   |   0   | 0.601708769798 |
|  377   |   0   | 0.601708769798 |
|  378   |   0   | 0.607296705246 |
|  379   |   0   | 0.601707339287 |
|  380   |   0   | 0.601708769798 |
|  381   |   0   | 0.601708590984 |
|  382   |   0   | 0.711107492447 |
|  383   |   0   | 0.689677238464 |
|  384   |   0   | 0.601708769798 |
|  385   |   0   | 0.601708769798 |
|  386   |   0   | 0.601708769798 |
|  387   |   0   | 0.601708769798 |
|  388   |   0   | 0.601708769798 |
|  389   |   0   | 0.601708769798 |
|  390   |   0   | 0.601708769798 |
|  391   |   0   | 0.601708769798 |
|  392   |   0   | 0.601708769798 |
|  393   |   0   | 0.601706385612 |
|  394   |   0   | 0.601708769798 |
|  395   |   0   | 0.711059331894 |
|  396   |   0   | 0.601708769798 |
|  397   |   0   | 0.601708769798 |
|  398   |   0   | 0.601708769798 |
|  399   |   0   | 0.614991128445 |
|  400   |   0   | 0.710083663464 |
|  401   |   0   | 0.601708769798 |
|  402   |   0   | 0.601708769798 |
|  403   |   0   | 0.601708769798 |
|  404   |   0   | 0.601708769798 |
|  405   |   0   | 0.601718068123 |
|  406   |   0   | 0.601708769798 |
|  407   |   0   | 0.601708769798 |
|  408   |   0   | 0.710230231285 |
|  409   |   0   | 0.601708769798 |
|  410   |   0   | 0.601708769798 |
|  411   |   0   | 0.709443330765 |
|  412   |   0   | 0.601708769798 |
|  413   |   0   | 0.711143732071 |
|  414   |   0   | 0.601711809635 |
|  415   |   0   | 0.601708710194 |
|  416   |   0   | 0.601708769798 |
|  417   |   0   | 0.601708769798 |
|  418   |   0   | 0.601708769798 |
|  419   |   0   | 0.601708769798 |
|  420   |   0   | 0.601708710194 |
|  421   |   0   | 0.601708769798 |
|  422   |   0   | 0.601708769798 |
|  423   |   0   | 0.601708769798 |
|  424   |   0   | 0.601708769798 |
|  425   |   0   | 0.601708769798 |
|  426   |   0   | 0.601708769798 |
|  427   |   0   | 0.650603353977 |
|  428   |   0   | 0.601708710194 |
|  429   |   0   | 0.601708769798 |
|  430   |   0   | 0.601708710194 |
|  431   |   0   | 0.601708769798 |
|  432   |   0   | 0.601708769798 |
|  433   |   0   | 0.601708769798 |
|  434   |   0   | 0.711120069027 |
|  435   |   0   | 0.601708769798 |
|  436   |   0   | 0.601708769798 |
|  437   |   0   | 0.601708769798 |
|  438   |   0   | 0.642663896084 |
|  439   |   0   | 0.601708769798 |
|  440   |   0   | 0.601708769798 |
|  441   |   0   | 0.601708769798 |
|  442   |   0   | 0.601708769798 |
|  443   |   0   | 0.601708769798 |
|  444   |   0   | 0.601708769798 |
|  445   |   0   | 0.601708769798 |
|  446   |   0   | 0.601708769798 |
|  447   |   0   | 0.601708769798 |
|  448   |   0   | 0.601708769798 |
|  449   |   0   | 0.706617474556 |
|  450   |   0   | 0.601708769798 |
|  451   |   0   | 0.601708471775 |
|  452   |   0   | 0.601708769798 |
|  453   |   0   | 0.601708769798 |
|  454   |   0   | 0.601708769798 |
|  455   |   0   | 0.601708769798 |
|  456   |   0   | 0.601708769798 |
|  457   |   0   | 0.610571682453 |
|  458   |   0   | 0.601708769798 |
|  459   |   0   | 0.601708769798 |
|  460   |   0   | 0.705986857414 |
|  461   |   0   | 0.71090555191  |
|  462   |   0   | 0.601708710194 |
|  463   |   0   | 0.601708769798 |
|  464   |   0   | 0.601708769798 |
|  465   |   0   | 0.601708769798 |
|  466   |   0   | 0.601708769798 |
|  467   |   0   | 0.710813343525 |
|  468   |   0   | 0.601708769798 |
|  469   |   0   | 0.601708769798 |
|  470   |   0   | 0.601711690426 |
|  471   |   0   | 0.601708769798 |
|  472   |   0   | 0.601708769798 |
|  473   |   0   | 0.602107048035 |
|  474   |   0   | 0.711129546165 |
|  475   |   0   | 0.610856354237 |
|  476   |   0   | 0.601708769798 |
|  477   |   0   | 0.601708769798 |
|  478   |   0   | 0.601708769798 |
|  479   |   0   | 0.601708769798 |
|  480   |   0   | 0.607582271099 |
|  481   |   0   | 0.601708769798 |
|  482   |   0   | 0.601708769798 |
|  483   |   0   | 0.601708769798 |
|  484   |   0   | 0.601708590984 |
|  485   |   0   | 0.601708471775 |
|  486   |   0   | 0.709988474846 |
|  487   |   0   | 0.601708769798 |
|  488   |   0   | 0.601708769798 |
|  489   |   0   | 0.601717352867 |
|  490   |   0   | 0.601708769798 |
|  491   |   0   | 0.601708769798 |
|  492   |   0   | 0.601708769798 |
|  493   |   0   | 0.601708769798 |
|  494   |   0   | 0.601708769798 |
|  495   |   0   | 0.601708769798 |
|  496   |   0   | 0.601708769798 |
|  497   |   0   | 0.601708650589 |
|  498   |   0   | 0.601708590984 |
|  499   |   0   | 0.601708769798 |
|  500   |   0   | 0.601708769798 |
|  501   |   0   | 0.601708769798 |
|  502   |   0   | 0.60170686245  |
|  503   |   0   | 0.601708769798 |
|  504   |   0   | 0.601708769798 |
|  505   |   0   | 0.601708769798 |
|  506   |   0   | 0.601708769798 |
|  507   |   0   | 0.601708769798 |
|  508   |   0   | 0.601708769798 |
|  509   |   0   | 0.601708769798 |
|  510   |   0   | 0.601708769798 |
|  511   |   0   | 0.611806094646 |
|  512   |   0   | 0.601708769798 |
|  513   |   0   | 0.601708769798 |
|  514   |   0   | 0.601708769798 |
|  515   |   0   | 0.601708233356 |
|  516   |   0   | 0.601708769798 |
|  517   |   0   | 0.601708769798 |
|  518   |   0   | 0.601708769798 |
|  519   |   0   | 0.601708769798 |
|  520   |   0   | 0.601708769798 |
|  521   |   0   | 0.601708769798 |
|  522   |   0   | 0.601708769798 |
|  523   |   0   | 0.601708769798 |
|  524   |   0   | 0.601708769798 |
|  525   |   0   | 0.601708710194 |
|  526   |   0   | 0.601708769798 |
|  527   |   0   | 0.709474563599 |
|  528   |   0   | 0.601708769798 |
|  529   |   0   | 0.601708769798 |
|  530   |   0   | 0.601708769798 |
|  531   |   0   | 0.601708710194 |
|  532   |   0   | 0.601708769798 |
|  533   |   0   | 0.601708769798 |
|  534   |   0   | 0.601708769798 |
|  535   |   0   | 0.602038681507 |
|  536   |   0   | 0.601708769798 |
|  537   |   0   | 0.601708590984 |
|  538   |   0   | 0.601708710194 |
|  539   |   0   | 0.601708769798 |
|  540   |   0   | 0.711128473282 |
|  541   |   0   | 0.601709008217 |
|  542   |   0   | 0.601708769798 |
|  543   |   0   | 0.601765036583 |
|  544   |   0   | 0.601708769798 |
|  545   |   0   | 0.601708769798 |
|  546   |   0   | 0.601708769798 |
|  547   |   0   | 0.601708769798 |
|  548   |   0   | 0.601708769798 |
|  549   |   0   | 0.601708769798 |
|  550   |   0   | 0.601708769798 |
|  551   |   0   | 0.601775765419 |
|  552   |   0   | 0.601708769798 |
|  553   |   0   | 0.601708710194 |
|  554   |   0   | 0.601708769798 |
|  555   |   0   | 0.601753652096 |
|  556   |   0   | 0.601708769798 |
|  557   |   0   | 0.601708710194 |
|  558   |   0   | 0.601708769798 |
|  559   |   0   | 0.601708769798 |
|  560   |   0   | 0.601708769798 |
|  561   |   0   | 0.601708769798 |
|  562   |   0   | 0.601708769798 |
|  563   |   0   | 0.601708769798 |
|  564   |   0   | 0.601708710194 |
|  565   |   0   | 0.601708769798 |
|  566   |   0   | 0.601708769798 |
|  567   |   0   | 0.711067378521 |
|  568   |   0   | 0.601708769798 |
|  569   |   0   | 0.601708769798 |
|  570   |   0   | 0.601708769798 |
|  571   |   0   | 0.601708769798 |
|  572   |   0   | 0.601708769798 |
|  573   |   0   | 0.601708769798 |
|  574   |   0   | 0.71113806963  |
|  575   |   0   | 0.601708769798 |
|  576   |   0   | 0.798796117306 |
|  577   |   0   | 0.601708710194 |
|  578   |   0   | 0.601708710194 |
|  579   |   0   | 0.601708769798 |
|  580   |   0   | 0.601708769798 |
|  581   |   0   | 0.601713597775 |
|  582   |   0   | 0.601708769798 |
|  583   |   0   | 0.601708769798 |
|  584   |   0   | 0.711003243923 |
|  585   |   0   | 0.601708769798 |
|  586   |   0   | 0.601708769798 |
|  587   |   0   | 0.601708769798 |
|  588   |   0   | 0.601708769798 |
|  589   |   0   | 0.711139440536 |
|  590   |   0   | 0.711128413677 |
|  591   |   0   | 0.601708769798 |
|  592   |   0   | 0.601708769798 |
|  593   |   0   | 0.703078985214 |
|  594   |   0   | 0.601708769798 |
|  595   |   0   | 0.601708769798 |
|  596   |   0   | 0.601708769798 |
|  597   |   0   | 0.601708769798 |
|  598   |   0   | 0.601708769798 |
|  599   |   0   | 0.601708769798 |
|  600   |   0   | 0.601708769798 |
|  601   |   0   | 0.601708769798 |
|  602   |   0   | 0.601708769798 |
|  603   |   0   | 0.601708769798 |
|  604   |   0   | 0.601708769798 |
|  605   |   0   | 0.605908513069 |
|  606   |   0   | 0.601708769798 |
|  607   |   0   | 0.601707935333 |
|  608   |   0   | 0.601708769798 |
|  609   |   0   | 0.601708769798 |
|  610   |   0   | 0.601708769798 |
|  611   |   0   | 0.711041212082 |
|  612   |   0   | 0.601708769798 |
|  613   |   0   | 0.711107552052 |
|  614   |   0   | 0.601708769798 |
|  615   |   0   | 0.601708471775 |
|  616   |   0   | 0.601708769798 |
|  617   |   0   | 0.601708769798 |
|  618   |   0   | 0.601708769798 |
|  619   |   0   | 0.601708769798 |
|  620   |   0   | 0.710059523582 |
|  621   |   0   | 0.601708769798 |
|  622   |   0   | 0.601707935333 |
|  623   |   0   | 0.710216641426 |
|  624   |   0   | 0.696136534214 |
|  625   |   0   | 0.601708769798 |
|  626   |   0   | 0.601708769798 |
|  627   |   0   | 0.601707756519 |
|  628   |   0   | 0.601707756519 |
|  629   |   0   | 0.711080670357 |
|  630   |   0   | 0.601708769798 |
|  631   |   0   | 0.601708769798 |
|  632   |   0   | 0.601708650589 |
|  633   |   0   | 0.601708769798 |
|  634   |   0   | 0.709804236889 |
|  635   |   0   | 0.601708769798 |
|  636   |   0   | 0.601708769798 |
|  637   |   0   | 0.601708650589 |
|  638   |   0   | 0.605356097221 |
|  639   |   0   | 0.601706147194 |
|  640   |   0   | 0.601708769798 |
|  641   |   0   | 0.601708769798 |
|  642   |   0   | 0.601708769798 |
|  643   |   0   | 0.601708769798 |
|  644   |   0   | 0.601708769798 |
|  645   |   0   | 0.601708769798 |
|  646   |   0   | 0.601708769798 |
|  647   |   0   | 0.601708769798 |
|  648   |   0   | 0.601708769798 |
|  649   |   0   | 0.601708769798 |
|  650   |   0   | 0.601708769798 |
|  651   |   0   | 0.601708769798 |
|  652   |   0   | 0.601708710194 |
|  653   |   0   | 0.601708769798 |
|  654   |   0   | 0.601708769798 |
|  655   |   0   | 0.601708769798 |
|  656   |   0   | 0.601708769798 |
|  657   |   0   | 0.601708769798 |
|  658   |   0   | 0.601807355881 |
|  659   |   0   | 0.601708769798 |
|  660   |   0   | 0.601708769798 |
|  661   |   0   | 0.601708769798 |
|  662   |   0   | 0.601708769798 |
|  663   |   0   | 0.601708769798 |
|  664   |   0   | 0.601708710194 |
|  665   |   0   | 0.601708769798 |
|  666   |   0   | 0.601708769798 |
|  667   |   0   | 0.601708769798 |
|  668   |   0   | 0.601708769798 |
|  669   |   0   | 0.601708769798 |
|  670   |   0   | 0.68138551712  |
|  671   |   0   | 0.711133897305 |
|  672   |   0   | 0.601708769798 |
|  673   |   0   | 0.601708769798 |
|  674   |   0   | 0.697129487991 |
|  675   |   0   | 0.601708769798 |
|  676   |   0   |  0.6868237257  |
|  677   |   0   | 0.708710670471 |
|  678   |   0   | 0.601708769798 |
|  679   |   0   | 0.601708769798 |
|  680   |   0   | 0.601708769798 |
|  681   |   0   | 0.601708769798 |
|  682   |   0   | 0.601708769798 |
|  683   |   0   | 0.601708769798 |
|  684   |   0   | 0.601708769798 |
|  685   |   0   | 0.601708769798 |
|  686   |   0   | 0.622830986977 |
|  687   |   0   | 0.711140036583 |
|  688   |   0   | 0.60170841217  |
|  689   |   0   | 0.601708769798 |
|  690   |   0   | 0.603902935982 |
|  691   |   0   | 0.601708769798 |
|  692   |   0   | 0.601708769798 |
|  693   |   0   | 0.601708769798 |
|  694   |   0   | 0.601708710194 |
|  695   |   0   | 0.601708769798 |
|  696   |   0   | 0.601708769798 |
|  697   |   0   | 0.601708769798 |
|  698   |   0   | 0.617759943008 |
|  699   |   0   | 0.601708769798 |
|  700   |   0   | 0.601708769798 |
|  701   |   0   | 0.601708769798 |
|  702   |   0   | 0.601708769798 |
|  703   |   0   | 0.601708769798 |
|  704   |   0   | 0.601708769798 |
|  705   |   0   | 0.601708710194 |
|  706   |   0   | 0.702386021614 |
|  707   |   0   | 0.71113461256  |
|  708   |   0   | 0.601708769798 |
|  709   |   0   | 0.601708769798 |
|  710   |   0   | 0.601708769798 |
|  711   |   0   | 0.601708590984 |
|  712   |   0   | 0.710793733597 |
|  713   |   0   | 0.601708769798 |
|  714   |   0   | 0.601708769798 |
|  715   |   0   | 0.678676307201 |
|  716   |   0   | 0.601708769798 |
|  717   |   0   | 0.601708769798 |
|  718   |   0   | 0.601708769798 |
|  719   |   0   | 0.601708769798 |
|  720   |   0   | 0.601708769798 |
|  721   |   0   | 0.601708769798 |
|  722   |   0   | 0.601708710194 |
|  723   |   0   | 0.601708769798 |
|  724   |   0   | 0.602865934372 |
|  725   |   0   | 0.601708769798 |
|  726   |   0   | 0.601708769798 |
|  727   |   0   | 0.601708769798 |
|  728   |   0   | 0.601723909378 |
|  729   |   0   | 0.601708769798 |
|  730   |   0   | 0.601708769798 |
|  731   |   0   | 0.601708769798 |
|  732   |   0   | 0.601708769798 |
|  733   |   0   | 0.601708710194 |
|  734   |   0   | 0.709831953049 |
|  735   |   0   | 0.601708769798 |
|  736   |   0   | 0.601708769798 |
|  737   |   0   | 0.601708769798 |
|  738   |   0   | 0.601716935635 |
|  739   |   0   | 0.601708769798 |
|  740   |   0   | 0.601708114147 |
|  741   |   0   | 0.601708769798 |
|  742   |   0   | 0.601708769798 |
|  743   |   0   | 0.601708769798 |
|  744   |   0   | 0.601707577705 |
|  745   |   0   | 0.601708769798 |
|  746   |   0   | 0.601708769798 |
|  747   |   0   | 0.601708650589 |
|  748   |   0   | 0.601708769798 |
|  749   |   0   | 0.601708769798 |
|  750   |   0   | 0.601708769798 |
|  751   |   0   | 0.601708769798 |
|  752   |   0   | 0.601708769798 |
|  753   |   0   | 0.601708769798 |
|  754   |   0   | 0.601708769798 |
|  755   |   0   | 0.601708769798 |
|  756   |   0   | 0.601708769798 |
|  757   |   0   | 0.601708769798 |
|  758   |   0   | 0.617472469807 |
|  759   |   0   | 0.601708769798 |
|  760   |   0   | 0.601708769798 |
|  761   |   0   | 0.601708769798 |
|  762   |   0   | 0.601708650589 |
|  763   |   0   | 0.601708769798 |
|  764   |   0   | 0.601708710194 |
|  765   |   0   | 0.601708769798 |
|  766   |   0   | 0.601708769798 |
|  767   |   0   | 0.601708769798 |
|  768   |   0   | 0.601708710194 |
|  769   |   0   | 0.601708769798 |
|  770   |   0   | 0.601708710194 |
|  771   |   0   | 0.601708769798 |
|  772   |   0   | 0.601708710194 |
|  773   |   0   | 0.601708769798 |
|  774   |   0   | 0.601708769798 |
|  775   |   0   | 0.601708769798 |
|  776   |   0   | 0.601708769798 |
|  777   |   0   | 0.604103326797 |
|  778   |   0   | 0.601708769798 |
|  779   |   0   | 0.601708769798 |
|  780   |   0   | 0.601708769798 |
|  781   |   0   | 0.601708769798 |
|  782   |   0   | 0.694732546806 |
|  783   |   0   | 0.711144328117 |
|  784   |   0   | 0.601708769798 |
|  785   |   0   | 0.601708769798 |
|  786   |   0   | 0.601708769798 |
|  787   |   0   | 0.601708769798 |
|  788   |   0   | 0.601708769798 |
|  789   |   0   | 0.601708769798 |
|  790   |   0   | 0.601708769798 |
|  791   |   0   | 0.601708769798 |
|  792   |   0   | 0.601708769798 |
|  793   |   0   | 0.601708769798 |
|  794   |   0   | 0.601708710194 |
|  795   |   0   | 0.601708769798 |
|  796   |   0   | 0.601708769798 |
|  797   |   0   | 0.601708590984 |
|  798   |   0   | 0.710192680359 |
|  799   |   0   | 0.601708769798 |
|  800   |   0   | 0.601708769798 |
|  801   |   0   | 0.601708769798 |
|  802   |   0   | 0.601708769798 |
|  803   |   0   | 0.601708769798 |
|  804   |   0   | 0.601708769798 |
|  805   |   0   | 0.601708769798 |
|  806   |   0   | 0.601708769798 |
|  807   |   0   | 0.601708710194 |
|  808   |   0   | 0.602081179619 |
|  809   |   0   | 0.601708769798 |
|  810   |   0   | 0.601708769798 |
|  811   |   0   | 0.601708769798 |
|  812   |   0   | 0.601708650589 |
|  813   |   0   | 0.601708769798 |
|  814   |   0   | 0.601708769798 |
|  815   |   0   | 0.601708650589 |
|  816   |   0   | 0.601708769798 |
|  817   |   0   | 0.601708769798 |
|  818   |   0   | 0.601708769798 |
|  819   |   0   | 0.601708769798 |
|  820   |   0   | 0.601708769798 |
|  821   |   0   | 0.601723313332 |
|  822   |   0   | 0.601708769798 |
|  823   |   0   | 0.601708769798 |
|  824   |   0   | 0.601708769798 |
|  825   |   0   | 0.601708769798 |
|  826   |   0   | 0.708353102207 |
|  827   |   0   | 0.601708769798 |
|  828   |   0   | 0.601708769798 |
|  829   |   0   | 0.601707518101 |
|  830   |   0   | 0.711142897606 |
|  831   |   0   | 0.601708769798 |
|  832   |   0   | 0.601708769798 |
|  833   |   0   | 0.601708769798 |
|  834   |   0   | 0.601708769798 |
|  835   |   0   | 0.601708769798 |
|  836   |   0   | 0.601708769798 |
|  837   |   0   | 0.601708769798 |
|  838   |   0   | 0.601708471775 |
|  839   |   0   | 0.601708769798 |
|  840   |   0   | 0.601708769798 |
|  841   |   0   | 0.601709365845 |
|  842   |   0   | 0.601708590984 |
|  843   |   0   | 0.601708769798 |
|  844   |   0   | 0.601708769798 |
|  845   |   0   | 0.601706922054 |
|  846   |   0   | 0.601708769798 |
|  847   |   0   | 0.601708769798 |
|  848   |   0   | 0.601708769798 |
|  849   |   0   | 0.601708769798 |
|  850   |   0   | 0.601708769798 |
|  851   |   0   | 0.601708769798 |
|  852   |   0   | 0.633008480072 |
|  853   |   0   | 0.601708769798 |
|  854   |   0   | 0.601708769798 |
|  855   |   0   | 0.709636151791 |
|  856   |   0   | 0.601708769798 |
|  857   |   0   | 0.601708710194 |
|  858   |   0   | 0.601708769798 |
|  859   |   0   | 0.601708769798 |
|  860   |   0   | 0.601708769798 |
|  861   |   0   | 0.601708769798 |
|  862   |   0   | 0.679934322834 |
|  863   |   0   | 0.601708769798 |
|  864   |   0   | 0.601708769798 |
|  865   |   0   | 0.601708769798 |
|  866   |   0   | 0.601708769798 |
|  867   |   0   | 0.601707696915 |
|  868   |   0   | 0.601708769798 |
|  869   |   0   | 0.601708769798 |
|  870   |   0   | 0.601708769798 |
|  871   |   0   | 0.601708769798 |
|  872   |   0   | 0.601708769798 |
|  873   |   0   | 0.601708769798 |
|  874   |   0   | 0.601708769798 |
|  875   |   0   | 0.604016840458 |
|  876   |   0   | 0.601708769798 |
|  877   |   0   | 0.601708769798 |
|  878   |   0   | 0.601708769798 |
|  879   |   0   | 0.601708769798 |
|  880   |   0   | 0.601708769798 |
|  881   |   0   | 0.601708769798 |
|  882   |   0   | 0.601708769798 |
|  883   |   0   | 0.601708769798 |
|  884   |   0   | 0.601708769798 |
|  885   |   0   | 0.601708769798 |
|  886   |   0   | 0.601708769798 |
|  887   |   0   | 0.601724386215 |
|  888   |   0   | 0.601708769798 |
|  889   |   0   | 0.683948278427 |
|  890   |   0   | 0.686399102211 |
|  891   |   0   | 0.711137592793 |
|  892   |   0   | 0.601708769798 |
|  893   |   0   | 0.601708769798 |
|  894   |   0   | 0.601708769798 |
|  895   |   0   | 0.601708710194 |
|  896   |   0   | 0.601708769798 |
|  897   |   0   | 0.601708769798 |
|  898   |   0   | 0.601707935333 |
|  899   |   0   | 0.601709783077 |
|  900   |   0   | 0.601708769798 |
|  901   |   0   | 0.601708769798 |
|  902   |   0   | 0.601708769798 |
|  903   |   0   | 0.601708769798 |
|  904   |   0   | 0.689202666283 |
|  905   |   0   | 0.601708769798 |
|  906   |   0   | 0.601708769798 |
|  907   |   0   | 0.601709187031 |
|  908   |   0   | 0.711139559746 |
|  909   |   0   | 0.601708769798 |
|  910   |   0   | 0.601708769798 |
|  911   |   0   | 0.601708769798 |
|  912   |   0   | 0.601708769798 |
|  913   |   0   | 0.601708769798 |
|  914   |   0   | 0.601708769798 |
|  915   |   0   | 0.601708769798 |
|  916   |   0   | 0.711101830006 |
|  917   |   0   | 0.601708769798 |
|  918   |   0   | 0.601708769798 |
|  919   |   0   | 0.601708769798 |
|  920   |   0   | 0.601708769798 |
|  921   |   0   | 0.601708769798 |
|  922   |   0   | 0.601708769798 |
|  923   |   0   | 0.601708769798 |
|  924   |   0   | 0.601708769798 |
|  925   |   0   | 0.601708590984 |
|  926   |   0   | 0.601708769798 |
|  927   |   0   | 0.601708769798 |
|  928   |   0   | 0.601708710194 |
|  929   |   0   | 0.601708769798 |
|  930   |   0   | 0.601708769798 |
|  931   |   0   | 0.601708590984 |
|  932   |   0   | 0.602406144142 |
|  933   |   0   | 0.601708769798 |
|  934   |   0   | 0.601708769798 |
|  935   |   0   | 0.71094506979  |
|  936   |   0   | 0.601708114147 |
|  937   |   0   | 0.601708769798 |
|  938   |   0   | 0.601708769798 |
|  939   |   0   | 0.601708769798 |
|  940   |   0   | 0.601708769798 |
|  941   |   0   | 0.601837575436 |
|  942   |   0   | 0.601708769798 |
|  943   |   0   | 0.601708769798 |
|  944   |   0   | 0.601708769798 |
|  945   |   0   | 0.601708769798 |
|  946   |   0   | 0.601708769798 |
|  947   |   0   | 0.610161483288 |
|  948   |   0   | 0.71114307642  |
|  949   |   0   | 0.601708710194 |
|  950   |   0   | 0.601708769798 |
|  951   |   0   | 0.601708769798 |
|  952   |   0   | 0.601708710194 |
|  953   |   0   | 0.601708769798 |
|  954   |   0   | 0.601708769798 |
|  955   |   0   | 0.601708769798 |
|  956   |   0   | 0.601708769798 |
|  957   |   0   | 0.601708710194 |
|  958   |   0   | 0.711146891117 |
|  959   |   0   | 0.601708769798 |
|  960   |   0   | 0.601708769798 |
|  961   |   0   | 0.601708769798 |
|  962   |   0   | 0.601708769798 |
|  963   |   0   | 0.601708769798 |
|  964   |   0   | 0.601708769798 |
|  965   |   0   | 0.601708769798 |
|  966   |   0   | 0.601708769798 |
|  967   |   0   | 0.601708769798 |
|  968   |   0   | 0.601708769798 |
|  969   |   0   | 0.67612016201  |
|  970   |   0   | 0.601708769798 |
|  971   |   0   | 0.601708769798 |
|  972   |   0   | 0.601708769798 |
|  973   |   0   | 0.601708769798 |
|  974   |   0   | 0.601708769798 |
|  975   |   0   | 0.601708769798 |
|  976   |   0   | 0.601708769798 |
|  977   |   0   | 0.601708769798 |
|  978   |   0   | 0.601708769798 |
|  979   |   0   | 0.601708769798 |
|  980   |   0   | 0.601708710194 |
|  981   |   0   | 0.601708769798 |
|  982   |   0   | 0.601708769798 |
|  983   |   0   | 0.601708769798 |
|  984   |   0   | 0.601708769798 |
|  985   |   0   | 0.601708769798 |
|  986   |   0   | 0.601708769798 |
|  987   |   0   | 0.710861384869 |
|  988   |   0   | 0.601708710194 |
|  989   |   0   | 0.601708769798 |
|  990   |   0   | 0.711148202419 |
|  991   |   0   | 0.601708769798 |
|  992   |   0   | 0.601708769798 |
|  993   |   0   | 0.601708769798 |
|  994   |   0   | 0.601707756519 |
|  995   |   0   | 0.601708769798 |
|  996   |   0   | 0.601708769798 |
|  997   |   0   | 0.601708769798 |
|  998   |   0   | 0.601708769798 |
|  999   |   0   | 0.601708769798 |
+--------+-------+----------------+
[100000 rows x 3 columns]

In [310]:
sf_valpred = nn.evaluate(sf_subval)
sf_valpred
Out[310]:
{'accuracy': 0.6335600018501282, 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 2
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        0        | 31678 |
 |      1       |        0        | 18322 |
 +--------------+-----------------+-------+
 [2 rows x 3 columns]}
In [318]:
nn.show()
Canvas is updated and available in a tab in the default browser.

Logistic regression

In [ ]:
sf_subtrain = sf_alltrain[0:150000]
sf_subval = sf_alltrain[150000:200000]
In [462]:
logistic = graphlab.logistic_classifier.create(sf_subtrain, 'X15', l2_penalty=0.5)
trainpred_log = logistic.classify(sf_subtrain)
valpred_log = logistic.classify(sf_subval)
sf_traineval_log = logistic.evaluate(sf_subtrain)
sf_valeval_log = logistic.evaluate(sf_subval)
PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

Logistic regression:
--------------------------------------------------------
Number of examples          : 142579
Number of classes           : 2
Number of feature columns   : 14
Number of unpacked features : 14
Number of coefficients    : 15
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Elapsed Time | Training-accuracy | Validation-accuracy |
+-----------+----------+--------------+-------------------+---------------------+
| 1         | 2        | 0.454812     | 0.675457          | 0.677806            |
| 2         | 3        | 0.771281     | 0.675485          | 0.676324            |
| 3         | 4        | 1.076713     | 0.675457          | 0.676324            |
| 4         | 5        | 1.363131     | 0.675457          | 0.676324            |
+-----------+----------+--------------+-------------------+---------------------+
SUCCESS: Optimal solution found.

In [463]:
print sf_traineval, sf_valeval
{'f1_score': 0.4371414825506088, 'auc': 0.7008313081228902, 'recall': 0.32172428820453225, 'precision': 0.6816975106767728, 'log_loss': 0.5967707929514143, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int

Rows: 100001

Data:
+-----------+-----+-----+-------+-------+
| threshold | fpr | tpr |   p   |   n   |
+-----------+-----+-----+-------+-------+
|    0.0    | 1.0 | 1.0 | 55072 | 94928 |
|   1e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   2e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   3e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   4e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   5e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   6e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   7e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   8e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   9e-05   | 1.0 | 1.0 | 55072 | 94928 |
+-----------+-----+-----+-------+-------+
[100001 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 4

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      0       |        0        | 86655 |
|      0       |        1        |  8273 |
|      1       |        0        | 37354 |
|      1       |        1        | 17718 |
+--------------+-----------------+-------+
[4 rows x 3 columns]
, 'accuracy': 0.69582} {'f1_score': 0.43383947939262474, 'auc': 0.6957934954250963, 'recall': 0.3210534907765141, 'precision': 0.6687825889820902, 'log_loss': 0.5997670642227892, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int

Rows: 1001

Data:
+-----------+-----+-----+-------+-------+
| threshold | fpr | tpr |   p   |   n   |
+-----------+-----+-----+-------+-------+
|    0.0    | 1.0 | 1.0 | 18377 | 31623 |
|   0.001   | 1.0 | 1.0 | 18377 | 31623 |
|   0.002   | 1.0 | 1.0 | 18377 | 31623 |
|   0.003   | 1.0 | 1.0 | 18377 | 31623 |
|   0.004   | 1.0 | 1.0 | 18377 | 31623 |
|   0.005   | 1.0 | 1.0 | 18377 | 31623 |
|   0.006   | 1.0 | 1.0 | 18377 | 31623 |
|   0.007   | 1.0 | 1.0 | 18377 | 31623 |
|   0.008   | 1.0 | 1.0 | 18377 | 31623 |
|   0.009   | 1.0 | 1.0 | 18377 | 31623 |
+-----------+-----+-----+-------+-------+
[1001 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 4

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      0       |        1        |  2922 |
|      1       |        0        | 12477 |
|      1       |        1        |  5900 |
|      0       |        0        | 28701 |
+--------------+-----------------+-------+
[4 rows x 3 columns]
, 'accuracy': 0.69202}
In [464]:
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred_log['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred_log['class'])
0.579818364572
0.579819529846
In [1028]:
logistic.show()
Canvas is accessible via web browser at the URL: http://localhost:59750/index.html
Opening Canvas in default web browser.
In [924]:
testpred = logistic.classify(sf_xtest)
testpred.show()
Canvas is accessible via web browser at the URL: http://localhost:59750/index.html
Opening Canvas in default web browser.
In [925]:
temp = testpred
temp['prob2'] = temp[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])

temp2 = trainpred_log
temp2['prob2'] = temp2[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
In [926]:
check = graphlab.SArray.to_numpy(temp['prob2'])
check_log = check

check_train = graphlab.SArray.to_numpy(temp2['prob2'])
check_train_log = check_train

Random Forests

In [372]:
rf = graphlab.random_forest_classifier.create(sf_subtrain, 'X15')
trainpred = rf.classify(sf_subtrain)
valpred = rf.classify(sf_subval)
sf_traineval = rf.evaluate(sf_subtrain)
sf_valeval = rf.evaluate(sf_subval)
PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

Random forest classifier:
--------------------------------------------------------
Number of examples          : 94876
Number of classes           : 2
Number of feature columns   : 14
Number of unpacked features : 14
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| Iteration | Elapsed Time | Training-accuracy | Validation-accuracy | Training-log_loss | Validation-log_loss |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| 1         | 0.117422     | 0.686496          | 0.676620            | 0.601860          | 0.609514            |
| 2         | 0.231075     | 0.692367          | 0.682084            | 0.598750          | 0.605040            |
| 3         | 0.342031     | 0.692198          | 0.681108            | 0.598544          | 0.604782            |
| 4         | 0.450370     | 0.692377          | 0.682084            | 0.597898          | 0.604125            |
| 5         | 0.562456     | 0.694496          | 0.683841            | 0.597620          | 0.603579            |
| 6         | 0.668845     | 0.695698          | 0.691062            | 0.596476          | 0.602367            |
| 10        | 1.063360     | 0.695202          | 0.693404            | 0.596358          | 0.601329            |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
In [373]:
print sf_traineval, sf_valeval
{'f1_score': 0.43560837452101964, 'auc': 0.7017076019467078, 'recall': 0.32016326530612244, 'precision': 0.681257599444155, 'log_loss': 0.5966123630493014, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int

Rows: 100001

Data:
+-----------+-----+-----+-------+-------+
| threshold | fpr | tpr |   p   |   n   |
+-----------+-----+-----+-------+-------+
|    0.0    | 1.0 | 1.0 | 36750 | 63250 |
|   1e-05   | 1.0 | 1.0 | 36750 | 63250 |
|   2e-05   | 1.0 | 1.0 | 36750 | 63250 |
|   3e-05   | 1.0 | 1.0 | 36750 | 63250 |
|   4e-05   | 1.0 | 1.0 | 36750 | 63250 |
|   5e-05   | 1.0 | 1.0 | 36750 | 63250 |
|   6e-05   | 1.0 | 1.0 | 36750 | 63250 |
|   7e-05   | 1.0 | 1.0 | 36750 | 63250 |
|   8e-05   | 1.0 | 1.0 | 36750 | 63250 |
|   9e-05   | 1.0 | 1.0 | 36750 | 63250 |
+-----------+-----+-----+-------+-------+
[100001 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 4

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      0       |        0        | 57745 |
|      0       |        1        |  5505 |
|      1       |        1        | 11766 |
|      1       |        0        | 24984 |
+--------------+-----------------+-------+
[4 rows x 3 columns]
, 'accuracy': 0.69511} {'f1_score': 0.4292321924144311, 'auc': 0.6942995940781368, 'recall': 0.31655932758432487, 'precision': 0.6664368608525796, 'log_loss': 0.5998504683475016, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int

Rows: 100001

Data:
+-----------+-----+-----+-------+-------+
| threshold | fpr | tpr |   p   |   n   |
+-----------+-----+-----+-------+-------+
|    0.0    | 1.0 | 1.0 | 18322 | 31678 |
|   1e-05   | 1.0 | 1.0 | 18322 | 31678 |
|   2e-05   | 1.0 | 1.0 | 18322 | 31678 |
|   3e-05   | 1.0 | 1.0 | 18322 | 31678 |
|   4e-05   | 1.0 | 1.0 | 18322 | 31678 |
|   5e-05   | 1.0 | 1.0 | 18322 | 31678 |
|   6e-05   | 1.0 | 1.0 | 18322 | 31678 |
|   7e-05   | 1.0 | 1.0 | 18322 | 31678 |
|   8e-05   | 1.0 | 1.0 | 18322 | 31678 |
|   9e-05   | 1.0 | 1.0 | 18322 | 31678 |
+-----------+-----+-----+-------+-------+
[100001 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 4

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      0       |        1        |  2903 |
|      0       |        0        | 28775 |
|      1       |        0        | 12522 |
|      1       |        1        |  5800 |
+--------------+-----------------+-------+
[4 rows x 3 columns]
, 'accuracy': 0.6915}
In [374]:
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred['class'])
0.602631049447
0.59795433189
In [375]:
rf.show()
Canvas is accessible via web browser at the URL: http://localhost:59750/index.html
Opening Canvas in default web browser.
In [927]:
testpred = rf.classify(sf_xtest)
In [928]:
testpred.show()
Canvas is accessible via web browser at the URL: http://localhost:59750/index.html
Opening Canvas in default web browser.
In [929]:
temp = testpred
temp['prob2'] = temp[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])

temp2 = trainpred
temp2['prob2'] = temp2[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
In [930]:
check = graphlab.SArray.to_numpy(temp['prob2'])
check_rf = check

check_train = graphlab.SArray.to_numpy(temp2['prob2'])
check_train_rf = check_train
In [420]:
# 0.65124 Kaggle - Vanilla Random Forests
# rf = graphlab.random_forest_classifier.create(sf_subtrain, 'X15')
np.savetxt('Yhat_rfgraphlab_test1.txt', np.vstack( (np.arange(len(check)) , 
                                          check) ).T, 
           '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
In [ ]:
 

Random forests experimentation

In [422]:
sf_subtrain = sf_alltrain[0:150000]
sf_subval = sf_alltrain[150000:200000]
In [445]:
#featlist = ['X1', 'X14', 'X9', 'X3', 'X13', 'X4']
rf2 = graphlab.random_forest_classifier.create(sf_subtrain, 'X15', 
                                               max_iterations=30, min_child_weight = 10, validation_set=sf_subval
                                              , random_seed = 0)
trainpred = rf2.classify(sf_subtrain)
valpred = rf2.classify(sf_subval)
sf_traineval = rf2.evaluate(sf_subtrain)
sf_valeval = rf2.evaluate(sf_subval)
Random forest classifier:
--------------------------------------------------------
Number of examples          : 150000
Number of classes           : 2
Number of feature columns   : 14
Number of unpacked features : 14
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| Iteration | Elapsed Time | Training-accuracy | Validation-accuracy | Training-log_loss | Validation-log_loss |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| 1         | 0.230538     | 0.683740          | 0.682520            | 0.608511          | 0.610511            |
| 2         | 0.414681     | 0.689253          | 0.687360            | 0.601690          | 0.604088            |
| 3         | 0.595938     | 0.694033          | 0.691300            | 0.598567          | 0.601582            |
| 4         | 0.769913     | 0.693947          | 0.690180            | 0.597894          | 0.600773            |
| 5         | 0.949722     | 0.694147          | 0.690400            | 0.597268          | 0.599949            |
| 6         | 1.128504     | 0.693527          | 0.691020            | 0.597977          | 0.600661            |
| 10        | 1.792744     | 0.694280          | 0.691580            | 0.597381          | 0.600317            |
| 11        | 1.975053     | 0.694013          | 0.691080            | 0.597717          | 0.600639            |
| 20        | 3.488528     | 0.695347          | 0.692480            | 0.596979          | 0.599864            |
| 30        | 5.127005     | 0.695820          | 0.692020            | 0.596771          | 0.599767            |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
In [447]:
print sf_traineval, sf_valeval
{'f1_score': 0.4371414825506088, 'auc': 0.7008313081228902, 'recall': 0.32172428820453225, 'precision': 0.6816975106767728, 'log_loss': 0.5967707929514143, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int

Rows: 100001

Data:
+-----------+-----+-----+-------+-------+
| threshold | fpr | tpr |   p   |   n   |
+-----------+-----+-----+-------+-------+
|    0.0    | 1.0 | 1.0 | 55072 | 94928 |
|   1e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   2e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   3e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   4e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   5e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   6e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   7e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   8e-05   | 1.0 | 1.0 | 55072 | 94928 |
|   9e-05   | 1.0 | 1.0 | 55072 | 94928 |
+-----------+-----+-----+-------+-------+
[100001 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 4

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      0       |        0        | 86655 |
|      0       |        1        |  8273 |
|      1       |        0        | 37354 |
|      1       |        1        | 17718 |
+--------------+-----------------+-------+
[4 rows x 3 columns]
, 'accuracy': 0.69582} {'f1_score': 0.43383947939262474, 'auc': 0.6957934954250963, 'recall': 0.3210534907765141, 'precision': 0.6687825889820902, 'log_loss': 0.5997670642227892, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int

Rows: 100001

Data:
+-----------+-----+-----+-------+-------+
| threshold | fpr | tpr |   p   |   n   |
+-----------+-----+-----+-------+-------+
|    0.0    | 1.0 | 1.0 | 18377 | 31623 |
|   1e-05   | 1.0 | 1.0 | 18377 | 31623 |
|   2e-05   | 1.0 | 1.0 | 18377 | 31623 |
|   3e-05   | 1.0 | 1.0 | 18377 | 31623 |
|   4e-05   | 1.0 | 1.0 | 18377 | 31623 |
|   5e-05   | 1.0 | 1.0 | 18377 | 31623 |
|   6e-05   | 1.0 | 1.0 | 18377 | 31623 |
|   7e-05   | 1.0 | 1.0 | 18377 | 31623 |
|   8e-05   | 1.0 | 1.0 | 18377 | 31623 |
|   9e-05   | 1.0 | 1.0 | 18377 | 31623 |
+-----------+-----+-----+-------+-------+
[100001 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 4

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      0       |        1        |  2922 |
|      1       |        0        | 12477 |
|      1       |        1        |  5900 |
|      0       |        0        | 28701 |
+--------------+-----------------+-------+
[4 rows x 3 columns]
, 'accuracy': 0.69202}
In [446]:
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred['class'])
0.603267835593
0.599493347228
In [448]:
rf2.show()
Canvas is accessible via web browser at the URL: http://localhost:59750/index.html
Opening Canvas in default web browser.
In [ ]:
 

Appending logistic and rf predictions together

In [483]:
#print valpred_log, valpred
logclass = graphlab.SArray.to_numpy(valpred_log['class'])
rfclass = graphlab.SArray.to_numpy(valpred['class'])
valclass = graphlab.SArray.to_numpy(sf_subval['X15'])
temp = np.column_stack((logclass, rfclass, valclass))
#bothclass = np.ndarray((len(logclass, )))
In [579]:
log_vs_val = (temp[:, 0] == temp[:, 2]).astype(int)
rf_vs_val = (temp[:, 1] == temp[:, 2]).astype(int)
temp2 = np.column_stack((temp, log_vs_val, rf_vs_val))
print temp2.shape
print temp2
len(np.where(log_vs_val==rf_vs_val)[0])
(50000, 5)
[[0 0 0 1 1]
 [0 0 0 1 1]
 [0 0 0 1 1]
 ..., 
 [0 0 1 0 0]
 [0 0 0 1 1]
 [0 0 1 0 0]]
Out[579]:
45796
In [592]:
final_val = np.maximum(logclass, rfclass)
print final_val
print "Validation error: ", float(np.sum(rfclass != valclass))/len(valclass)
[0 0 0 ..., 0 0 0]
Validation error:  0.30798

Unique rows in the data

In [362]:
np.vstack({tuple(row) for row in X_train})
Out[362]:
array([[ 246.    ,  235.    ,  244.66  , ...,    2.5391,   20.    ,    0.    ],
       [ 252.    ,  232.    ,  247.01  , ...,    3.2324,   20.    ,    0.    ],
       [ 239.    ,  232.    ,  242.61  , ...,    3.0288,   11.449 ,    0.    ],
       ..., 
       [ 253.    ,  235.    ,  243.54  , ...,    2.0576,   20.    ,    0.    ],
       [ 241.66  ,  226.    ,  242.27  , ...,    1.3927,    4.9498,    0.    ],
       [ 241.    ,  236.    ,  245.49  , ...,    1.5416,   20.    ,   38.8   ]])
In [ ]:
 

AdaBoost

In [601]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc
In [604]:
dt_xtr, dt_xtest, dt_ytr, dt_ytest = ml.splitData(X_train, Y_train, 0.75)
In [605]:
params={'criterion' : 'entropy',
        'max_depth' : 7,
       #'min_samples_split' : 350,
       #'min_samples_leaf': 50,
       'class_weight':'balanced'}

bdt = AdaBoostClassifier(DecisionTreeClassifier(**params),
                         algorithm="SAMME",
                         n_estimators=150)

bdt.fit(dt_xtr, dt_ytr)
print bdt.score(dt_xtest,dt_ytest)
fpr = dict()
tpr = dict()
roc_auc = dict()
ypred2=bdt.predict_proba(dt_xtest)
0.6598
In [606]:
fpr, tpr, _ = roc_curve(dt_ytest, ypred2[:,1])
roc_auc = auc(fpr, tpr)
plt.plot(fpr,tpr,'b-')
print roc_auc
0.703400132221
In [ ]:
 

SVM - not useful

In [777]:
sf_subtrain = sf_alltrain[0:150000]
sf_subval = sf_alltrain[150000:200000]
In [806]:
svm = graphlab.svm_classifier.create(sf_subtrain, 'X15', max_iterations=70, class_weights={0:1, 1:1.5}, 
                                     validation_set=sf_subval, convergence_threshold = 0.001)
trainpred_svm = svm.classify(sf_subtrain)
valpred_svm = svm.classify(sf_subval)
traineval_svm = svm.evaluate(sf_subtrain)
valeval_svm = svm.evaluate(sf_subval)
SVM:
--------------------------------------------------------
Number of examples          : 150000
Number of classes           : 2
Number of feature columns   : 14
Number of unpacked features : 14
Number of coefficients    : 15
Starting L-BFGS
--------------------------------------------------------
+-----------+----------+-----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Step size | Elapsed Time | Training-accuracy | Validation-accuracy |
+-----------+----------+-----------+--------------+-------------------+---------------------+
| 1         | 3        | 0.000007  | 0.330377     | 0.633700          | 0.633120            |
| 2         | 8        | 0.069077  | 0.872175     | 0.633867          | 0.633180            |
| 3         | 9        | 0.069077  | 1.096486     | 0.633967          | 0.633300            |
| 4         | 10       | 0.069077  | 1.311808     | 0.634393          | 0.633360            |
| 5         | 11       | 0.069077  | 1.525204     | 0.636147          | 0.635480            |
| 6         | 12       | 0.069077  | 1.723739     | 0.637547          | 0.636520            |
| 10        | 16       | 0.069077  | 2.550655     | 0.656447          | 0.655420            |
| 11        | 17       | 0.069077  | 2.766868     | 0.657273          | 0.656760            |
| 15        | 21       | 0.069077  | 3.648842     | 0.659380          | 0.658820            |
| 20        | 26       | 0.069077  | 4.744220     | 0.656227          | 0.656820            |
| 25        | 31       | 0.069077  | 5.877777     | 0.654033          | 0.654460            |
| 30        | 36       | 0.069077  | 6.878247     | 0.654387          | 0.654000            |
| 35        | 41       | 0.069077  | 7.943628     | 0.653607          | 0.653500            |
| 40        | 46       | 0.069077  | 8.934414     | 0.653407          | 0.653100            |
| 45        | 51       | 0.069077  | 9.989597     | 0.653013          | 0.652340            |
| 50        | 56       | 0.069077  | 10.980227    | 0.653273          | 0.652400            |
| 51        | 57       | 0.069077  | 11.183891    | 0.653340          | 0.652360            |
| 55        | 61       | 0.069077  | 11.936650    | 0.652380          | 0.651720            |
| 60        | 66       | 0.069077  | 12.915070    | 0.652907          | 0.652880            |
| 65        | 71       | 0.069077  | 13.982901    | 0.655573          | 0.656060            |
| 70        | 76       | 0.069077  | 14.947583    | 0.659480          | 0.659380            |
+-----------+----------+-----------+--------------+-------------------+---------------------+
TERMINATED: Iteration limit reached.
This model may not be optimal. To improve it, consider increasing `max_iterations`.
In [807]:
svm.show()
Canvas is accessible via web browser at the URL: http://localhost:59750/index.html
Opening Canvas in default web browser.
In [808]:
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred_svm['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred_svm['class'])
0.56565434265
0.565826081144
In [878]:
testpred = svm.classify(sf_xtest)
#temp = testpred
#temp['prob2'] = temp[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
In [879]:
testpred
Out[879]:
class
0
0
0
0
0
1
0
0
1
1
[200000 rows x 1 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [ ]:
 

Boosted trees

In [818]:
bt = graphlab.boosted_trees_classifier.create(sf_subtrain, 'X15', max_depth=8, validation_set=sf_subval, 
                                              row_subsample=0.85, column_subsample = 0.6, random_seed=1)
trainpred_bt = bt.classify(sf_subtrain)
valpred_bt = bt.classify(sf_subval)
traineval_bt = bt.evaluate(sf_subtrain)
valeval_bt = bt.evaluate(sf_subval)
Boosted trees classifier:
--------------------------------------------------------
Number of examples          : 150000
Number of classes           : 2
Number of feature columns   : 14
Number of unpacked features : 14
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| Iteration | Elapsed Time | Training-accuracy | Validation-accuracy | Training-log_loss | Validation-log_loss |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
| 1         | 0.242876     | 0.686453          | 0.678780            | 0.650155          | 0.652594            |
| 2         | 0.428974     | 0.694380          | 0.686000            | 0.623521          | 0.627618            |
| 3         | 0.627595     | 0.701993          | 0.693380            | 0.601988          | 0.608284            |
| 4         | 0.807614     | 0.704327          | 0.697580            | 0.591579          | 0.599223            |
| 5         | 1.025419     | 0.705473          | 0.697520            | 0.584681          | 0.593741            |
| 6         | 1.233451     | 0.707113          | 0.697960            | 0.579146          | 0.589718            |
| 10        | 1.933667     | 0.722260          | 0.708820            | 0.554972          | 0.570785            |
+-----------+--------------+-------------------+---------------------+-------------------+---------------------+
In [817]:
print traineval_bt, valeval_bt
{'f1_score': 0.5914221218961625, 'auc': 0.8290722367030955, 'recall': 0.4704419889502762, 'precision': 0.7961664329125759, 'log_loss': 0.505644897670549, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int

Rows: 100001

Data:
+-----------+-----+-----+------+-------+
| threshold | fpr | tpr |  p   |   n   |
+-----------+-----+-----+------+-------+
|    0.0    | 1.0 | 1.0 | 7240 | 12760 |
|   1e-05   | 1.0 | 1.0 | 7240 | 12760 |
|   2e-05   | 1.0 | 1.0 | 7240 | 12760 |
|   3e-05   | 1.0 | 1.0 | 7240 | 12760 |
|   4e-05   | 1.0 | 1.0 | 7240 | 12760 |
|   5e-05   | 1.0 | 1.0 | 7240 | 12760 |
|   6e-05   | 1.0 | 1.0 | 7240 | 12760 |
|   7e-05   | 1.0 | 1.0 | 7240 | 12760 |
|   8e-05   | 1.0 | 1.0 | 7240 | 12760 |
|   9e-05   | 1.0 | 1.0 | 7240 | 12760 |
+-----------+-----+-----+------+-------+
[100001 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 4

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      0       |        1        |  872  |
|      0       |        0        | 11888 |
|      1       |        1        |  3406 |
|      1       |        0        |  3834 |
+--------------+-----------------+-------+
[4 rows x 3 columns]
, 'accuracy': 0.7647} {'f1_score': 0.4734636871508379, 'auc': 0.7151508053023574, 'recall': 0.3689795918367347, 'precision': 0.6604968339016074, 'log_loss': 0.5850111528423768, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int

Rows: 1001

Data:
+-----------+-----+-----+------+------+
| threshold | fpr | tpr |  p   |  n   |
+-----------+-----+-----+------+------+
|    0.0    | 1.0 | 1.0 | 3675 | 6325 |
|   0.001   | 1.0 | 1.0 | 3675 | 6325 |
|   0.002   | 1.0 | 1.0 | 3675 | 6325 |
|   0.003   | 1.0 | 1.0 | 3675 | 6325 |
|   0.004   | 1.0 | 1.0 | 3675 | 6325 |
|   0.005   | 1.0 | 1.0 | 3675 | 6325 |
|   0.006   | 1.0 | 1.0 | 3675 | 6325 |
|   0.007   | 1.0 | 1.0 | 3675 | 6325 |
|   0.008   | 1.0 | 1.0 | 3675 | 6325 |
|   0.009   | 1.0 | 1.0 | 3675 | 6325 |
+-----------+-----+-----+------+------+
[1001 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 4

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      1       |        1        |  1356 |
|      1       |        0        |  2319 |
|      0       |        0        |  5628 |
|      0       |        1        |  697  |
+--------------+-----------------+-------+
[4 rows x 3 columns]
, 'accuracy': 0.6984}
In [744]:
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred_bt['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred_bt['class'])
0.684977051906
0.609060643704
In [1027]:
bt.show()
Canvas is accessible via web browser at the URL: http://localhost:59750/index.html
Opening Canvas in default web browser.
In [936]:
testpred = bt.classify(sf_xtest)
temp = testpred
temp['prob2'] = temp[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])

temp2 = trainpred_bt
temp2['prob2'] = temp2[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
In [937]:
check = graphlab.SArray.to_numpy(temp['prob2'])
check_bt = check

check_train = graphlab.SArray.to_numpy(temp2['prob2'])
check_train_bt = check_train
In [674]:
#np.savetxt('Yhat_btgraphlab_test1.txt', np.vstack( (np.arange(len(check)) , 
#                                          check) ).T, 
#           '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
np.savetxt('Yhat_btgraphlab_test2.txt', np.vstack( (np.arange(len(check)) , 
                                          check) ).T, 
           '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# Kaggle score - 0.64205
In [748]:
testpred.show()
Canvas is updated and available in a tab in the default browser.
In [675]:
print "Validation error: ", float(np.sum(graphlab.SArray.to_numpy(valpred_bt['class']) != valclass))/len(valclass)
Validation error:  0.28148
In [ ]:
 

Cross validation

In [ ]:
sf_subtrain = sf_alltrain[0:150000]
sf_subval = sf_alltrain[150000:200000]
In [647]:
nFolds = 5; # Initialize number of folds to be 5
J = np.empty([ nFolds], dtype=float) 
# Create an empty (M, 5) float array to store the 5-fold validation
# errors from the different models
folds = graphlab.cross_validation.KFold(sf_alltrain, 5)

for train, valid in folds:
    m = graphlab.boosted_trees_classifier.create(train, target='X15', validation_set=None, max_depth=20)
    print m.evaluate(valid)

#J[iFold] = float(np.sum(Yvi_pred != Yvi))/len(Yvi)
#Jmean = np.mean(J) # Overall estimated validation performance for each model
#print "Cross Validation error: \n", Jmean
Boosted trees classifier:
--------------------------------------------------------
Number of examples          : 160000
Number of classes           : 2
Number of feature columns   : 14
Number of unpacked features : 14
+-----------+--------------+-------------------+-------------------+
| Iteration | Elapsed Time | Training-accuracy | Training-log_loss |
+-----------+--------------+-------------------+-------------------+
| 1         | 0.633326     | 0.845638          | 0.576053          |
| 2         | 1.273794     | 0.858250          | 0.498056          |
| 3         | 1.974681     | 0.868019          | 0.443528          |
| 4         | 2.675147     | 0.876212          | 0.401972          |
| 5         | 3.358845     | 0.882044          | 0.370599          |
| 6         | 4.026292     | 0.885944          | 0.347881          |
| 10        | 6.399433     | 0.896594          | 0.293605          |
+-----------+--------------+-------------------+-------------------+
{'f1_score': 0.5980128492454804, 'auc': 0.7843021853779651, 'recall': 0.5484755053100376, 'precision': 0.657386876899072, 'log_loss': 0.5398622115874141, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int

Rows: 100001

Data:
+-----------+-----+-----+-------+-------+
| threshold | fpr | tpr |   p   |   n   |
+-----------+-----+-----+-------+-------+
|    0.0    | 1.0 | 1.0 | 14595 | 25405 |
|   1e-05   | 1.0 | 1.0 | 14595 | 25405 |
|   2e-05   | 1.0 | 1.0 | 14595 | 25405 |
|   3e-05   | 1.0 | 1.0 | 14595 | 25405 |
|   4e-05   | 1.0 | 1.0 | 14595 | 25405 |
|   5e-05   | 1.0 | 1.0 | 14595 | 25405 |
|   6e-05   | 1.0 | 1.0 | 14595 | 25405 |
|   7e-05   | 1.0 | 1.0 | 14595 | 25405 |
|   8e-05   | 1.0 | 1.0 | 14595 | 25405 |
|   9e-05   | 1.0 | 1.0 | 14595 | 25405 |
+-----------+-----+-----+-------+-------+
[100001 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 4

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      0       |        1        |  4172 |
|      0       |        0        | 21233 |
|      1       |        1        |  8005 |
|      1       |        0        |  6590 |
+--------------+-----------------+-------+
[4 rows x 3 columns]
, 'accuracy': 0.73095}
Boosted trees classifier:
--------------------------------------------------------
Number of examples          : 160000
Number of classes           : 2
Number of feature columns   : 14
Number of unpacked features : 14
+-----------+--------------+-------------------+-------------------+
| Iteration | Elapsed Time | Training-accuracy | Training-log_loss |
+-----------+--------------+-------------------+-------------------+
| 1         | 0.677555     | 0.848531          | 0.573501          |
| 2         | 1.269722     | 0.859237          | 0.495696          |
| 3         | 1.928695     | 0.869094          | 0.442501          |
| 4         | 2.604838     | 0.876962          | 0.400996          |
| 5         | 3.331844     | 0.881063          | 0.370884          |
| 6         | 3.946533     | 0.884756          | 0.348289          |
| 10        | 7.227532     | 0.898462          | 0.289270          |
+-----------+--------------+-------------------+-------------------+
{'f1_score': 0.596291754913982, 'auc': 0.7799892336942237, 'recall': 0.5433001557316, 'precision': 0.660737812911726, 'log_loss': 0.5464200159540089, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int

Rows: 100001

Data:
+-----------+-----+-----+-------+-------+
| threshold | fpr | tpr |   p   |   n   |
+-----------+-----+-----+-------+-------+
|    0.0    | 1.0 | 1.0 | 14769 | 25231 |
|   1e-05   | 1.0 | 1.0 | 14769 | 25231 |
|   2e-05   | 1.0 | 1.0 | 14769 | 25231 |
|   3e-05   | 1.0 | 1.0 | 14769 | 25231 |
|   4e-05   | 1.0 | 1.0 | 14769 | 25231 |
|   5e-05   | 1.0 | 1.0 | 14769 | 25231 |
|   6e-05   | 1.0 | 1.0 | 14769 | 25231 |
|   7e-05   | 1.0 | 1.0 | 14769 | 25231 |
|   8e-05   | 1.0 | 1.0 | 14769 | 25231 |
|   9e-05   | 1.0 | 1.0 | 14769 | 25231 |
+-----------+-----+-----+-------+-------+
[100001 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 4

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      0       |        1        |  4120 |
|      1       |        0        |  6745 |
|      1       |        1        |  8024 |
|      0       |        0        | 21111 |
+--------------+-----------------+-------+
[4 rows x 3 columns]
, 'accuracy': 0.728375}
Boosted trees classifier:
--------------------------------------------------------
Number of examples          : 160000
Number of classes           : 2
Number of feature columns   : 14
Number of unpacked features : 14
+-----------+--------------+-------------------+-------------------+
| Iteration | Elapsed Time | Training-accuracy | Training-log_loss |
+-----------+--------------+-------------------+-------------------+
| 1         | 0.688233     | 0.849825          | 0.573674          |
| 2         | 1.309993     | 0.859981          | 0.495871          |
| 3         | 1.961848     | 0.871487          | 0.440134          |
| 4         | 2.639231     | 0.877200          | 0.399953          |
| 5         | 3.289044     | 0.882169          | 0.369871          |
| 6         | 4.019849     | 0.888031          | 0.344141          |
| 10        | 7.298780     | 0.897975          | 0.291182          |
+-----------+--------------+-------------------+-------------------+
{'f1_score': 0.6013919063224952, 'auc': 0.783177035734654, 'recall': 0.5526155511944237, 'precision': 0.6596122778675283, 'log_loss': 0.5428008076407402, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int

Rows: 100001

Data:
+-----------+-----+-----+-------+-------+
| threshold | fpr | tpr |   p   |   n   |
+-----------+-----+-----+-------+-------+
|    0.0    | 1.0 | 1.0 | 14777 | 25223 |
|   1e-05   | 1.0 | 1.0 | 14777 | 25223 |
|   2e-05   | 1.0 | 1.0 | 14777 | 25223 |
|   3e-05   | 1.0 | 1.0 | 14777 | 25223 |
|   4e-05   | 1.0 | 1.0 | 14777 | 25223 |
|   5e-05   | 1.0 | 1.0 | 14777 | 25223 |
|   6e-05   | 1.0 | 1.0 | 14777 | 25223 |
|   7e-05   | 1.0 | 1.0 | 14777 | 25223 |
|   8e-05   | 1.0 | 1.0 | 14777 | 25223 |
|   9e-05   | 1.0 | 1.0 | 14777 | 25223 |
+-----------+-----+-----+-------+-------+
[100001 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 4

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      1       |        1        |  8166 |
|      1       |        0        |  6611 |
|      0       |        1        |  4214 |
|      0       |        0        | 21009 |
+--------------+-----------------+-------+
[4 rows x 3 columns]
, 'accuracy': 0.729375}
Boosted trees classifier:
--------------------------------------------------------
Number of examples          : 160000
Number of classes           : 2
Number of feature columns   : 14
Number of unpacked features : 14
+-----------+--------------+-------------------+-------------------+
| Iteration | Elapsed Time | Training-accuracy | Training-log_loss |
+-----------+--------------+-------------------+-------------------+
| 1         | 0.829356     | 0.851006          | 0.573496          |
| 2         | 1.797698     | 0.858675          | 0.497662          |
| 3         | 2.635094     | 0.867231          | 0.444345          |
| 4         | 3.754461     | 0.876719          | 0.400249          |
| 5         | 4.517519     | 0.882437          | 0.368190          |
| 6         | 5.352758     | 0.885994          | 0.345684          |
| 10        | 8.011625     | 0.897237          | 0.292011          |
+-----------+--------------+-------------------+-------------------+
{'f1_score': 0.5942297708225173, 'auc': 0.778279828628763, 'recall': 0.546595715650157, 'precision': 0.6509587260318492, 'log_loss': 0.5471402647439716, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int

Rows: 100001

Data:
+-----------+-----+-----+-------+-------+
| threshold | fpr | tpr |   p   |   n   |
+-----------+-----+-----+-------+-------+
|    0.0    | 1.0 | 1.0 | 14658 | 25342 |
|   1e-05   | 1.0 | 1.0 | 14658 | 25342 |
|   2e-05   | 1.0 | 1.0 | 14658 | 25342 |
|   3e-05   | 1.0 | 1.0 | 14658 | 25342 |
|   4e-05   | 1.0 | 1.0 | 14658 | 25342 |
|   5e-05   | 1.0 | 1.0 | 14658 | 25342 |
|   6e-05   | 1.0 | 1.0 | 14658 | 25342 |
|   7e-05   | 1.0 | 1.0 | 14658 | 25342 |
|   8e-05   | 1.0 | 1.0 | 14658 | 25342 |
|   9e-05   | 1.0 | 1.0 | 14658 | 25342 |
+-----------+-----+-----+-------+-------+
[100001 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 4

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      0       |        1        |  4296 |
|      0       |        0        | 21046 |
|      1       |        1        |  8012 |
|      1       |        0        |  6646 |
+--------------+-----------------+-------+
[4 rows x 3 columns]
, 'accuracy': 0.72645}
Boosted trees classifier:
--------------------------------------------------------
Number of examples          : 160000
Number of classes           : 2
Number of feature columns   : 14
Number of unpacked features : 14
+-----------+--------------+-------------------+-------------------+
| Iteration | Elapsed Time | Training-accuracy | Training-log_loss |
+-----------+--------------+-------------------+-------------------+
| 1         | 0.674275     | 0.847894          | 0.574861          |
| 2         | 1.407280     | 0.859513          | 0.496742          |
| 3         | 2.126286     | 0.869331          | 0.443081          |
| 4         | 2.846260     | 0.877181          | 0.400901          |
| 5         | 3.514413     | 0.883038          | 0.369306          |
| 6         | 4.160718     | 0.887669          | 0.344819          |
| 10        | 6.620531     | 0.897781          | 0.291151          |
+-----------+--------------+-------------------+-------------------+
{'f1_score': 0.5953276947725327, 'auc': 0.7820972406782905, 'recall': 0.5453242320819113, 'precision': 0.6554270243662319, 'log_loss': 0.5434684182488563, 'roc_curve': Columns:
	threshold	float
	fpr	float
	tpr	float
	p	int
	n	int

Rows: 100001

Data:
+-----------+-----+-----+-------+-------+
| threshold | fpr | tpr |   p   |   n   |
+-----------+-----+-----+-------+-------+
|    0.0    | 1.0 | 1.0 | 14650 | 25350 |
|   1e-05   | 1.0 | 1.0 | 14650 | 25350 |
|   2e-05   | 1.0 | 1.0 | 14650 | 25350 |
|   3e-05   | 1.0 | 1.0 | 14650 | 25350 |
|   4e-05   | 1.0 | 1.0 | 14650 | 25350 |
|   5e-05   | 1.0 | 1.0 | 14650 | 25350 |
|   6e-05   | 1.0 | 1.0 | 14650 | 25350 |
|   7e-05   | 1.0 | 1.0 | 14650 | 25350 |
|   8e-05   | 1.0 | 1.0 | 14650 | 25350 |
|   9e-05   | 1.0 | 1.0 | 14650 | 25350 |
+-----------+-----+-----+-------+-------+
[100001 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 4

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      0       |        1        |  4200 |
|      0       |        0        | 21150 |
|      1       |        1        |  7989 |
|      1       |        0        |  6661 |
+--------------+-----------------+-------+
[4 rows x 3 columns]
, 'accuracy': 0.728475}
In [ ]:
 

Final Ensemble - Random forest, SVM, Boosted trees, Log reg

In [868]:
#learner = svm (from scipy), svm (graphlab), bt, rf, logistic 
from sklearn.externals import joblib
import pickle
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
In [770]:
# Save all the learners
joblib.dump(learner, 'svm.pkl') 
Out[770]:
['svm.pkl']
In [809]:
svm.save('svm')
bt.save('bt')
rf.save('rf')
logistic.save('logistic')
In [820]:
valpred_bt.shape
Out[820]:
(50000, 2)

Ensembling with hard predictions

In [884]:
# Only validation data
logclass = graphlab.SArray.to_numpy(valpred_log['class'])
rfclass = graphlab.SArray.to_numpy(valpred['class'])
btclass = graphlab.SArray.to_numpy(valpred_bt['class'])
svmclass = graphlab.SArray.to_numpy(valpred_svm['class'])
valclass = graphlab.SArray.to_numpy(sf_subval['X15'])

#temp = np.column_stack((logclass, rfclass, btclass, svmclass))
temp = np.column_stack((logclass, rfclass, btclass))
In [891]:
# Only training data
logclass = graphlab.SArray.to_numpy(trainpred_log['class'])
rfclass = graphlab.SArray.to_numpy(trainpred['class'])
btclass = graphlab.SArray.to_numpy(trainpred_bt['class'])
svmclass = graphlab.SArray.to_numpy(trainpred_svm['class'])
valclass = graphlab.SArray.to_numpy(sf_subtrain['X15'])

#temp = np.column_stack((logclass, rfclass, btclass, svmclass))
temp = np.column_stack((logclass, rfclass, btclass))
In [895]:
# gnb = GaussianNB()
gnb = RandomForestClassifier(max_features=None)

# If NB on training data
temptrain, temptest, trainclass, testclass = pjkt.data_seq_split(temp, valclass, 120000, 150000)

# If NB on validation data
#temptrain, temptest, trainclass, testclass = pjkt.data_seq_split(temp, valclass, 35000, 50000)
gnb.fit(temptrain, trainclass)
Out[895]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [896]:
print temptrain.shape, temptest.shape, temp.shape, trainclass.shape, testclass.shape
(120000, 3) (30000, 3) (150000, 3) (120000,) (30000,)
In [897]:
gnb_train_pred = gnb.predict(temptrain)
gnb_val_pred = gnb.predict(temptest)
gnb_all_pred = gnb.predict(temp)
In [898]:
print gnb_train_pred, trainclass
[1 0 1 ..., 0 0 0] [1 0 1 ..., 0 0 0]
In [899]:
print "Training error: ", float(np.sum(gnb_train_pred != trainclass))/len(trainclass)
print "Validation error: ", float(np.sum(gnb_val_pred != testclass))/len(testclass), '\n'
print "All error: ", float(np.sum(gnb_all_pred != valclass))/len(valclass), '\n'
print "SVM error: ", float(np.sum(svmclass != valclass))/len(valclass), '\n'
print "RF error: ", float(np.sum(rfclass != valclass))/len(valclass), '\n'
print "Logistic error: ", float(np.sum(logclass != valclass))/len(valclass), '\n'
print "BT error: ", float(np.sum(btclass != valclass))/len(valclass), '\n'
#print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
#print len(ytr_pred[ytr_pred==1]), len(ytr_pred[ytr_pred==0])

#print len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
#print len(yval_pred[yval_pred==1]), len(yval_pred[yval_pred==0])
Training error:  0.277766666667
Validation error:  0.277633333333 

All error:  0.27774 

SVM error:  0.34052 

RF error:  0.30418 

Logistic error:  0.3245 

BT error:  0.27774 

SVM not adding to the predictions

Counting #1s and 0s in Mean prediction

In [1007]:
logtrain = graphlab.SArray.to_numpy(trainpred_log['class'])
rftrain = graphlab.SArray.to_numpy(trainpred['class'])
bttrain = graphlab.SArray.to_numpy(trainpred_bt['class'])
trains = np.column_stack((logtrain, rftrain, bttrain))
In [1005]:
alltrainpred = np.empty(len(trainpred), dtype=float)
In [1022]:
for i in range(len(alltrainpred)):
    if(np.sum(trains[i]) >= 2):
        alltrainpred[i] = 1
    else:
        alltrainpred[i] = 0
In [1023]:
alltrainpred
Out[1023]:
array([ 1.,  0.,  0., ...,  0.,  1.,  0.])
In [1026]:
print len(alltrainpred[alltrainpred==1]), len(alltrainpred[alltrainpred==0])
print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
25818 124182
55072 94928

Ensembling with soft predictions

In [938]:
temp = np.column_stack((check_train_log, check_train_bt, check_train_rf))
In [942]:
print check_train_bt, trainpred_bt
[ 0.57272857  0.45087078  0.50593823 ...,  0.19499448  0.57286471
  0.43756109] +-------+----------------+----------------+
| class |  probability   |     prob2      |
+-------+----------------+----------------+
|   1   | 0.572728574276 | 0.572728574276 |
|   0   | 0.549129217863 | 0.450870782137 |
|   1   | 0.505938231945 | 0.505938231945 |
|   1   | 0.610947012901 | 0.610947012901 |
|   0   | 0.702189207077 | 0.297810792923 |
|   0   | 0.701577663422 | 0.298422336578 |
|   1   | 0.740846812725 | 0.740846812725 |
|   0   | 0.742470055819 | 0.257529944181 |
|   0   | 0.728553414345 | 0.271446585655 |
|   0   | 0.855606645346 | 0.144393354654 |
+-------+----------------+----------------+
[150000 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [998]:
np.random.seed(0)
alltrainclass = graphlab.SArray.to_numpy(sf_subtrain['X15'])
temptrain, temptest, trainclass, testclass = pjkt.data_seq_split(temp, alltrainclass, 100000, 150000)

#gnb = RandomForestClassifier(max_features=None, min_samples_split=50, min_samples_leaf=50, max_depth=3)
gnb = GaussianNB()
gnb.fit(temptrain, trainclass)
gnb_train_pred = gnb.predict(temptrain)
gnb_val_pred = gnb.predict(temptest)
gnb_all_pred = gnb.predict(temp)
print "Training error: ", float(np.sum(gnb_train_pred != trainclass))/len(trainclass)
print "Validation error: ", float(np.sum(gnb_val_pred != testclass))/len(testclass)
print "All error: ", float(np.sum(gnb_all_pred != alltrainclass))/len(alltrainclass), '\n'
Training error:  0.29342
Validation error:  0.29426
All error:  0.2937 

In [999]:
#y_score = learner.fit(xtr_sub, ytr_sub).decision_function(xtr_sub)
from sklearn import metrics
fpr, tpr, _ = metrics.roc_curve(gnb_all_pred, alltrainclass)
print metrics.roc_auc_score(gnb_all_pred, alltrainclass)
plt.plot(fpr, tpr)
plt.plot(fpr, fpr)
#plt.show()
0.687054323492
Out[999]:
[<matplotlib.lines.Line2D at 0x1e3b4f9d0>]
In [973]:
finalTest = np.column_stack((check_log, check_bt, check_rf))
In [974]:
finalTest
Out[974]:
array([[ 0.26656557,  0.26754108,  0.2886048 ],
       [ 0.29658706,  0.1940293 ,  0.23737937],
       [ 0.31720908,  0.47478011,  0.36365309],
       ..., 
       [ 0.32006905,  0.32459387,  0.34584743],
       [ 0.48061763,  0.47387815,  0.51016921],
       [ 0.45410944,  0.53172541,  0.50336707]])
In [986]:
# Predict on all test data
finalPred = gnb.predict(finalTest)
finalPredSoft = gnb.predict_proba(finalTest)
print len(finalPred[finalPred==1]), len(finalPred[finalPred==0])
39170 160830
In [996]:
meanPredSoft = np.mean(finalTest, axis=1)
print meanPredSoft
[ 0.27423715  0.24266525  0.3852141  ...,  0.33017012  0.48822166
  0.49640064]
In [987]:
finalPredSoft
Out[987]:
array([[ 0.80113442,  0.19886558],
       [ 0.88408362,  0.11591638],
       [ 0.51285485,  0.48714515],
       ..., 
       [ 0.69102327,  0.30897673],
       [ 0.51285485,  0.48714515],
       [ 0.39628387,  0.60371613]])
In [997]:
#np.savetxt('Yhat_finalensemble_test1.txt', np.vstack( (np.arange(len(finalPredSoft)) , 
#                                          finalPredSoft[:, 1]) ).T, 
#           '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# 0.61041 Kaggle using Random Forests

# Yet to test
#np.savetxt('Yhat_finalensemble_nb_test2.txt', np.vstack( (np.arange(len(finalPredSoft)) , 
#                                          finalPredSoft[:, 1]) ).T, 
#           '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
#Kaggle score - 0.66042

#np.savetxt('Yhat_finalensemble_rf_test3.txt', np.vstack( (np.arange(len(finalPredSoft)) , 
#                                          finalPredSoft[:, 1]) ).T, 
#           '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');

#Kaggle score - 0.64273

np.savetxt('Yhat_finalensemble_mean_test4.txt', np.vstack( (np.arange(len(meanPredSoft)) , 
                                          meanPredSoft) ).T, 
           '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
In [ ]: