Problem: Predict whether there is rainfall at a location based on (processed) infrared satellite image information.
The dataset is courtesy of UC Irvine's Center for Hydrometeorology and Remote Sensing, including Dr. Soroosh Sorooshian, Dr. Xiaogang Gao, Dr. Kuo-lin Hsu, Dan Braithwaite, Yumeng Tau, and Negar Karbalee.
# Plot inline
%matplotlib inline
import numpy as np
from scipy import stats
import mltools as ml
import matplotlib.pyplot as plt
import pandas as pd
import project_utils as pjkt
# import sklearn # To be modified based on the exact learning algorithms required
np.random.seed(0) # Set seed to reproduce outputs
# Load raw features/targets; delimiter=None lets genfromtxt infer whitespace
X_train = np.genfromtxt('data/X_train.txt', delimiter = None)
X_test = np.genfromtxt('data/X_test.txt', delimiter = None)
Y_train = np.genfromtxt('data/Y_train.txt', delimiter = None)
# Verify data import - check number of rows (200k in train and test), columns (14)
print X_train.shape, Y_train.shape, X_test.shape
# Summary statistics of X_train (via project helper)
pjkt.print_summary_statistics(X_train)
# Summary statistics of Y_train
pjkt.print_summary_statistics(Y_train)
# Class balance: count rows for each unique label value
y_unique = np.unique(Y_train)
print '\nUnique Y values = ', np.unique(Y_train)
for i in range(len(np.unique(Y_train))):
print "Number of rows with Y =", int(y_unique[i]), "is", np.sum(Y_train[:,] == y_unique[i])
Note: for some features, the mean/median ordering (mean < median vs. mean > median) does not agree with the sign of the skew reported above — worth re-checking those features' distributions.
# Standardize features; the second return value (transform parameters) is discarded
X_train_scaled, _ = ml.transforms.rescale(X_train)
pjkt.print_summary_statistics(X_train_scaled) # Mean 0 and Variance 1
# Sanity checks: count None entries and NaNs after scaling
print np.sum(np.equal(X_train_scaled, None))
print np.sum(np.isnan(X_train_scaled))
Result: the scaled training data contains no missing (None) or NaN values.
# Class-conditional feature distributions and pairwise feature plots
pjkt.bar_plots(X_train_scaled, Y_train)
pjkt.pair_plots(X_train_scaled, Y_train, 0, 1)
pjkt.pair_plots(X_train_scaled, Y_train, 0, 2)
# Toggle to view output
# Convert numpy array into a pandas dataframe
temp = pd.DataFrame(X_train)
# Can change this to 0:14 to get entire dataframe; takes a lot of time
# NOTE(review): pd.tools.plotting.scatter_matrix is the pre-0.20 pandas
# location; newer pandas exposes pandas.plotting.scatter_matrix
axes = pd.tools.plotting.scatter_matrix(temp)
plt.tight_layout()
# Sequential (non-shuffled) split: first 10000 rows for training
xtr_sub, xte_sub, ytr_sub, yte_sub = pjkt.data_seq_split(X_train, Y_train, 10000)
print xtr_sub.shape, xte_sub.shape, ytr_sub.shape, yte_sub.shape
Alternatively, we could use ml.splitData(X, Y, train_fraction) to split the data, and ml.crossValidate() for cross-validation.
# Smoke-test the error-plotting helper with random data
e1 = np.random.normal(size=10)
e2 = np.random.normal(size=10)
k = np.arange(10) + 1
pjkt.plot_errors(e1, e2, k)
def plot_errors_semilog(errTrain, errValidate, x_axis, semi = 'semilogx'):
    """Plot training vs. validation error on a semi-log scale.

    errTrain / errValidate are sequences of error values matched to x_axis.
    semi selects 'semilogx' (log x-axis); any other value uses a log y-axis.
    Training error is drawn in red, validation error in green.
    """
    fig, ax = plt.subplots()
    ax.set_ylabel('Error')
    # Pick the semi-log plotting method once, then draw both curves with it
    draw = ax.semilogx if semi == 'semilogx' else ax.semilogy
    draw(np.asarray(x_axis), errTrain, 'r-', marker = 'o')
    draw(np.asarray(x_axis), errValidate, 'g-', marker = 'o')
    plt.title('Train (red) vs. Validation (green) error')
    plt.show()
# Compare log-x vs log-y views (uses the project_utils version of the helper,
# not the local def above)
pjkt.plot_errors_semilog(e1, e2, k, 'semilogx')
from random import randint
e1 = [randint(10, 200) for i in range(10)]
e2 = [randint(10, 200) for i in range(10)]
pjkt.plot_errors_semilog(e1, e2, k, 'semilogy')
# IPython magic: list variables in the interactive namespace
whos
from sklearn import svm
X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 150000, 200000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
## learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.7}, probability=True) - Submitted to Kaggle: 0.64796
learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.7})#, probability=True)
# Performs better on training and validation - not on kaggle 0.64470
learner.fit(xtr_sub, ytr_sub)
ytr_pred = learner.predict(xtr_sub)
yval_pred = learner.predict(xval_sub)
#ytest_pred_soft = learner.predict_proba(X_test_scaled)
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub), '\n'
print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
print len(ytr_pred[ytr_pred==1]), len(ytr_pred[ytr_pred==0])
print len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
print len(yval_pred[yval_pred==1]), len(yval_pred[yval_pred==0])
# Save the predictions in the format required by Kaggle - weights 0.5, 0.7
#np.savetxt('Yhat_svm_test.txt', np.vstack( (np.arange(len(ytest_pred_soft)) ,
# ytest_pred_soft[:, 1]) ).T,
# '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# Kaggle score - 0.64796
# Weights 0.5, 0.6
np.savetxt('Yhat_svm_test2.txt', np.vstack( (np.arange(len(ytest_pred_soft)) ,
ytest_pred_soft[:, 1]) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
To do - vary:
a) Amount of data used for training and validation
b) Features, the selection method, and their transforms
c) Kernel
d) Regularization parameters
e) Plot errors as a function of the above
Then upload the predictions to Kaggle.
# ROC curve for the weighted SVC on the training split, from decision-function
# scores (note: this refits the learner)
y_score = learner.fit(xtr_sub, ytr_sub).decision_function(xtr_sub)
from sklearn import metrics
fpr, tpr, _ = metrics.roc_curve(ytr_sub, y_score)
# NOTE(review): AUC here uses hard predictions (ytr_pred) while the curve uses
# soft scores -- the two numbers are not directly comparable
print metrics.roc_auc_score(ytr_sub, ytr_pred)
plt.plot(fpr, tpr)
# Diagonal chance line for reference
plt.plot(fpr, fpr)
#plt.show()
# SVC with 'balanced' class weights (inverse to class frequencies)
learner = svm.SVC(class_weight = 'balanced')
learner.fit(xtr_sub, ytr_sub)
ytr_pred = learner.predict(xtr_sub)
yval_pred = learner.predict(xval_sub)
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
print len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
# Ratio of positives to negatives in the training split
print float(len(ytr_sub[ytr_sub==1])/float((len(ytr_sub[ytr_sub==0]))))
# Default SVC on a small hand-picked feature subset
featlist = [0, 6, 12, 13]
learner = svm.SVC()
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
# feature 7 has a lot of zeros
# Default SVC on all features except index 6 (feature 7, 1-indexed)
featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]
learner = svm.SVC()
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
# Balanced subsampling: equal numbers of positive and negative examples
xtr_sub, ytr_sub = pjkt.balanced_subsample(X_train_scaled[:30000,], Y_train[:30000,], 1.0)
xval_sub, yval_sub = pjkt.balanced_subsample(X_train_scaled[30000:50000,], Y_train[30000:50000,], 1.0)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
learner = svm.SVC()
learner.fit(xtr_sub, ytr_sub)
ytr_pred = learner.predict(xtr_sub)
yval_pred = learner.predict(xval_sub)
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
print len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
# Linear SVM (liblinear) on a fresh 150k/50k sequential split
np.random.seed(0)
X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 150000, 50000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
learner = svm.LinearSVC(dual=False) # Balanced weighting not working out
learner.fit(xtr_sub, ytr_sub)
ytr_pred = learner.predict(xtr_sub)
yval_pred = learner.predict(xval_sub)
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
# LinearSVC on a ranked feature subset, smaller 20k/20k split
featlist = [0, 13, 8, 2, 12, 3, 9, 1]
#featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]
X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 20000, 40000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
learner = svm.LinearSVC(dual=False)
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
# Same linear SVM on balanced subsamples, all features except index 6
featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]
xtr_sub, ytr_sub = pjkt.balanced_subsample(X_train_scaled[:40000,], Y_train[:40000,], 1.0)
xval_sub, yval_sub = pjkt.balanced_subsample(X_train_scaled[40000:50000,], Y_train[40000:50000,], 1.0)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
learner = svm.LinearSVC(dual=False)
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub),
# Single-hidden-layer MLP (220 units, lbfgs solver) on a feature subset
from sklearn.neural_network import MLPClassifier
featlist = [0, 13, 8, 2, 12, 3, 9, 1]
#featlist = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 40000, 70000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
learner = MLPClassifier(hidden_layer_sizes=(220,), learning_rate='invscaling', solver='lbfgs')
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
# NOTE(review): predict_proba is called on all test columns although the model
# was fit on the 8-column featlist -- confirm the column counts match
ytest_pred_soft = learner.predict_proba(X_test_scaled)
# Kaggle for - 20k training, 10k validation
# learner = MLPClassifier(hidden_layer_sizes=(220,), learning_rate='invscaling', solver='lbfgs')
# Kaggle submission: ID plus P(rain) per test row
np.savetxt('Yhat_nn_test1.txt', np.vstack( (np.arange(len(ytest_pred_soft)) ,
ytest_pred_soft[:, 1]) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# ~0.60 kaggle score
# Same MLP on balanced subsamples, all features except index 6
featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]
xtr_sub, ytr_sub = pjkt.balanced_subsample(X_train_scaled[:30000,], Y_train[:30000,], 1.0)
xval_sub, yval_sub = pjkt.balanced_subsample(X_train_scaled[30000:50000,], Y_train[30000:50000,], 1.0)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
learner = MLPClassifier(hidden_layer_sizes=(220,), learning_rate='invscaling', solver='lbfgs')
learner.fit(xtr_sub[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_sub[:,featlist])
yval_pred = learner.predict(xval_sub[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
# K-means clustering (K=2), treating the cluster id as a class label
X_train_scaled, _ = ml.transforms.rescale(X_train)
X_test_scaled, _ = ml.transforms.rescale(X_test)
xtr_sub, xval_sub, ytr_sub, yval_sub = pjkt.data_seq_split(X_train_scaled, Y_train, 40000, 70000)
print xtr_sub.shape, xval_sub.shape, ytr_sub.shape, yval_sub.shape
num_clusters = [2]
#initial = ['random', 'farthest', 'k++'] # Different initialization methods
initial = ['k++']
np.random.seed(0)
for i, k in enumerate(num_clusters):
for j in range(len(initial)):
# z: cluster assignments, c: centroids, sumd: sum of squared distances
z, c, sumd = ml.cluster.kmeans(X = xtr_sub, K = k, init = initial[j])
print str(k), " clusters with initialization:", initial[j]
print "Sum of squared Euclidean distances:", str(sumd)
print "Ratio of 1s and 0s (training) = ",len(z[z==1]), len(z[z==0])
print "Training error: ", float(np.sum(z != ytr_sub))/len(ytr_sub)
num_clusters = [2]
initial = ['k++']
np.random.seed(0)
featlist = [0, 13, 8, 2, 12]
for i, k in enumerate(num_clusters):
for j in range(len(initial)):
#z, T, soft, ll = ml.cluster.gmmEM(X = xtr_sub[:,featlist], K = k, init = initial[j])
#zval, T, soft, llval = ml.cluster.gmmEM(X = xval_sub[:,featlist], K = k, init = initial[j])
ztest, T, soft, lltest = ml.cluster.gmmEM(X = X_test_scaled, K = k, init = initial[j])
print str(k), " clusters with initialization: ", initial[j]
print "Log likelihood (training): ", str(ll), "\n"
print "Log likelihood (validation): ", str(llval), "\n"
z = 1 - z
zval = 1 - zval
ztest = 1 - ztest
#zval[zval==1] = 0
#zval[zval==0] = 1
print "Ratio of 1s and 0s = ",len(z[z==1]), len(z[z==0])
print "Ratio of 1s and 0s in original training= ",len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
print "Training error: ", float(np.sum(z != ytr_sub))/len(ytr_sub)
print "Ratio of 1s and 0s (validation) = ",len(zval[zval==1]), len(zval[zval==0])
print "Ratio of 1s and 0s in original validation= ",len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
print "Validation error: ", float(np.sum(zval != yval_sub))/len(yval_sub)
# Append the clusters to the training data
# (the cluster assignment becomes a 15th feature column)
check = np.reshape(z, newshape=(len(z), 1))
print check.shape
xtr_new = np.append(xtr_sub, check, 1)
print xtr_new.shape
check = np.reshape(zval, newshape=(len(zval), 1))
print check.shape
xval_new = np.append(xval_sub, check, 1)
print xval_new.shape
# Learn a svm and nn on the new training data
np.random.seed(0)
# Indices 0-14: original features (minus index 6) plus the cluster column (14)
featlist = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14]
#featlist = [0, 13, 8, 2, 14]
# Doesn't have probabilities learner = svm.LinearSVC(dual=False)
# Too Slow don't run again - learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.6}, probability=True)
learner = MLPClassifier(hidden_layer_sizes=(220,), alpha=0.001)
#learner = MLPClassifier(hidden_layer_sizes=(220,), early_stopping=True)
learner.fit(xtr_new[:,featlist] , ytr_sub)
ytr_pred = learner.predict(xtr_new[:,featlist])
yval_pred = learner.predict(xval_new[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
# Learn a svm and nn on the new training data
# Same MLP, but using all 15 columns (feature 6 plus the cluster column)
np.random.seed(0)
featlist = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
#featlist = [0, 13, 8, 2, 14]
# Doesn't have probabilities learner = svm.LinearSVC(dual=False)
# Too Slow don't run again - learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.6}, probability=True)
learner2 = MLPClassifier(hidden_layer_sizes=(220,), alpha=0.001)
#learner = MLPClassifier(hidden_layer_sizes=(220,), early_stopping=True)
learner2.fit(xtr_new[:,featlist] , ytr_sub)
ytr_pred = learner2.predict(xtr_new[:,featlist])
yval_pred = learner2.predict(xval_new[:,featlist])
print "Training error: ", float(np.sum(ytr_pred != ytr_sub))/len(ytr_sub)
print "Validation error: ", float(np.sum(yval_pred != yval_sub))/len(yval_sub)
# Build the matching test matrix: all 14 test features + test cluster column
featlist = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
check = np.reshape(ztest, newshape=(len(ztest), 1))
print check.shape
xtest_new = np.append(X_test_scaled[:, featlist], check, 1)
print xtest_new.shape
ytest_pred_soft = learner2.predict_proba(xtest_new)
# learner = svm.SVC(class_weight={0.0:0.5, 1.0:0.6}, probability=True)
# Kaggle submission: ID plus P(rain) per test row
np.savetxt('Yhat_nncluster_test1.txt', np.vstack( (np.arange(len(ytest_pred_soft)) ,
ytest_pred_soft[:, 1]) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# Kaggle - 0.64715
# Quick per-feature scatter plots of feature value vs. label
for i in range(14):
plt.scatter(xtr_new[:,i], ytr_sub)
plt.title('Feature '+str(i+1))
plt.show()
#plt.scatter((X_train_scaled[:,0]), Y_train)
plt.scatter(np.exp(220 + xtr_new[:,6]), ytr_sub)
# Per-feature correlation with the label
corrList = []
for i in range(14):
print "Feature ", i
print np.corrcoef(xtr_new[:, i], ytr_sub)
# GraphLab Create section: load the same data as SFrames
import graphlab
sf_xtrain = graphlab.SFrame.read_csv('data/X_train.txt', header = False, delimiter=' ')
sf_xtest = graphlab.SFrame.read_csv('data/X_test.txt', header=False, delimiter=' ')
sf_ytrain = graphlab.SFrame.read_csv('data/Y_train.txt', header=False, delimiter=' ')
sf_xtrain.print_rows
graphlab.canvas.set_target('browser')
sf_xtrain.show()
# Append the labels as a 15th column (auto-named X15)
sf_alltrain = sf_xtrain.add_column(graphlab.SArray(Y_train))
sf_alltrain.print_rows
sf_alltrain['X15'] = sf_alltrain['X15'].astype(int)
# 100k / 50k sequential train / validation split
sf_subtrain = sf_alltrain[0:100000]
sf_subval = sf_alltrain[100000:150000]
print sf_subtrain.print_rows()
# GraphLab neural net classifier with default architecture
nn = graphlab.neuralnet_classifier.create(sf_subtrain, 'X15')
pred = nn.classify(sf_subtrain)
pred.print_rows(1000)
sf_valpred = nn.evaluate(sf_subval)
sf_valpred
nn.show()
sf_subtrain = sf_alltrain[0:150000]
sf_subval = sf_alltrain[150000:200000]
logistic = graphlab.logistic_classifier.create(sf_subtrain, 'X15', l2_penalty=0.5)
trainpred_log = logistic.classify(sf_subtrain)
valpred_log = logistic.classify(sf_subval)
sf_traineval_log = logistic.evaluate(sf_subtrain)
sf_valeval_log = logistic.evaluate(sf_subval)
print sf_traineval, sf_valeval
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred_log['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred_log['class'])
logistic.show()
testpred = logistic.classify(sf_xtest)
testpred.show()
temp = testpred
temp['prob2'] = temp[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
temp2 = trainpred_log
temp2['prob2'] = temp2[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
check = graphlab.SArray.to_numpy(temp['prob2'])
check_log = check
check_train = graphlab.SArray.to_numpy(temp2['prob2'])
check_train_log = check_train
# GraphLab random forest with default settings
rf = graphlab.random_forest_classifier.create(sf_subtrain, 'X15')
trainpred = rf.classify(sf_subtrain)
valpred = rf.classify(sf_subval)
sf_traineval = rf.evaluate(sf_subtrain)
sf_valeval = rf.evaluate(sf_subval)
print sf_traineval, sf_valeval
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred['class'])
rf.show()
testpred = rf.classify(sf_xtest)
testpred.show()
# Convert 'probability of the predicted class' into P(class == 1)
temp = testpred
temp['prob2'] = temp[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
temp2 = trainpred
temp2['prob2'] = temp2[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
check = graphlab.SArray.to_numpy(temp['prob2'])
check_rf = check
check_train = graphlab.SArray.to_numpy(temp2['prob2'])
check_train_rf = check_train
# 0.65124 Kaggle - Vanilla Random Forests
# rf = graphlab.random_forest_classifier.create(sf_subtrain, 'X15')
np.savetxt('Yhat_rfgraphlab_test1.txt', np.vstack( (np.arange(len(check)) ,
check) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# Tuned random forest on the 150k/50k split
sf_subtrain = sf_alltrain[0:150000]
sf_subval = sf_alltrain[150000:200000]
#featlist = ['X1', 'X14', 'X9', 'X3', 'X13', 'X4']
rf2 = graphlab.random_forest_classifier.create(sf_subtrain, 'X15',
max_iterations=30, min_child_weight = 10, validation_set=sf_subval
, random_seed = 0)
trainpred = rf2.classify(sf_subtrain)
valpred = rf2.classify(sf_subval)
sf_traineval = rf2.evaluate(sf_subtrain)
sf_valeval = rf2.evaluate(sf_subval)
print sf_traineval, sf_valeval
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred['class'])
rf2.show()
# Compare logistic vs. random-forest hard predictions on the validation set
#print valpred_log, valpred
logclass = graphlab.SArray.to_numpy(valpred_log['class'])
rfclass = graphlab.SArray.to_numpy(valpred['class'])
valclass = graphlab.SArray.to_numpy(sf_subval['X15'])
temp = np.column_stack((logclass, rfclass, valclass))
#bothclass = np.ndarray((len(logclass, )))
# 1 where each model agrees with the truth, else 0
log_vs_val = (temp[:, 0] == temp[:, 2]).astype(int)
rf_vs_val = (temp[:, 1] == temp[:, 2]).astype(int)
temp2 = np.column_stack((temp, log_vs_val, rf_vs_val))
print temp2.shape
print temp2
# Count rows where both models are simultaneously right or wrong
len(np.where(log_vs_val==rf_vs_val)[0])
# Simple OR-combination: predict 1 if either model predicts 1
final_val = np.maximum(logclass, rfclass)
print final_val
print "Validation error: ", float(np.sum(rfclass != valclass))/len(valclass)
# Stack the distinct rows of X_train (duplicate-row check; result not stored)
np.vstack({tuple(row) for row in X_train})
# AdaBoost over depth-7 decision trees (sklearn), 75/25 split via mltools
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc
dt_xtr, dt_xtest, dt_ytr, dt_ytest = ml.splitData(X_train, Y_train, 0.75)
# Base-tree hyperparameters
params={'criterion' : 'entropy',
'max_depth' : 7,
#'min_samples_split' : 350,
#'min_samples_leaf': 50,
'class_weight':'balanced'}
bdt = AdaBoostClassifier(DecisionTreeClassifier(**params),
algorithm="SAMME",
n_estimators=150)
bdt.fit(dt_xtr, dt_ytr)
# Mean accuracy on the held-out split
print bdt.score(dt_xtest,dt_ytest)
fpr = dict()
tpr = dict()
roc_auc = dict()
# ROC / AUC from the boosted model's class-1 probabilities
ypred2=bdt.predict_proba(dt_xtest)
fpr, tpr, _ = roc_curve(dt_ytest, ypred2[:,1])
roc_auc = auc(fpr, tpr)
plt.plot(fpr,tpr,'b-')
print roc_auc
# GraphLab linear SVM with manual class weights
sf_subtrain = sf_alltrain[0:150000]
sf_subval = sf_alltrain[150000:200000]
# NOTE(review): this rebinds the name 'svm', shadowing the sklearn 'svm' module
# imported earlier -- any later svm.SVC(...) call would fail after this point
svm = graphlab.svm_classifier.create(sf_subtrain, 'X15', max_iterations=70, class_weights={0:1, 1:1.5},
validation_set=sf_subval, convergence_threshold = 0.001)
trainpred_svm = svm.classify(sf_subtrain)
valpred_svm = svm.classify(sf_subval)
traineval_svm = svm.evaluate(sf_subtrain)
valeval_svm = svm.evaluate(sf_subval)
svm.show()
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred_svm['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred_svm['class'])
testpred = svm.classify(sf_xtest)
# No prob2 conversion here (the conversion used for the other models is disabled)
#temp = testpred
#temp['prob2'] = temp[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
testpred
# GraphLab gradient-boosted trees with row/column subsampling
bt = graphlab.boosted_trees_classifier.create(sf_subtrain, 'X15', max_depth=8, validation_set=sf_subval,
row_subsample=0.85, column_subsample = 0.6, random_seed=1)
trainpred_bt = bt.classify(sf_subtrain)
valpred_bt = bt.classify(sf_subval)
traineval_bt = bt.evaluate(sf_subtrain)
valeval_bt = bt.evaluate(sf_subval)
print traineval_bt, valeval_bt
print graphlab.evaluation.auc(sf_subtrain['X15'], trainpred_bt['class'])
print graphlab.evaluation.auc(sf_subval['X15'], valpred_bt['class'])
bt.show()
testpred = bt.classify(sf_xtest)
# Convert 'probability of the predicted class' into P(class == 1)
temp = testpred
temp['prob2'] = temp[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
temp2 = trainpred_bt
temp2['prob2'] = temp2[['class', 'probability']].apply(lambda row: 1-row['probability'] if row['class'] == 0 else row['probability'])
check = graphlab.SArray.to_numpy(temp['prob2'])
check_bt = check
check_train = graphlab.SArray.to_numpy(temp2['prob2'])
check_train_bt = check_train
#np.savetxt('Yhat_btgraphlab_test1.txt', np.vstack( (np.arange(len(check)) ,
#            check) ).T,
#            '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
np.savetxt('Yhat_btgraphlab_test2.txt', np.vstack( (np.arange(len(check)) ,
check) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# Kaggle score - 0.64205
testpred.show()
print "Validation error: ", float(np.sum(graphlab.SArray.to_numpy(valpred_bt['class']) != valclass))/len(valclass)
# 5-fold cross-validation of boosted trees via GraphLab
sf_subtrain = sf_alltrain[0:150000]
sf_subval = sf_alltrain[150000:200000]
nFolds = 5; # Initialize number of folds to be 5
# NOTE(review): J is allocated but never filled -- the per-fold error
# bookkeeping below is still commented out
J = np.empty([ nFolds], dtype=float)
# Create an empty (M, 5) float array to store the 5-fold validation
# errors from the different models
folds = graphlab.cross_validation.KFold(sf_alltrain, 5)
for train, valid in folds:
m = graphlab.boosted_trees_classifier.create(train, target='X15', validation_set=None, max_depth=20)
print m.evaluate(valid)
#J[iFold] = float(np.sum(Yvi_pred != Yvi))/len(Yvi)
#Jmean = np.mean(J) # Overall estimated validation performance for each model
#print "Cross Validation error: \n", Jmean
#learner = svm (from scipy), svm (graphlab), bt, rf, logistic
# Persist all trained models to disk for reuse
from sklearn.externals import joblib
import pickle
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
# Save all the learners
joblib.dump(learner, 'svm.pkl')
svm.save('svm')
bt.save('bt')
rf.save('rf')
logistic.save('logistic')
valpred_bt.shape
# Build stacked-model feature matrices from the base models' hard predictions
# Only validation data
logclass = graphlab.SArray.to_numpy(valpred_log['class'])
rfclass = graphlab.SArray.to_numpy(valpred['class'])
btclass = graphlab.SArray.to_numpy(valpred_bt['class'])
svmclass = graphlab.SArray.to_numpy(valpred_svm['class'])
valclass = graphlab.SArray.to_numpy(sf_subval['X15'])
#temp = np.column_stack((logclass, rfclass, btclass, svmclass))
temp = np.column_stack((logclass, rfclass, btclass))
# Only training data
# NOTE(review): these rebind the same names, overwriting the validation-set
# versions built just above
logclass = graphlab.SArray.to_numpy(trainpred_log['class'])
rfclass = graphlab.SArray.to_numpy(trainpred['class'])
btclass = graphlab.SArray.to_numpy(trainpred_bt['class'])
svmclass = graphlab.SArray.to_numpy(trainpred_svm['class'])
valclass = graphlab.SArray.to_numpy(sf_subtrain['X15'])
#temp = np.column_stack((logclass, rfclass, btclass, svmclass))
temp = np.column_stack((logclass, rfclass, btclass))
# gnb = GaussianNB()
# Meta-learner fit on the base models' predictions
gnb = RandomForestClassifier(max_features=None)
# If NB on training data
temptrain, temptest, trainclass, testclass = pjkt.data_seq_split(temp, valclass, 120000, 150000)
# If NB on validation data
#temptrain, temptest, trainclass, testclass = pjkt.data_seq_split(temp, valclass, 35000, 50000)
gnb.fit(temptrain, trainclass)
print temptrain.shape, temptest.shape, temp.shape, trainclass.shape, testclass.shape
gnb_train_pred = gnb.predict(temptrain)
gnb_val_pred = gnb.predict(temptest)
gnb_all_pred = gnb.predict(temp)
print gnb_train_pred, trainclass
print "Training error: ", float(np.sum(gnb_train_pred != trainclass))/len(trainclass)
print "Validation error: ", float(np.sum(gnb_val_pred != testclass))/len(testclass), '\n'
print "All error: ", float(np.sum(gnb_all_pred != valclass))/len(valclass), '\n'
# Per-base-model error rates for comparison
print "SVM error: ", float(np.sum(svmclass != valclass))/len(valclass), '\n'
print "RF error: ", float(np.sum(rfclass != valclass))/len(valclass), '\n'
print "Logistic error: ", float(np.sum(logclass != valclass))/len(valclass), '\n'
print "BT error: ", float(np.sum(btclass != valclass))/len(valclass), '\n'
#print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
#print len(ytr_pred[ytr_pred==1]), len(ytr_pred[ytr_pred==0])
#print len(yval_sub[yval_sub==1]), len(yval_sub[yval_sub==0])
#print len(yval_pred[yval_pred==1]), len(yval_pred[yval_pred==0])
Observation: the SVM is not adding anything to the ensemble's predictions, so it is dropped from the stacked models below.
# Majority vote over the three base models' training-set predictions
logtrain = graphlab.SArray.to_numpy(trainpred_log['class'])
rftrain = graphlab.SArray.to_numpy(trainpred['class'])
bttrain = graphlab.SArray.to_numpy(trainpred_bt['class'])
trains = np.column_stack((logtrain, rftrain, bttrain))
alltrainpred = np.empty(len(trainpred), dtype=float)
# Predict 1 when at least 2 of the 3 models predict 1
for i in range(len(alltrainpred)):
if(np.sum(trains[i]) >= 2):
alltrainpred[i] = 1
else:
alltrainpred[i] = 0
alltrainpred
print len(alltrainpred[alltrainpred==1]), len(alltrainpred[alltrainpred==0])
print len(ytr_sub[ytr_sub==1]), len(ytr_sub[ytr_sub==0])
# Soft-probability stack for the meta-learner (training rows)
temp = np.column_stack((check_train_log, check_train_bt, check_train_rf))
print check_train_bt, trainpred_bt
# Stack the three models' soft predictions (logistic, boosted trees, RF) with a
# Gaussian Naive Bayes meta-learner, trained on the first 100k training rows.
np.random.seed(0)
alltrainclass = graphlab.SArray.to_numpy(sf_subtrain['X15'])
temptrain, temptest, trainclass, testclass = pjkt.data_seq_split(temp, alltrainclass, 100000, 150000)
#gnb = RandomForestClassifier(max_features=None, min_samples_split=50, min_samples_leaf=50, max_depth=3)
gnb = GaussianNB()
gnb.fit(temptrain, trainclass)
gnb_train_pred = gnb.predict(temptrain)
gnb_val_pred = gnb.predict(temptest)
gnb_all_pred = gnb.predict(temp)
print "Training error: ", float(np.sum(gnb_train_pred != trainclass))/len(trainclass)
print "Validation error: ", float(np.sum(gnb_val_pred != testclass))/len(testclass)
print "All error: ", float(np.sum(gnb_all_pred != alltrainclass))/len(alltrainclass), '\n'
#y_score = learner.fit(xtr_sub, ytr_sub).decision_function(xtr_sub)
from sklearn import metrics
# BUG FIX: sklearn's roc_curve / roc_auc_score take (y_true, y_score) in that
# order; the arguments were swapped, which computed the ROC of the true labels
# scored by the predictions instead of the predictions against the truth.
fpr, tpr, _ = metrics.roc_curve(alltrainclass, gnb_all_pred)
print metrics.roc_auc_score(alltrainclass, gnb_all_pred)
plt.plot(fpr, tpr)
# Diagonal chance line for reference
plt.plot(fpr, fpr)
#plt.show()
# Final test-set ensemble: stack the three models' test-set P(class==1) columns
finalTest = np.column_stack((check_log, check_bt, check_rf))
finalTest
# Predict on all test data
finalPred = gnb.predict(finalTest)
finalPredSoft = gnb.predict_proba(finalTest)
print len(finalPred[finalPred==1]), len(finalPred[finalPred==0])
# Simple alternative: average the three models' probabilities per row
meanPredSoft = np.mean(finalTest, axis=1)
print meanPredSoft
finalPredSoft
#np.savetxt('Yhat_finalensemble_test1.txt', np.vstack( (np.arange(len(finalPredSoft)) ,
#            finalPredSoft[:, 1]) ).T,
#            '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
# 0.61041 Kaggle using Random Forests
# Yet to test
#np.savetxt('Yhat_finalensemble_nb_test2.txt', np.vstack( (np.arange(len(finalPredSoft)) ,
#            finalPredSoft[:, 1]) ).T,
#            '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
#Kaggle score - 0.66042
#np.savetxt('Yhat_finalensemble_rf_test3.txt', np.vstack( (np.arange(len(finalPredSoft)) ,
#            finalPredSoft[:, 1]) ).T,
#            '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');
#Kaggle score - 0.64273
# Submit the probability-averaging ensemble
np.savetxt('Yhat_finalensemble_mean_test4.txt', np.vstack( (np.arange(len(meanPredSoft)) ,
meanPredSoft) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',');