The data consists of 48x48 pixel grayscale images of faces. The faces have been automatically registered so that the face is more or less centered and occupies about the same amount of space in each image. The task is to categorize each face based on the emotion shown in the facial expression into one of seven categories (0=Angry, 1=Disgust, 2=Fear, 3=Happy, 4=Sad, 5=Surprise, 6=Neutral).
train.csv contains two columns, "emotion" and "pixels". The "emotion" column contains a numeric code ranging from 0 to 6, inclusive, for the emotion that is present in the image. The "pixels" column contains a string surrounded in quotes for each image. The contents of this string are space-separated pixel values in row-major order. test.csv contains only the "pixels" column and your task is to predict the emotion column.
The training set consists of 28,709 examples. The public test set used for the leaderboard consists of 3,589 examples. The final test set, which was used to determine the winner of the competition, consists of another 3,589 examples.
This dataset was prepared by Pierre-Luc Carrier and Aaron Courville, as part of an ongoing research project. They have graciously provided the workshop organizers with a preliminary version of their dataset to use for this contest.
%matplotlib inline
import graphlab as gl
import numpy as np
import matplotlib.pyplot as plt
from timeit import default_timer as timer
from scipy.stats import mode
from sklearn.decomposition import PCA
trainfile = 'train.txt'
testfile = 'test.txt'
pri_testfile = 'pri_test.txt'


def _read_image_list(listfile):
    """Load the images and labels named in an index file.

    Each non-blank line of *listfile* is expected to hold an image path
    followed by its label, separated by whitespace.

    Returns a ``(data, labels)`` pair: ``data`` is a list with one flattened
    48*48-element array per image, and ``labels`` is the list of label
    strings exactly as they appear in the file (they are NOT converted to
    integers, so later comparisons are string comparisons).
    """
    data = []
    labels = []
    with open(listfile, 'r') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF);
                # unpacking an empty split() would raise ValueError.
                continue
            imgpath, label = line.split()
            labels.append(label)
            data.append(np.reshape(plt.imread(imgpath), (48*48)))
    return data, labels


# Read training data and labels into lists
traindata, trainlabels = _read_image_list(trainfile)
# Read public test data and labels into lists
testdata, testlabels = _read_image_list(testfile)
# Read private test data and labels into lists
pri_testdata, pri_testlabels = _read_image_list(pri_testfile)
def _pixel_columns(images):
    """Turn a list of flattened images into a column-wise dictionary.

    *images* is a list of 48*48-element arrays.  The result maps the column
    name ``"X<i>"`` to the list of pixel ``i``'s value across all images, in
    image order.  Values are converted to plain Python scalars.
    """
    # reshape() keeps the (n, 48*48) layout even when the list is empty.
    pixels = np.asarray(images).reshape(len(images), 48*48)
    # Column slicing replaces the per-element Python loop.
    return {"X" + str(i): pixels[:, i].tolist() for i in range(48*48)}


# NOTE: labels are deliberately not added to these dictionaries; they stay
# in the separate trainlabels / testlabels / pri_testlabels lists.

# Create training dictionary holding the data column-wise (one key per pixel)
finaltrain = _pixel_columns(traindata)
# Create public test dictionary holding the data column-wise
finaltest = _pixel_columns(testdata)
# Create private test dictionary holding the data column-wise
final_pritest = _pixel_columns(pri_testdata)
# Wrap each column dictionary in an SFrame (GraphLab's efficient data structure)
sf_train, sf_test, sf_pri_test = map(
    gl.SFrame, (finaltrain, finaltest, final_pritest))

# Persist the three SFrames to disk
for frame, target in ((sf_train, 'SFrame/sf_train'),
                      (sf_test, 'SFrame/sf_test'),
                      (sf_pri_test, 'SFrame/sf_pri_test')):
    frame.save(target)

# Read them back to verify that the saved copies load
sf_train, sf_test, sf_pri_test = map(
    gl.load_sframe,
    ('SFrame/sf_train', 'SFrame/sf_test', 'SFrame/sf_pri_test'))
# Delete objects that are not required
# del traindata, testdata, currfile, currlabel
import os
start = timer()
# Standardize training and test data
# Use training mean and std dev for test data - normalize test data first
# (or else train will get changed)
for c in sf_train.column_names():
sf_test[c] = (sf_test[c] - sf_train[c].mean()) / sf_train[c].std()
sf_pri_test[c] = (sf_pri_test[c] - sf_train[c].mean()) / sf_train[c].std()
sf_train[c] = (sf_train[c] - sf_train[c].mean()) / sf_train[c].std()
end = timer()
print "Time elapsed = ", (end - start), "seconds"
os.system("say 'Your program has finished'")
# Verify means and std devs
print sf_pri_test['X1'].mean()
print sf_test['X1'].mean()
print sf_train['X1'].mean()
print sf_test['X1'].std()
print sf_train['X1'].std()
print sf_pri_test['X1'].std()
sf_train.save('SFrame/sf_train_standardized')
sf_test.save('SFrame/sf_test_standardized')
sf_pri_test.save('SFrame/sf_pri_test_standardized')
print sf_train.shape
print sf_test.shape
print sf_pri_test.shape
start = timer()
sf_train_numpy = sf_train.to_numpy()
sf_test_numpy = sf_test.to_numpy()
sf_pri_test_numpy = sf_pri_test.to_numpy()
print sf_train_numpy.shape # Verify shape
print sf_test_numpy.shape # Verify shape
print sf_pri_test_numpy.shape # Verify shape
end = timer()
print "Time elapsed = ", (end - start), "seconds"
os.system("say 'Your program has finished'")
# Save the numpy versions of the objects
np.save('SFrame/nptrain', sf_train_numpy)
np.save('SFrame/nptest', sf_test_numpy)
np.save('SFrame/np_pritest', sf_pri_test_numpy)
sf_train_numpy = np.load('SFrame/nptrain.npy')
sf_test_numpy = np.load('SFrame/nptest.npy')
sf_pri_test_numpy = np.load('SFrame/np_pritest.npy')
num_comp = 120 # How many components
pca = PCA(n_components = num_comp, svd_solver='full')
pca.fit(sf_train_numpy)
# Percentage of variance in the data explained by the components
print sum(pca.explained_variance_ratio_)
50 components ==> 83.4%
100 components ==> 89%
120 components ==> 90.4%
160 components ==> 92.2%
500 components ==> 97.4%
# Reduce dimensionality of data using the principal components
dimred_train = pca.transform(sf_train_numpy)
dimred_test = pca.transform(sf_test_numpy)
dimred_pri_test = pca.transform(sf_pri_test_numpy)
print dimred_train.shape
print dimred_test.shape
print dimred_pri_test.shape
from sklearn.manifold import TSNE
# 2-D t-SNE embedding of the PCA-reduced public test set
test = TSNE(2)
test.fit(dimred_test)
# The labels were read from a text file, so they are strings.  Convert them
# to integer codes so matplotlib maps them through the colormap instead of
# trying to interpret each string as a colour specification.
label_codes = np.asarray(testlabels, dtype=int)
plt.scatter(test.embedding_[:, 0], test.embedding_[:, 1], c=label_codes)
# Convert PCA transformed data into SFrames
dimred_sf_test = gl.SFrame(map(gl.SArray, dimred_test.T))
dimred_sf_train = gl.SFrame(map(gl.SArray, dimred_train.T))
dimred_sf_pri_test = gl.SFrame(map(gl.SArray, dimred_pri_test.T))
# Verify shape
print dimred_sf_test.shape, dimred_sf_train.shape, dimred_sf_pri_test.shape
# Build the nearest-neighbour model on the PCA-reduced training data.
# For numeric data the 'distance' options are euclidean, manhattan, cosine,
# and transformed_dot_product.
model = gl.nearest_neighbors.create(dimred_sf_train, distance='cosine')
model.summary()

# Smoke test: query the first five training rows
train_sample = dimred_sf_train[:5]
knn = model.query(train_sample, k=5)
knn.head()

# Smoke test: query a single public test row and list its 15 neighbours
test_sample = dimred_sf_test[10:11]
knn = model.query(test_sample, k=15, verbose=False)
knn.print_rows(15)
# Predict on entire public and private test data
start = timer()
testpred = []
pri_testpred = []
num_nbr = 11
for i in range(len(dimred_sf_test)):
print i,
# Public test
curr_preds = []
knn = model.query(dimred_sf_test[i:(i+1)], k=num_nbr, verbose=False)
curr_preds = np.array(knn['reference_label'])
mode_preds = mode((np.take(trainlabels, curr_preds)))[0][0]
testpred.append(mode_preds)
# Private test
curr_preds = []
knn = model.query(dimred_sf_pri_test[i:(i+1)], k=num_nbr, verbose=False)
curr_preds = np.array(knn['reference_label'])
mode_preds = mode((np.take(trainlabels, curr_preds)))[0][0]
pri_testpred.append(mode_preds)
end = timer()
print "Time elapsed = ", (end - start), "seconds"
os.system("say 'Your program has finished'")
# Verify shapes
print len(dimred_sf_test), len(testpred), len(testlabels), len(dimred_sf_pri_test), len(pri_testlabels), \
len(pri_testpred)
# Accuracy of predictions
print np.sum(np.array(testpred) == np.array(testlabels))/ float(len(testlabels))
print np.sum(np.array(pri_testpred) == np.array(pri_testlabels))/ float(len(pri_testlabels))
Results
PCA with 100 components; knn with cosine similarity; 5 neighbors; Test accuracy = 36.8626%
PCA with 100 components; knn with cosine similarity; 10 neighbors; Test accuracy = 36.5%
PCA with 500 components; knn with cosine similarity; 5 neighbors; Test accuracy = 36.08%
PCA with 50 components; knn with cosine similarity; 11 neighbors; Test accuracy = 36.47%
PCA with 50 components; knn with euclidean; 7 neighbors; Test accuracy = 34.44%
PCA with 50 components; knn with euclidean; 11 neighbors; Test accuracy = 34.74%
PCA with 50 components; knn with manhattan; 15 neighbors; Test accuracy = 36.388%
PCA with 120 components; knn with cosine similarity; 11 neighbors; Public Test accuracy = 37.14%; Private Test accuracy = 36.75%
PCA with 160 components; knn with cosine similarity; 11 neighbors; Test accuracy = 35.63%
PCA with 160 components; knn with cosine similarity; 5 neighbors; Test accuracy = 36.42%
# Keep the predictions of this run under descriptive names and write them
# to disk (np.save appends the .npy suffix)
test_37percent_120cos11, pritest_37percent_120cos11 = testpred, pri_testpred
for target, predictions in (
        ('SFrame/test_37percent_120cos11', test_37percent_120cos11),
        ('SFrame/pritest_37percent_120cos11', pritest_37percent_120cos11)):
    np.save(target, predictions)