Data dictionary:
BR_INST_EXEC.ALL_BRANCHES: Speculative and retired branches
Cycles (CPU_CLK_UNHALTED.THREAD_P): Thread cycles when thread is not in halt state
ICACHE.MISSES: Number of instruction cache, victim cache, and streaming buffer misses; uncacheable accesses are included
Instructions (INST_RETIRED.ANY_P): Number of instructions retired
IPC: Instructions/Cycles
ITLB_MISSES.MISS_CAUSES_A_WALK: Misses at all ITLB levels that cause a page walk
CYCLE_ACTIVITY.CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is outstanding
L1D.REPLACEMENT: L1D data line replacements
L2_cache_misses (L2_RQSTS.MISS): All requests that miss L2 cache
L2_cache_accesses (L2_RQSTS.REFERENCES): All L2 requests
MACHINE_CLEARS.COUNT: Number of machine clears (nukes) of any type
MACHINE_CLEARS.CYCLES: Cycles where there was a nuke (thread-specific and all thread)
MEM_LOAD_UOPS_RETIRED.L1_MISS: Retired load uops with L1 cache misses as data sources
MISALIGN_MEM_REF.LOADS: Speculative cache line split load uops dispatched to L1 cache
RESOURCE_STALLS.ANY: Resource-related stall cycles
UOPS_EXECUTED.CORE: Number of uops executed on the core
DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK: Load misses in all DTLB levels that cause page walks
UOPS_EXECUTED.THREAD: Counts the number of uops to be executed per thread each cycle
UOPS_ISSUED.ANY: Uops that resource allocation table (RAT) issues to reservation station (RS)
UOPS_ISSUED.STALL_CYCLES: Cycles when RAT does not issue uops to RS for the thread
UOPS_RETIRED.ALL: Actually retired uops
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import minmax_scale
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import LeaveOneOut, train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import mutual_info_classif
plt.style.use('ggplot')
np.random.seed(1) # To reproduce results
data = pd.read_pickle('Intermediate/Data_final')
print data.shape
data.head()
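Per the data dictionary above, IPC is a derived metric (Instructions/Cycles) rather than a raw counter. A minimal sketch recomputing it from the two underlying events, assuming the counter columns are named in the DataFrame exactly as they appear in the dictionary:
# Sketch: recompute the derived IPC column from the raw counters.
# The column names below are taken from the data dictionary and assumed to match the DataFrame.
ipc_check = data['Instructions (INST_RETIRED.ANY_P)'] / data['Cycles (CPU_CLK_UNHALTED.THREAD_P)']
print ipc_check.head()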
data['Vectorizable'].value_counts()
# Target class proportion
66./(66+85)
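The ratio above is hard-coded from the class counts; the same baseline proportion can be read directly off the target column, which stays correct if the data changes:
# Class proportions computed from the target column itself (equivalent to the hard-coded ratio above)
print data['Vectorizable'].value_counts(normalize=True)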
# Create training and test data splits
n_feats_orig = data.shape[1]-1
seed = 0
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, 1:n_feats_orig], data.iloc[:, n_feats_orig], \
random_state=seed, test_size=50)
X_all, y_all = data.iloc[:, 1:n_feats_orig], data.iloc[:, n_feats_orig]
print X_train.shape, X_test.shape, y_train.shape, y_test.shape, X_all.shape, y_all.shape
feats = X_train.columns.values.tolist()
print y_train.value_counts()
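The split above does not stratify on the target, so the 66/85 class ratio can drift between train and test. A sketch of a stratified variant, stored under different names so it does not overwrite the split used below:
# Sketch: stratified split that preserves the class ratio in both the train and test sets
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    data.iloc[:, 1:n_feats_orig], data.iloc[:, n_feats_orig],
    random_state=seed, test_size=50, stratify=data.iloc[:, n_feats_orig])
print y_train_s.value_counts()
print y_test_s.value_counts()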
# Get the names of all features
all_feats = X_train.columns.values.tolist()
# Calculate mutual information to filter features
# num_feats = 10
mut_info = mutual_info_classif(X_all, y_all, n_neighbors=10, random_state=seed)
# feats = [all_feats[idx] for idx in np.argsort(-mut_info)]
# feats = feats[:num_feats]
# Getting top p features based on a threshold
feats = [all_feats[idx] for idx in np.where(mut_info >= mut_info.mean())[0]] # Threshold = Mean
X_train = X_train[feats]
X_test = X_test[feats]
X_all = X_all[feats]
print X_train.shape, X_test.shape, y_train.shape, y_test.shape, X_all.shape, y_all.shape
print feats
n_feats = len(feats)
Variables selected (from the 10-neighbor mutual information calculation):
ICACHE.MISSES, IPC, L1D.REPLACEMENT, L2_cache_accesses (L2_RQSTS.REFERENCES), MISALIGN_MEM_REF.LOADS, RESOURCE_STALLS.ANY, UOPS_EXECUTED.CORE, DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK, UOPS_EXECUTED.THREAD, UOPS_RETIRED.ALL
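To see how each feature compares against the mean-score cutoff used above, the scores themselves can be plotted; a small sketch reusing the mut_info and all_feats values computed earlier:
# Sketch: mutual-information score of every candidate feature, with the mean threshold marked
order = np.argsort(-mut_info)
plt.figure(figsize=(9, 6))
plt.bar(np.arange(len(all_feats)), mut_info[order])
plt.axhline(mut_info.mean(), linestyle='--', color='k', label='Mean threshold')
plt.xticks(np.arange(len(all_feats)), [all_feats[i] for i in order], rotation=90)
plt.ylabel('Mutual information with Vectorizable')
plt.legend()
plt.show()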
# # Manual feature selection (based on visualization) - Unused
# feats = ['ICACHE.MISSES', 'Instructions (INST_RETIRED.ANY_P)', 'ITLB_MISSES.MISS_CAUSES_A_WALK', 'L1D.REPLACEMENT', \
# 'L2_cache_accesses (L2_RQSTS.REFERENCES)', 'RESOURCE_STALLS.ANY', 'UOPS_EXECUTED.CORE', \
# 'UOPS_ISSUED.STALL_CYCLES', 'MACHINE_CLEARS.CYCLES']
# X_train = X_train[feats]
# X_test = X_test[feats]
# X_all = X_all[feats]
# print X_train.shape, X_test.shape, y_train.shape, y_test.shape, X_all.shape, y_all.shape
# n_feats = len(feats)
np.random.seed(1)
rseed = 0
# Uncomment each model to test it individually
# clf = LogisticRegression(class_weight='balanced', C=0.1)
clf = KNeighborsClassifier(n_neighbors=5) #, weights = 'distance')
# clf = LogisticRegression(random_state = rseed)
# clf = GaussianNB()
# clf = GradientBoostingClassifier(random_state = rseed)
# clf = RandomForestClassifier(random_state = rseed)
# clf = SVC(kernel='linear', C=100, random_state=rseed)
print clf
cross_val_scores = cross_val_score(clf, X_all, y_all, cv=20)#, scoring='accuracy')
print "Individual cross val scores =", cross_val_scores
print "Mean cross val score =", np.mean(cross_val_scores)
np.random.seed(1)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
print "Pred Accuracy on train = ", accuracy_score(y_train, y_pred_train)
print "Pred Accuracy on test = ", accuracy_score(y_test, y_pred_test)
print classification_report(y_test, y_pred_test)
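confusion_matrix is imported above but never used; a one-line sketch applying it to the same test predictions breaks the accuracy down by class:
# Sketch: confusion matrix on the held-out test set (rows = true class, columns = predicted class)
print confusion_matrix(y_test, y_pred_test)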
# Distribution of the target class in the test data
y_test.value_counts()
# View feature importances. This cell assumes a model that exposes them:
# select a tree-based or linear model above (KNeighborsClassifier has
# neither feature_importances_ nor coef_).
# top_feat_idx = np.argsort(-np.abs(clf.coef_.squeeze())) # For logistic regression / linear SVM
top_feat_idx = np.argsort(-clf.feature_importances_) # For tree-based models (random forests, gradient boosting)
top_feats = [feats[idx] for idx in top_feat_idx]
print top_feats[:3]
# Plot the feature importances, sorted from most to least important
n_feats_plot = len(feats)+1
plt.figure(figsize=(9, 6))
# plt.bar(np.arange(1, n_feats_plot), np.abs(clf.coef_.squeeze()[top_feat_idx])) # SVM/logistic regression
plt.bar(np.arange(1, n_feats_plot), clf.feature_importances_[top_feat_idx]) # Tree-based models
plt.xticks(np.arange(1, n_feats_plot), top_feats, rotation=90) # Label the bars in the same sorted order
plt.title('Feature Importance plot')
plt.show()
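The cell above only works for models that expose coefficients or impurity-based importances. For KNeighborsClassifier, a model-agnostic alternative is a simple permutation check: shuffle one feature at a time in the test set and record the drop in accuracy. A minimal sketch, not part of the original analysis:
# Sketch: permutation importance, usable with any fitted classifier (including KNN)
np.random.seed(1)
baseline = accuracy_score(y_test, clf.predict(X_test))
perm_importance = []
for feat in feats:
    X_perm = X_test.copy()
    X_perm[feat] = np.random.permutation(X_perm[feat].values)
    perm_importance.append(baseline - accuracy_score(y_test, clf.predict(X_perm)))
for feat, drop in sorted(zip(feats, perm_importance), key=lambda t: -t[1]):
    print feat, drop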
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import EasyEnsemble
# seed = 0
# sm = SMOTE()
# X_train_new, y_train_new = sm.fit_sample(X_train, y_train)
# clf.fit(X_train_new, y_train_new)
# y_pred_train_new = clf.predict(X_train_new)
# y_pred_test = clf.predict(X_test)
# print "Pred Accuracy on train = ", accuracy_score(y_train_new, y_pred_train_new)
# print "Pred Accuracy on test = ", accuracy_score(y_test, y_pred_test)
# print classification_report(y_test, y_pred_test)
from random import choice
from sklearn.neighbors import NearestNeighbors
# Function to generate synthetic data samples using SMOTE
# (this definition shadows the SMOTE class imported from imblearn above)
def SMOTE(T, N=100, k=3):
    """
    Modified from
    https://stats.stackexchange.com/questions/215938/generate-synthetic-data-to-match-sample-data
    Returns (N/100) * n_minority_samples synthetic minority samples.
    Parameters
    ----------
    T : pandas DataFrame, shape = [n_minority_samples, n_features]
        Holds the minority samples
    N : percentage of new synthetic samples:
        n_synthetic_samples = N/100 * n_minority_samples. Values < 100 are treated as 100.
    k : int. Number of nearest neighbours.
    Returns
    -------
    S : pandas DataFrame, shape = [(N/100) * n_minority_samples, n_features]
    """
    n_minority_samples, n_features = T.shape
    colnames = T.columns.values.tolist()
    np.random.seed(0)  # Make the synthetic samples reproducible
    if N < 100:
        # Generate one synthetic sample per minority sample
        N = 100
    if (N % 100) != 0:
        raise ValueError("N must be < 100 or a multiple of 100")
    N = N // 100
    n_synthetic_samples = N * n_minority_samples
    S = np.zeros(shape=(n_synthetic_samples, n_features))
    # Learn nearest neighbours
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(T)
    # Calculate synthetic samples
    for i in xrange(n_minority_samples):
        nn = neigh.kneighbors(T.iloc[i].values.reshape([1, n_features]),
                              return_distance=False)
        for n in xrange(N):
            nn_index = choice(nn[0])
            # NOTE: nn includes T.iloc[i]; we don't want to select it
            while nn_index == i:
                nn_index = choice(nn[0])
            dif = T.iloc[nn_index] - T.iloc[i]
            gap = np.random.random()  # Random interpolation factor in [0, 1)
            S[n + i * N, :] = T.iloc[i, :] + gap * dif[:]
    S = pd.DataFrame(S, columns=colnames)
    return S
# # Combine X and y, remove the symbol name column
# data_all = data.iloc[:, 1:n_feats+1]
# print data_all.shape
# Generate samples using only X_train
data_train = X_train.copy()
data_train['Vectorizable'] = y_train
data_train.shape
# new_samples = SMOTE(data_train)
# Load already generated samples
new_samples = pd.read_csv('Intermediate/smote_samples.csv')
new_samples = new_samples.drop('Unnamed: 0', axis=1)
print new_samples.shape
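The stored samples above were generated by passing the entire training frame (majority rows and the target column included) to the SMOTE() function, as the commented call shows. A sketch of the more conventional usage, restricting the generator to minority-class rows only; this is an alternative, not how smote_samples.csv was produced:
# Sketch: oversample only the minority class and reattach its label afterwards
minority_label = y_train.value_counts().idxmin()
minority = data_train.loc[data_train['Vectorizable'] == minority_label, feats]
minority_syn = SMOTE(minority, N=100, k=3)
minority_syn['Vectorizable'] = minority_label
print minority_syn.shape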
# Compare the distributions of the training set and the new data samples
X_train.describe()
new_samples.describe()
# Round the interpolated target back to binary labels, then check its distribution
new_samples.loc[new_samples['Vectorizable'] > 0.5, 'Vectorizable'] = 1
new_samples.loc[new_samples['Vectorizable'] <= 0.5, 'Vectorizable'] = 0
new_samples['Vectorizable'].value_counts()
# Combine the data
data_combined = pd.concat((data_train, new_samples))
print data_combined.shape
data_combined.head()
# Test the predictive models
# clf = RandomForestClassifier(random_state = rseed)
# clf = LogisticRegression(class_weight='balanced', C=0.1)
clf = KNeighborsClassifier(n_neighbors=5) #, weights = 'distance')
X_all_new, y_all_new = data_combined.iloc[:, :n_feats], data_combined.iloc[:, n_feats]
print X_all_new.shape, y_all_new.shape
cross_val_scores_new = cross_val_score(clf, X_all_new, y_all_new, cv=20)
print "Individual cross val scores =", cross_val_scores_new
print "Mean cross val score =", np.mean(cross_val_scores_new)
np.random.seed(1)
clf.fit(X_all_new, y_all_new)
y_pred = clf.predict(X_all)
y_pred_test = clf.predict(X_test)
print "Pred Accuracy on original data = ", accuracy_score(y_all, y_pred)
print "Pred Accuracy on test data = ", accuracy_score(y_test, y_pred_test)
print classification_report(y_all, y_pred)
# Store the generated data samples
# new_samples.to_csv('Intermediate/smote_samples.csv')
References:
https://cran.r-project.org/web/packages/synthpop/vignettes/synthpop.pdf
https://www.jstatsoft.org/article/view/v074i11
# Store the training data (use it in the R code to generate data samples using synthpop)
data_train.to_csv('Intermediate/data_train.csv')
# Load the synthetically generated data
syn_data = pd.read_csv('Intermediate/syn_data2.csv')
syn_data = syn_data.drop('Unnamed: 0', axis=1)
print syn_data.shape
syn_data.head()
syn_data.columns = data_train.columns # Column names might have become a bit messed up in R
data_combined = pd.concat((data_train, syn_data))
print data_combined.shape
data_combined.head()
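Before fitting models on the synthpop output, a quick check (in the spirit of the describe() comparison done for the SMOTE samples above) can flag obvious distribution mismatches between real and synthetic data:
# Sketch: compare per-feature means of the real training data and the synthpop samples
print data_train[feats].mean()
print syn_data[feats].mean()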
# Test predictive models
# clf = RandomForestClassifier(random_state = rseed)
# clf = LogisticRegression(class_weight='balanced', C=0.1)
clf = KNeighborsClassifier(n_neighbors=5) #, weights = 'distance')
X_all_new, y_all_new = data_combined.iloc[:, :n_feats], data_combined.iloc[:, n_feats]
print X_all_new.shape, y_all_new.shape
cross_val_scores_new = cross_val_score(clf, X_all_new, y_all_new, cv=20)
print "Individual cross val scores =", cross_val_scores_new
print "Mean cross val score =", np.mean(cross_val_scores_new)
np.random.seed(1)
clf.fit(X_all_new, y_all_new)
y_pred = clf.predict(X_all)
y_pred_test = clf.predict(X_test)
print "Pred Accuracy on original data = ", accuracy_score(y_all, y_pred)
print "Pred Accuracy on test data = ", accuracy_score(y_test, y_pred_test)
print classification_report(y_all, y_pred)