Gappy Kernel / SVM TutorialΒΆ

[1]:
import time
from strkernel import gappy_kernel as gk
from strkernel import gappy_trie as gt
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from Bio.Seq import Seq
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve, precision_score
[3]:
# Reads in files
def read(fname):
    sequences=[]
    with open(fname,'r') as f:
        for line in f:
            if line[0]!='>':
                sequences.append(line.split()[0])
    return sequences

pos=[Seq(x) for x in read('notebook_data/positive_IGF2BP123.fasta')]
neg=[Seq(x) for x in read('notebook_data/negative_IGF2BP123.fasta')]
start = time.time()
k=1
g=2
start = time.time()
spectrum_pos = gk.gappypair_kernel(pos, k=k, g=g,  include_flanking=False, gapDifferent = True, sparse = True)
print ("Calculated {}-gappypair in {} seconds".format(k, time.time() - start))
start = time.time()
spectrum_neg = gk.gappypair_kernel(neg,k=k, g=g,  include_flanking=False, gapDifferent = True, sparse = True)
print ("Calculated {}-gappypair in {} seconds".format(k, time.time() - start))
start = time.time()
spectrum_pos3 = gk.gappypair_kernel(pos, k=k, g=g,  include_flanking=False, gapDifferent = False, sparse = True)
print ("Calculated {}-gappypair in {} seconds".format(k, time.time() - start))
start = time.time()
spectrum_neg3 = gk.gappypair_kernel(neg, k=k, g=g,  include_flanking=False, gapDifferent = False, sparse = True)
print ("Calculated {}-gappypair in {} seconds".format(k, time.time() - start))
Calculated 1-gappypair in 12.433693647384644 seconds
Calculated 1-gappypair in 12.717294216156006 seconds
Calculated 1-gappypair in 12.820235013961792 seconds
Calculated 1-gappypair in 12.454049587249756 seconds
[4]:
from scipy.sparse import coo_matrix, vstack
y = np.concatenate((np.ones(spectrum_pos.shape[0]), -np.ones(spectrum_neg.shape[0])))
X =  vstack([spectrum_pos,spectrum_neg]).toarray()
y3 = np.concatenate((np.ones(spectrum_pos3.shape[0]), -np.ones(spectrum_neg3.shape[0])))
X3 =  vstack([spectrum_pos3,spectrum_neg3]).toarray()
[5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=42,stratify=y)
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3,y3,test_size=0.1,random_state=42,stratify=y3)
[6]:
start = time.time()
clf = SVC(C=0.1, kernel='linear', probability=True)
clf.fit(X_train, y_train)
print ("Trained linear SVM on {}-spectrum in {} seconds".format(k, time.time() - start))
Trained linear SVM on 1-spectrum in 127.65080642700195 seconds
[7]:
start = time.time()
clf3 = SVC(C=0.1, kernel='linear', probability=True)
clf3.fit(X_train3, y_train3)
print ("Trained linear SVM on {}-spectrum in {} seconds".format(k, time.time() - start))
Trained linear SVM on 1-spectrum in 115.81935048103333 seconds
[8]:
y_score = clf.predict_proba(X_test)
roc_auc = roc_auc_score(y_score=y_score[:,1], y_true=y_test)
tpr, fpr, _ = roc_curve(y_score=y_score[:,1], y_true=y_test)
y_score3 = clf3.predict_proba(X_test3)
roc_auc3 = roc_auc_score(y_score=y_score3[:,1], y_true=y_test3)
tpr3, fpr3, _ = roc_curve(y_score=y_score3[:,1], y_true=y_test3)
fig = plt.figure(figsize=(14, 8))
plt.plot(tpr, fpr, label='Performance Gappy Pair simple Kernel (AUC: {:0.2f})'.format(roc_auc), lw=4, color='orange')
plt.plot(tpr3, fpr3, label='Performance Gappy Pair simple Kernel gapDifferent false (AUC: {:0.2f})'.format(roc_auc3),lw=4, color='green')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1], color='navy', lw=4, linestyle='--', label='random')
plt.legend(loc='lower right')
fig.savefig('./Gappy_Kernel.pdf')
_images/Gappy_Kernel_IGF_Example_7_0.png
[9]:
# Necessary, because the gappy pair trie kernel at the moment can't handle N nucleotides
def remove_n(seqs):
    data=[]
    for x in seqs:
        if Seq('N') in x:
            pass
        else:
            data.append(x)
    return data

neg_without_n=remove_n(neg)

start = time.time()
spectrum_pos2 = gt.gappypair_kernel(pos, k, 0, g=g,  include_flanking=False, gapDifferent = False)
print ("Calculated {}-gappypair trie in {} seconds".format(k, time.time() - start))
start = time.time()
spectrum_neg2 = gt.gappypair_kernel(neg_without_n, k, 0, g=g,  include_flanking=False,gapDifferent = False)
print ("Calculated {}-gappypair trie in {} seconds".format(k, time.time() - start))
Calculated 1-gappypair trie in 234.3155553340912 seconds
Calculated 1-gappypair trie in 229.59363436698914 seconds
[6]:
from scipy.sparse import coo_matrix, vstack
X2 =  vstack([spectrum_pos2,spectrum_neg2]).toarray()
y2 =  np.concatenate((np.ones(spectrum_pos2.shape[0]), -np.ones(spectrum_neg2.shape[0])))
[7]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2,y2,test_size=0.1,random_state=42,stratify=y2)
[8]:
start = time.time()
clf2 = SVC(C=0.1, kernel='linear', probability=True)
clf2.fit(X_train2, y_train2)
print ("Trained linear SVM on {}-spectrum in {} seconds".format(k, time.time() - start))
Trained linear SVM on 1-spectrum in 166.927264213562 seconds
[19]:
y_score2 = clf2.predict_proba(X_test2)
roc_auc2 = roc_auc_score(y_score=y_score2[:,1], y_true=y_test2)
tpr, fpr, _ = roc_curve(y_score=y_score2[:,1], y_true=y_test2)
fig = plt.figure(figsize=(14, 8))
plt.plot(tpr, fpr, label='Performance Gappy Pair trie Kernel (AUC: {:0.2f})'.format(roc_auc2),         lw=4, color='orange')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1], color='navy', lw=4, linestyle='--', label='random')
plt.legend(loc='lower right')
[19]:
<matplotlib.legend.Legend at 0x7f63aa9c3fd0>
_images/Gappy_Kernel_IGF_Example_12_1.png
[3]:
start = time.time()
gk.gappypair_kernel(pos, k=k, g=g,  include_flanking=False, reverse=True, gapDifferent = True)
print ("Calculated {}-gappypair in {} seconds".format(k, time.time() - start))
start = time.time()
gk.gappypair_kernel(pos,k=k, g=g,  include_flanking=True, gapDifferent = True)
print ("Calculated {}-gappypair in {} seconds".format(k, time.time() - start))
start = time.time()
gk.gappypair_kernel(pos, k=k, g=g,  include_flanking=False, reverse =True, gapDifferent = False)
print ("Calculated {}-gappypair in {} seconds".format(k, time.time() - start))
start = time.time()
gk.gappypair_kernel(pos, k=k, g=g,  include_flanking=True, gapDifferent = False)
print ("Calculated {}-gappypair in {} seconds".format(k, time.time() - start))
start = time.time()
gk.gappypair_kernel(pos, k=k, g=4,  include_flanking=False, gapDifferent = True)
print ("Calculated {}-gappypair in {} seconds".format(k, time.time() - start))
start = time.time()
gk.gappypair_kernel(pos, k=k, g=4,  include_flanking=False, gapDifferent = False)
print ("Calculated {}-gappypair in {} seconds".format(k, time.time() - start))
Calculated 1-gappypair in 21.7454092502594 seconds
Calculated 1-gappypair in 242.06059432029724 seconds
Calculated 1-gappypair in 21.176217317581177 seconds
Calculated 1-gappypair in 241.81857538223267 seconds
Calculated 1-gappypair in 32.68045377731323 seconds
Calculated 1-gappypair in 33.1010320186615 seconds
[ ]:

[3]:

[41]:

[ ]:

[ ]:

[ ]:

[ ]:

[ ]:

[ ]: