Source code for strkernel.gappy_kernel

#!/usr/bin/env python3
'''
Implementation of the gappy kernel.
'''
import numpy as np
import scipy
from Bio.Seq import Seq
from scipy.sparse import csr_matrix


sequenceTypes={'dna':0,'rna':1,'aa':2,'aa+s':3}
# DNA/RNA, Amino acids (all 20), Amino acids selenocystein
alphabets=['ACGT','ACGU','ACDEFGHIKLMNPQRSTVWY','ACDEFGHIKLMNPQRSTUVWY']

def get_numbers_for_sequence(sequence,t=0,reverse=False):
    try:
        ori=[alphabets[t].index(x) for x in sequence]
    except ValueError:
        return [-1]
    if reverse:
        rev=[alphabets[t].index(x) for x in sequence.reverse_complement()]
        if ori>rev:
            return rev
    return ori


def _extract_gappy_sequence(sequence, k, g,t=0,reverse=False):
    """Compute gappypair-spectrum for a given sequence, k-mer length k and
    gap length g. A 2*k-mer with gap is saved at the same position as a 2*k-mer
    without a gap.
    The idea is to first create a vector of the given size (4**(2*k)) and then
    transform each k-mer to a sequence of length k of numbers 0-3
    (0 = A, 1 = C, 2 = G, 3 = U).
    From there, we can multiply that sequence with a vector of length 2*k,
    containing the exponents of 4 to calculate the position in the spectrum.
    Example: AUUC -> 0331 -> 4**0*1 + 4**1*3 + 4**2*3 + 4**3*0
    """
    n = len(sequence)
    kk=2*k
    alphabet=len(alphabets[t])
    powersize=np.power(alphabet, (kk))
    multiplier = np.power(alphabet, range(kk))[::-1]
    spectrum = np.zeros((powersize))
    for pos in range(n - kk + 1):
            pos_in_spectrum = np.sum(multiplier * get_numbers_for_sequence(sequence[pos:pos+(kk)],t,reverse=reverse))
            spectrum[pos_in_spectrum] += 1
            for gap in range(1,g+1):
                if (pos+gap+kk)<=n:
                    pos_gap = np.sum(multiplier * get_numbers_for_sequence(sequence[pos:pos+k] + sequence[pos+k+gap:pos+gap+kk],t,reverse=reverse))
                    spectrum[pos_gap] += 1
    return spectrum

def _extract_spectrum_sequence(sequence, k,t=0,reverse=False):
    """Compute k-spectrum for a given sequence, k-mer length k.
    This method computes the spectrum for a given sequence and k-mer-length k.
    The idea is to first create a vector of the given size (4**k) and then
    transform each k-mer to a sequence of length k of numbers 0-3
    (0 = A, 1 = C, 2 = G, 3 = U).
    From there, we can multiply that sequence with a vector of length k,
    containing the exponents of 4 to calculate the position in the spectrum.
    Example: AUUC -> 0331 -> 4**0*1 + 4**1*3 + 4**2*3 + 4**3*0
    """
    n = len(sequence)
    alphabet=len(alphabets[t])
    spectrum = np.zeros(np.power(alphabet, k))
    multiplier = np.power(alphabet, range(k))[::-1]
    for pos in range(n - k + 1):
            pos_in_spectrum = np.sum(multiplier * get_numbers_for_sequence(sequence[pos:pos+k],t,reverse))
            spectrum[pos_in_spectrum] += 1
    return spectrum

def _extract_gappy_sequence_different(sequence, k, g,t=0,reverse=False):
    """Compute gappypair-spectrum for a given sequence, k-mer length k and
    gap length g. A 2*k-mer with a certain gap size is saved at a different
    position than the same 2*k-mer with no gaps or another number of gaps.
    """
    n = len(sequence)
    kk=2*k
    alphabet=len(alphabets[t])
    powersize=np.power(alphabet, (kk))
    multiplier = np.power(alphabet, range(kk))[::-1]
    spectrum = np.zeros((g+1)*(powersize))
    for pos in range(n - kk + 1):
            pos_in_spectrum = np.sum(multiplier * get_numbers_for_sequence(sequence[pos:pos+(kk)],t,reverse=reverse))
            spectrum[pos_in_spectrum] += 1
            if (pos+g+kk)<=n:
                for gap in range(1,g+1):
                    pos_gap = np.sum(multiplier * get_numbers_for_sequence(sequence[pos:pos+k] + sequence[pos+k+gap:pos+gap+kk],t,reverse=reverse))
                    spectrum[(gap*(powersize))+pos_gap] += 1
    return spectrum

[docs]def gappypair_kernel(sequences, k, g=0,t=0,sparse=True, reverse=False, include_flanking=False, gapDifferent = True): """Compute gappypair-kernel for a set of sequences using k-mer length k and gap size g. The result than can be used in a linear SVM or other classification algorithms. Parameters: ---------- sequences: A list of Biopython sequences k: Integer. The length of kmers to consider g: Integer. Gapps allowed. 0 by default. t: Which alphabet according to sequenceTypes. Assumes Dna (t=0). sparse: Boolean. Output as sparse matrix? True by default. reverse: Boolean. Reverse complement taken into account? False by default. include_flanking: Boolean. Include flanking regions? (the lower-case letters in the sequences given) gapDifferent: Boolean. If k-mers with different gaps should be threated differently or all the same. True by default. Returns: ------- A numpy array of shape (N, 4**k), containing the k-spectrum for each sequence. N is the number of sequences and k the length of k-mers considered. """ spectrum = [] for seq in sequences: if include_flanking: seq = seq.upper() else: seq = Seq("".join([x for x in seq if 'A' <= x <= 'Z'])) if (g>0) and gapDifferent: spectrum.append(_extract_gappy_sequence_different(seq, k, g, t = t, reverse = reverse)) elif g>0: spectrum.append(_extract_gappy_sequence(seq, k, g, t = t, reverse = reverse)) else: spectrum.append(_extract_spectrum_sequence(seq, k, t = t, reverse = reverse)) if sparse: return csr_matrix(spectrum) return np.array(spectrum)