# Source code for strkernel.motifkernel

#!/usr/bin/env python3
'''
Motif Kernel Module.
'''
# standard libraries
import re

# own libraries
from strkernel.lib.motiftrie import MotifTrie

# 3rd party libraries
import numpy as np
from scipy.sparse import csr_matrix


class motifKernel:
    """
    This is the main class used for the motif kernel construction.

    The idea is to construct a Trie from a set of motifs and then use this
    Trie to compute the motif content of a sequence. The motif content can
    then be used to compute the similarity between sequences and therefore
    is able to serve as input for machine learning algorithms.

    Example:
        An elaborate example can be found in the ``Tutorials`` sections.

    Standard Use::

        motifKernel(motifs)
    """

    # Matches every character that is NOT an upper-case ASCII letter.
    # Compiled once so it is not looked up again on every call.
    _NON_UPPERCASE = re.compile(r'[^A-Z]')

    def __init__(self, motifs: [str]):
        """
        Build the motif Trie that all later searches run against.

        Args:
            **motifs:** The motifs handed to ``MotifTrie``.
        """
        self.motif_trie = MotifTrie(motifs)

    def compute_matrix(self, sequences: [str], include_flanking: bool = True,
                       return_kernel_matrix: bool = False):
        """
        Computes the motif content of a set of sequences and returns a sparse
        matrix which can be used as input for machine learning approaches.
        The sparse matrix has only been tested with algorithms from the
        python package *sklearn*. Optional parameters allow the computation
        of a kernel matrix and inclusion of flanking regions.

        Args:
            **sequences:** A list of strings that are of the same alphabet as
            the motifs used to construct the motif Trie.

            **include_flanking:** Option to include or disregard the flanking
            regions. If True every sequence is upper-cased as a whole; if
            False every character that is not an upper-case letter ``A-Z``
            is removed before the search. Default is True.

            **return_kernel_matrix:** A boolean value that indicates if the
            function should return a sparse matrix with the similarities
            between sequences (True) or a sparse matrix where each row
            contains the motif content of a sequence (False). Default is
            False.

        Returns:
            **csr_matrix:** A sparse matrix object containing either the
            kernel matrix (*return_kernel_matrix* = True) or the motif
            content of each sequence. For an empty ``sequences`` input an
            empty 0 x 0 matrix is returned in both modes.
        """
        if include_flanking:
            prepared = [seq.upper() for seq in sequences]
        else:
            prepared = [self._NON_UPPERCASE.sub('', seq) for seq in sequences]

        # Without any sequence there is no search result to take the column
        # count from, and the former einsum call failed on the resulting
        # 1-D input. Return a well-formed empty matrix instead.
        if not prepared:
            return csr_matrix((0, 0))

        # NOTE(review): check_for_motifs is assumed to return one numeric
        # vector of fixed length per sequence - confirm against MotifTrie.
        search_results = [self.motif_trie.check_for_motifs(sequence)
                          for sequence in prepared]
        motif_content = csr_matrix(search_results)

        if return_kernel_matrix:
            # Same product as einsum('ij,kj->ik', X, X), i.e. X * X^T, but
            # computed on the sparse feature matrix so no dense intermediate
            # has to be built first.
            return motif_content.dot(motif_content.T)
        return motif_content