Source code for pypath.utils.seq

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

"""
Utilities for working with protein sequences.
"""

from __future__ import annotations

from future.utils import iteritems

import os
import sys
import re
import collections

import pypath.share.common as common
import pypath.inputs.pfam as pfam_input
import pypath.inputs.uniprot as uniprot_input
import pypath.inputs.uniprot_db as uniprot_db
import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.utils.taxonomy as taxonomy


[docs] def swissprot_seq( organism: str | int = 9606, reviewed: bool = True, isoforms: bool = False, ) -> dict[str, Seq]: """ All UniProt sequences for an organism. Loads all sequences for an organism, optionally for all isoforms, by default only first isoform. Args: organism: Name or NCBI Taxonomy ID of the organism. reviewed: Load only reviewed (SwissProt) sequences. isoforms: Load all isoforms, not only the first one. Returns: A dict with UniProt IDs as keys and `Seq` objects as values. """ data = uniprot_input.uniprot_data( fields = 'sequence', organism = organism, reviewed = reviewed or None, ) result = {u: Seq(u, s) for u, s in data.items()} if isoforms: data = get_isoforms(organism = organism) for unip, isoforms in iteritems(data): for isof, seq in iteritems(isoforms): if unip in result: result[unip].add_seq(seq, isof) return result
[docs] def get_isoforms(organism = 9606): """ Loads UniProt sequences for all isoforms. """ if organism in taxonomy.phosphoelm_taxids: organism = taxonomy.phosphoelm_taxids[organism] reorg = re.compile(r'OS=([A-Z][a-z]+\s[a-z]+)') result = {} url = urls.urls['unip_iso']['url'] c = curl.Curl(url, silent=False) data = c.result data = read_fasta(data) for header, seq in iteritems(data): org = reorg.findall(header) if len(org) > 0 and org[0] == organism: prot = header.split('|')[1].split('-') unip = prot[0] isof = int(prot[1]) if unip not in result: result[unip] = {} result[unip][isof] = seq return result
[docs] def read_fasta(fasta): """ Parses a fasta file. Returns dict with headers as keys and sequences as values. """ result = {} fasta = re.split(r'\n>', fasta) for section in fasta: section = section.strip().split('\n') label = section.pop(0) seq = ''.join(section) result[label] = seq return result
[docs] class Resource(object):
[docs] def __init__(self, loader, name = None): """ Represents a resource of sequence features, e.g. domains, motifs or post-translational modification sites. """ self.loader = loader self.db = {} self.name = name
[docs] def load(self, ncbi_tax_id = 9606): """ Loads the data from the resource for a given organism. """ if ncbi_tax_id not in self.db: self.db[ncbi_tax_id] = list( self.processor(self.loader(ncbi_tax_id = ncbi_tax_id)) )
[docs] def unload(self, ncbi_tax_id = None): """ Removes data in order to free up memory. """ if ncbi_tax_id in self.db: del self.db[ncbi_tax_id] elif ncbi_tax_id is None: self.db = {}
[docs] def processor(self, raw): """ Preprocesses the features loaded from a resource. """ for feature in raw: yield feature
[docs] def iterprotein(self, uniprot, ncbi_tax_id = 9606): """ Iterates over the features of one protein. """ self.load(ncbi_tax_id) if uniprot in self.db[ncbi_tax_id]: for feature in self.db[ncbi_tax_id][uniprot]: yield feature
[docs] def iterdb(self, ncbi_tax_id = 9606): """ Iterates over all proteins and features of one organism. """ self.load(ncbi_tax_id) for uniprot in self.db[ncbi_tax_id]: for feature in self.iterprotein(uniprot, ncbi_tax_id): yield (uniprot, feature)
[docs] class Pfam(Resource):
[docs] def __init__(self): """ Provides Pfam domains as sequence features. """ Resource.__init__(self, loader = None, name = 'Pfam') def loader(ncbi_tax_id = 9606): all_up = uniprot_db.all_uniprots(organism = ncbi_tax_id) return ( pfam_input.pfam_regions( uniprots = all_up, value = 'uniprot', keepfile = True, ) ) self.loader = loader
[docs] class Seq(object):
[docs] def __init__(self, protein, sequence, isoform=1): """ This class is to look up or match residues and regions in sequences of proteins, by default in the canonical sequence, and optionally in other isoforms. """ self.isof = {} self.protein = protein self.canonical = isoform self.add_seq(sequence, isoform)
def add_seq(self, sequence, isoform): self.isof[isoform] = sequence def match(self, pattern, start, end=None, isoform=None): instance = self.get(start, end, isoform) pattern = pattern.upper() return instance == pattern def get(self, start, end=None, isoform=None): isoform = isoform if isoform is not None else self.canonical end = end if end is not None else start return None if len(self.isof[isoform]) < max(start, end) or min(start, end) < 1 \ else self.isof[isoform][start - 1:end] def isoforms(self): return list(self.isof.keys()) def has_isoform(self, isoform): return isoform in self.isof def get_region(self, residue=None, start=None, end=None, flanking=7, isoform=None): isoform = self.canonical if isoform is None else isoform if residue is None and start is None and end is None: return (1, len(self.isof[isoform]), self.isof[isoform]) if residue is not None and residue > len(self.isof[isoform]): return (None, None, None) start = start if start is not None else residue - flanking end = end if end is not None else residue + flanking start = max(start, 1) end = min(end, len(self.isof[isoform])) return (start, end, self.isof[isoform][start - 1:end])
[docs] def findall(self, fragment): """ Looks up a sequence fragment in the sequences. Yields tuples of isoform and offset. """ SeqLookup = collections.namedtuple( 'SeqLookup', ['isoform', 'offset'], ) for iso, se in iteritems(self.isof): offset = 0 while True: offset = se.find(fragment, offset) if offset == -1: break yield SeqLookup(iso, offset) offset += 1
def get_biopython(self, isoform = 1): isoform = int(isoform) if isoform not in self.isof: raise ValueError('No isoform %u available for protein `%s`.' % ( isoform, self.protein)) try: import Bio.Seq import Bio.SeqRecord srec = Bio.SeqRecord.SeqRecord( Bio.Seq.Seq(self.isof[isoform], Bio.Alphabet.ProteinAlphabet()), id = self.protein ) srec.annotations['isoform'] = isoform return srec except ImportError: sys.stdout.write('\t:: Module `Bio` (biopython)'\ 'could not be imported.\n') sys.stdout.flush() def export_fasta(self, fname = None, sequences = None): sequences = sequences or [self] fname = fname or '%s.fasta' % self.protein try: import Bio.SeqIO Bio.SeqIO.write([s.get_biopython() for s in sequences], fname, 'fasta') except ImportError: sys.stdout.write('\t:: Module `Bio` (biopython)'\ 'could not be imported.\n') sys.stdout.flush() def multiple_alignment(self, sequences, outfile = None, method = 'ClustalW', param = {}): try: import Bio.Align.Applications except ImportError: sys.stdout.write('\t:: Module `Bio` (biopython)'\ 'could not be imported.\n') sys.stdout.flush() return session = common.random_string() method = method.capitalize() infile = os.path.join('cache', '_align.%s.tmp.fasta' % session) keep_outfile = outfile is not None outfile = outfile or os.path.join('cache', '_align.%s.tmp.aln' % session) self.export_fasta(infile, sequences) if method == 'Muscle': if 'clw' not in param: param['clw'] = True param['input'] = infile param['out'] = outfile if method == 'Clustalomega': method = 'ClustalOmega' param['infile'] = infile param['outfile'] = outfile if method == 'Clustalw': param['cmd'] = 'clustalw2' param['infile'] = infile param['outfile'] = outfile app = getattr(Bio.Align.Applications, '%sCommandline' % method) cmd = app(**param) cmd() os.remove(infile) import Bio.AlignIO if method.lower() == 'clustalw' or param['clw']: aln = Bio.AlignIO.read(outfile, 'clustal') else: aln = None if not keep_outfile: os.remove(outfile) return aln