#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of the `pypath` python module
#
# Copyright 2014-2023
# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
# Authors: see the file `README.rst`
# Contact: Dénes Türei (turei.denes@gmail.com)
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# https://www.gnu.org/licenses/gpl-3.0.html
#
# Website: https://pypath.omnipathdb.org/
#
"""
Utilities for working with protein sequences.
"""
from __future__ import annotations
from future.utils import iteritems
import os
import sys
import re
import collections
import pypath.share.common as common
import pypath.inputs.pfam as pfam_input
import pypath.inputs.uniprot as uniprot_input
import pypath.inputs.uniprot_db as uniprot_db
import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.utils.taxonomy as taxonomy
[docs]
def swissprot_seq(
organism: str | int = 9606,
reviewed: bool = True,
isoforms: bool = False,
) -> dict[str, Seq]:
"""
All UniProt sequences for an organism.
Loads all sequences for an organism, optionally
for all isoforms, by default only first isoform.
Args:
organism:
Name or NCBI Taxonomy ID of the organism.
reviewed:
Load only reviewed (SwissProt) sequences.
isoforms:
Load all isoforms, not only the first one.
Returns:
A dict with UniProt IDs as keys and `Seq` objects
as values.
"""
data = uniprot_input.uniprot_data(
fields = 'sequence',
organism = organism,
reviewed = reviewed or None,
)
result = {u: Seq(u, s) for u, s in data.items()}
if isoforms:
data = get_isoforms(organism = organism)
for unip, isoforms in iteritems(data):
for isof, seq in iteritems(isoforms):
if unip in result:
result[unip].add_seq(seq, isof)
return result
[docs]
def read_fasta(fasta):
"""
Parses a fasta file.
Returns dict with headers as keys and sequences as values.
"""
result = {}
fasta = re.split(r'\n>', fasta)
for section in fasta:
section = section.strip().split('\n')
label = section.pop(0)
seq = ''.join(section)
result[label] = seq
return result
[docs]
class Resource(object):
[docs]
def __init__(self, loader, name = None):
"""
Represents a resource of sequence features,
e.g. domains, motifs or post-translational
modification sites.
"""
self.loader = loader
self.db = {}
self.name = name
[docs]
def load(self, ncbi_tax_id = 9606):
"""
Loads the data from the resource for a given organism.
"""
if ncbi_tax_id not in self.db:
self.db[ncbi_tax_id] = list(
self.processor(self.loader(ncbi_tax_id = ncbi_tax_id))
)
[docs]
def unload(self, ncbi_tax_id = None):
"""
Removes data in order to free up memory.
"""
if ncbi_tax_id in self.db:
del self.db[ncbi_tax_id]
elif ncbi_tax_id is None:
self.db = {}
[docs]
def processor(self, raw):
"""
Preprocesses the features loaded from a resource.
"""
for feature in raw:
yield feature
[docs]
def iterprotein(self, uniprot, ncbi_tax_id = 9606):
"""
Iterates over the features of one protein.
"""
self.load(ncbi_tax_id)
if uniprot in self.db[ncbi_tax_id]:
for feature in self.db[ncbi_tax_id][uniprot]:
yield feature
[docs]
def iterdb(self, ncbi_tax_id = 9606):
"""
Iterates over all proteins and features of one organism.
"""
self.load(ncbi_tax_id)
for uniprot in self.db[ncbi_tax_id]:
for feature in self.iterprotein(uniprot, ncbi_tax_id):
yield (uniprot, feature)
[docs]
class Pfam(Resource):
[docs]
def __init__(self):
"""
Provides Pfam domains as sequence features.
"""
Resource.__init__(self, loader = None, name = 'Pfam')
def loader(ncbi_tax_id = 9606):
all_up = uniprot_db.all_uniprots(organism = ncbi_tax_id)
return (
pfam_input.pfam_regions(
uniprots = all_up,
value = 'uniprot',
keepfile = True,
)
)
self.loader = loader
[docs]
class Seq(object):
[docs]
def __init__(self, protein, sequence, isoform=1):
"""
This class is to look up or match
residues and regions in sequences of
proteins, by default in the canonical
sequence, and optionally in other isoforms.
"""
self.isof = {}
self.protein = protein
self.canonical = isoform
self.add_seq(sequence, isoform)
def add_seq(self, sequence, isoform):
self.isof[isoform] = sequence
def match(self, pattern, start, end=None, isoform=None):
instance = self.get(start, end, isoform)
pattern = pattern.upper()
return instance == pattern
def get(self, start, end=None, isoform=None):
isoform = isoform if isoform is not None else self.canonical
end = end if end is not None else start
return None if len(self.isof[isoform]) < max(start, end) or min(start, end) < 1 \
else self.isof[isoform][start - 1:end]
def isoforms(self):
return list(self.isof.keys())
def has_isoform(self, isoform):
return isoform in self.isof
def get_region(self,
residue=None,
start=None,
end=None,
flanking=7,
isoform=None):
isoform = self.canonical if isoform is None else isoform
if residue is None and start is None and end is None:
return (1, len(self.isof[isoform]), self.isof[isoform])
if residue is not None and residue > len(self.isof[isoform]):
return (None, None, None)
start = start if start is not None else residue - flanking
end = end if end is not None else residue + flanking
start = max(start, 1)
end = min(end, len(self.isof[isoform]))
return (start, end, self.isof[isoform][start - 1:end])
[docs]
def findall(self, fragment):
"""
Looks up a sequence fragment in the sequences.
Yields tuples of isoform and offset.
"""
SeqLookup = collections.namedtuple(
'SeqLookup',
['isoform', 'offset'],
)
for iso, se in iteritems(self.isof):
offset = 0
while True:
offset = se.find(fragment, offset)
if offset == -1:
break
yield SeqLookup(iso, offset)
offset += 1
def get_biopython(self, isoform = 1):
isoform = int(isoform)
if isoform not in self.isof:
raise ValueError('No isoform %u available for protein `%s`.' % (
isoform, self.protein))
try:
import Bio.Seq
import Bio.SeqRecord
srec = Bio.SeqRecord.SeqRecord(
Bio.Seq.Seq(self.isof[isoform],
Bio.Alphabet.ProteinAlphabet()),
id = self.protein
)
srec.annotations['isoform'] = isoform
return srec
except ImportError:
sys.stdout.write('\t:: Module `Bio` (biopython)'\
'could not be imported.\n')
sys.stdout.flush()
def export_fasta(self, fname = None, sequences = None):
sequences = sequences or [self]
fname = fname or '%s.fasta' % self.protein
try:
import Bio.SeqIO
Bio.SeqIO.write([s.get_biopython() for s in sequences],
fname, 'fasta')
except ImportError:
sys.stdout.write('\t:: Module `Bio` (biopython)'\
'could not be imported.\n')
sys.stdout.flush()
def multiple_alignment(self, sequences, outfile = None,
method = 'ClustalW', param = {}):
try:
import Bio.Align.Applications
except ImportError:
sys.stdout.write('\t:: Module `Bio` (biopython)'\
'could not be imported.\n')
sys.stdout.flush()
return
session = common.random_string()
method = method.capitalize()
infile = os.path.join('cache', '_align.%s.tmp.fasta' % session)
keep_outfile = outfile is not None
outfile = outfile or os.path.join('cache',
'_align.%s.tmp.aln' % session)
self.export_fasta(infile, sequences)
if method == 'Muscle':
if 'clw' not in param:
param['clw'] = True
param['input'] = infile
param['out'] = outfile
if method == 'Clustalomega':
method = 'ClustalOmega'
param['infile'] = infile
param['outfile'] = outfile
if method == 'Clustalw':
param['cmd'] = 'clustalw2'
param['infile'] = infile
param['outfile'] = outfile
app = getattr(Bio.Align.Applications, '%sCommandline' % method)
cmd = app(**param)
cmd()
os.remove(infile)
import Bio.AlignIO
if method.lower() == 'clustalw' or param['clw']:
aln = Bio.AlignIO.read(outfile, 'clustal')
else:
aln = None
if not keep_outfile:
os.remove(outfile)
return aln