#!/usr/bin/env python# -*- coding: utf-8 -*-## This file is part of the `pypath` python module## Copyright 2014-2023# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University## Authors: see the file `README.rst`# Contact: Dénes Türei (turei.denes@gmail.com)## Distributed under the GPLv3 License.# See accompanying file LICENSE.txt or copy at# https://www.gnu.org/licenses/gpl-3.0.html## Website: https://pypath.omnipathdb.org/#"""Utilities for working with protein sequences."""from__future__importannotationsfromfuture.utilsimportiteritemsimportosimportsysimportreimportcollectionsimportpypath.share.commonascommonimportpypath.inputs.pfamaspfam_inputimportpypath.inputs.uniprotasuniprot_inputimportpypath.inputs.uniprot_dbasuniprot_dbimportpypath.resources.urlsasurlsimportpypath.share.curlascurlimportpypath.utils.taxonomyastaxonomy
def swissprot_seq(
        organism: str | int = 9606,
        reviewed: bool = True,
        isoforms: bool = False,
    ) -> dict[str, Seq]:
    """
    Retrieve the UniProt sequences of one organism.

    The first isoform of each protein is always loaded; further isoforms
    are added only on request.

    Args:
        organism: Name or NCBI Taxonomy ID of the organism.
        reviewed: Load only reviewed (SwissProt) sequences.
        isoforms: Load all isoforms, not only the first one.

    Returns:
        A dict with UniProt IDs as keys and `Seq` objects as values.
    """

    sequences = uniprot_input.uniprot_data(
        fields = 'sequence',
        organism = organism,
        # `False` is passed on as `None`
        reviewed = reviewed or None,
    )

    result = {}

    for uniprot, sequence in sequences.items():

        result[uniprot] = Seq(uniprot, sequence)

    if not isoforms:

        return result

    for uniprot, by_isoform in get_isoforms(organism = organism).items():

        # isoforms of proteins without a sequence loaded above are ignored
        if uniprot not in result:

            continue

        for isoform_id, sequence in by_isoform.items():

            result[uniprot].add_seq(sequence, isoform_id)

    return result
def get_isoforms(organism=9606):
    """
    Retrieve the sequences of all UniProt isoforms of one organism.

    Args:
        organism: Key of `taxonomy.phosphoelm_taxids` (translated to the
            corresponding value), or directly the organism name as it
            appears in the `OS=` field of the fasta headers.

    Returns:
        Dict of dicts: UniProt accessions as top level keys, isoform
        numbers (int) as second level keys and sequences as values.
    """

    if organism in taxonomy.phosphoelm_taxids:

        organism = taxonomy.phosphoelm_taxids[organism]

    # captures the two-word organism name from the header
    reorg = re.compile(r'OS=([A-Z][a-z]+\s[a-z]+)')

    c = curl.Curl(urls.urls['unip_iso']['url'], silent = False)
    records = read_fasta(c.result)

    result = {}

    for header, seq in records.items():

        organism_match = reorg.search(header)

        if organism_match is None or organism_match.group(1) != organism:

            continue

        # second field of the header is `<accession>-<isoform number>`
        accession_parts = header.split('|')[1].split('-')
        unip = accession_parts[0]
        isof = int(accession_parts[1])

        result.setdefault(unip, {})[isof] = seq

    return result
def read_fasta(fasta):
    """
    Parses the contents of a fasta file.

    Args:
        fasta: The contents of a fasta file as one string.

    Returns:
        Dict with the header lines (without the leading `>`) as keys and
        the sequences, joined into a single string, as values. Empty
        input results in an empty dict.
    """

    result = {}

    fasta = fasta.strip()

    # the `>` of the first record is not preceded by a newline, hence
    # the split below would leave it in the label
    if fasta.startswith('>'):

        fasta = fasta[1:]

    for section in re.split(r'\n>', fasta):

        # `splitlines` copes with both unix and windows line endings
        lines = section.strip().splitlines()

        if not lines:

            continue

        label = lines[0].strip()
        result[label] = ''.join(line.strip() for line in lines[1:])

    return result
def __init__(self, loader, name=None):
    """
    A resource of sequence features, e.g. domains, motifs or
    post-translational modification sites.

    Args:
        loader: Callable providing the raw data; `load` calls it with
            the `ncbi_tax_id` keyword argument.
        name: Optional label of the resource.
    """

    self.name = name
    self.loader = loader
    # loaded data, filled by `load` with organism IDs as keys
    self.db = {}
def load(self, ncbi_tax_id=9606):
    """
    Makes sure the data of one organism is available in `self.db`.

    Nothing happens if the organism is already loaded. Otherwise the
    loader is called, its output is passed through `processor` and the
    result is stored as a list.

    Args:
        ncbi_tax_id: Organism identifier, used as the key in `self.db`
            and handed to the loader as a keyword argument.
    """

    if ncbi_tax_id in self.db:

        return

    raw = self.loader(ncbi_tax_id = ncbi_tax_id)
    self.db[ncbi_tax_id] = list(self.processor(raw))
def unload(self, ncbi_tax_id=None):
    """
    Drops loaded data in order to free up memory.

    Args:
        ncbi_tax_id: Organism to remove from `self.db`. If it is not a
            key of `self.db` and is `None`, the whole database is
            emptied; any other unknown value leaves the data untouched.
    """

    if ncbi_tax_id in self.db:

        del self.db[ncbi_tax_id]
        return

    if ncbi_tax_id is None:

        self.db = {}
def processor(self, raw):
    """
    Preprocessing hook for the features loaded from a resource.

    This implementation passes every feature on unchanged.

    Args:
        raw: Iterable of features.

    Yields:
        The items of `raw`, in their original order.
    """

    yield from raw
def iterprotein(self, uniprot, ncbi_tax_id=9606):
    """
    Iterates over the features of one protein.

    Args:
        uniprot: Identifier of the protein, looked up in the data of
            the organism.
        ncbi_tax_id: Organism identifier; its data is loaded first if
            necessary.

    Yields:
        The features stored for the protein; nothing if the protein is
        not in the data of the organism.
    """

    self.load(ncbi_tax_id)

    # NOTE(review): `load` in this file stores a list for each organism,
    # while here a lookup by protein is done -- presumably `self.db` is
    # expected to hold mappings; confirm against the subclasses.
    by_protein = self.db[ncbi_tax_id]

    if uniprot not in by_protein:

        return

    yield from by_protein[uniprot]
def iterdb(self, ncbi_tax_id=9606):
    """
    Iterates over all proteins and features of one organism.

    Args:
        ncbi_tax_id: Organism identifier; its data is loaded first if
            necessary.

    Yields:
        Tuples of protein identifier and feature, the features of each
        protein as provided by `iterprotein`.
    """

    self.load(ncbi_tax_id)

    for uniprot in self.db[ncbi_tax_id]:

        features = self.iterprotein(uniprot, ncbi_tax_id)

        yield from ((uniprot, feature) for feature in features)
def __init__(self, protein, sequence, isoform=1):
    """
    Looks up or matches residues and regions in the sequences of one
    protein, by default in the canonical sequence, optionally in other
    isoforms.

    Args:
        protein: Identifier of the protein.
        sequence: The sequence registered at instantiation.
        isoform: Isoform number of `sequence`; this isoform is recorded
            as the canonical one.
    """

    self.protein = protein
    self.canonical = isoform
    # the sequences by isoform; must exist before `add_seq` is called
    self.isof = {}
    self.add_seq(sequence, isoform)
def findall(self, fragment):
    """
    Searches all occurrences of a sequence fragment in every isoform.

    Args:
        fragment: The substring to look for.

    Yields:
        `SeqLookup` named tuples of isoform and zero based offset.
        Overlapping occurrences are all reported, as the search is
        resumed one position after each hit.
    """

    SeqLookup = collections.namedtuple(
        'SeqLookup',
        ['isoform', 'offset'],
    )

    for isoform, sequence in self.isof.items():

        position = sequence.find(fragment, 0)

        while position != -1:

            yield SeqLookup(isoform, position)
            position = sequence.find(fragment, position + 1)
def get_biopython(self, isoform=1):
    """
    Creates a Biopython sequence record from one isoform.

    Args:
        isoform: Number of the isoform; converted to `int`.

    Returns:
        A `Bio.SeqRecord.SeqRecord` with the protein as its `id` and
        the isoform number in its annotations. `None` if Biopython
        can not be imported.

    Raises:
        ValueError: If the isoform is not available for this protein.
    """

    isoform = int(isoform)

    if isoform not in self.isof:

        raise ValueError(
            'No isoform %u available for protein `%s`.' % (
                isoform,
                self.protein,
            )
        )

    try:

        import Bio.Seq
        import Bio.SeqRecord

    except ImportError:

        sys.stdout.write(
            '\t:: Module `Bio` (biopython) could not be imported.\n'
        )
        sys.stdout.flush()
        return None

    try:

        # `Bio.Alphabet` exists only in Biopython < 1.78, later versions
        # raise ImportError upon importing it
        import Bio.Alphabet
        seq = Bio.Seq.Seq(
            self.isof[isoform],
            Bio.Alphabet.ProteinAlphabet(),
        )

    except ImportError:

        seq = Bio.Seq.Seq(self.isof[isoform])

    srec = Bio.SeqRecord.SeqRecord(seq, id = self.protein)
    srec.annotations['isoform'] = isoform
    # the replacement of alphabets in recent Biopython versions
    srec.annotations['molecule_type'] = 'protein'

    return srec


def export_fasta(self, fname=None, sequences=None):
    """
    Writes sequences into a fasta file by Biopython.

    Only a message is printed if Biopython can not be imported.

    Args:
        fname: Path of the output file; by default the protein
            identifier with `.fasta` extension.
        sequences: Objects with a `get_biopython` method; by default
            only this object.
    """

    sequences = sequences or [self]
    fname = fname or '%s.fasta' % self.protein

    try:

        import Bio.SeqIO

    except ImportError:

        sys.stdout.write(
            '\t:: Module `Bio` (biopython) could not be imported.\n'
        )
        sys.stdout.flush()
        return

    Bio.SeqIO.write(
        [s.get_biopython() for s in sequences],
        fname,
        'fasta',
    )


def multiple_alignment(
        self,
        sequences,
        outfile=None,
        method='ClustalW',
        param=None,
    ):
    """
    Runs a multiple sequence alignment by a command line tool wrapped
    in `Bio.Align.Applications`.

    The sequences are exported into a temporary fasta file in the
    `cache` directory, which is removed after running the tool.

    Args:
        sequences: Objects with a `get_biopython` method, see
            `export_fasta`.
        outfile: Path of the alignment output. If not provided, a
            temporary file is used and removed afterwards.
        method: Name of the tool, case insensitive. The input and
            output parameters are set up for `Muscle`, `ClustalOmega`
            and `ClustalW`.
        param: Dict of further parameters for the command line wrapper.
            It is copied, the dict of the caller is never modified.

    Returns:
        The alignment read by `Bio.AlignIO` if the output is in clustal
        format (ClustalW, or the `clw` parameter is true), otherwise
        `None`. Also `None` if Biopython can not be imported.
    """

    try:

        import Bio.Align.Applications

    except ImportError:

        sys.stdout.write(
            '\t:: Module `Bio` (biopython) could not be imported.\n'
        )
        sys.stdout.flush()
        return

    # a private copy: the keys added below must not leak into the
    # subsequent calls or into the dict of the caller
    param = dict(param) if param else {}

    session = common.random_string()
    method = method.capitalize()
    infile = os.path.join('cache', '_align.%s.tmp.fasta' % session)
    keep_outfile = outfile is not None
    outfile = outfile or os.path.join(
        'cache',
        '_align.%s.tmp.aln' % session,
    )

    self.export_fasta(infile, sequences)

    if method == 'Muscle':

        param.setdefault('clw', True)
        param['input'] = infile
        param['out'] = outfile

    elif method == 'Clustalomega':

        # restore the capitalization used in the name of the wrapper
        method = 'ClustalOmega'
        param['infile'] = infile
        param['outfile'] = outfile

    elif method == 'Clustalw':

        param['cmd'] = 'clustalw2'
        param['infile'] = infile
        param['outfile'] = outfile

    app = getattr(Bio.Align.Applications, '%sCommandline' % method)
    cmd = app(**param)

    try:

        cmd()

    finally:

        # the temporary input is removed even if the tool failed
        os.remove(infile)

    import Bio.AlignIO

    # `clw` is set only for Muscle, hence `get`
    if method.lower() == 'clustalw' or param.get('clw'):

        aln = Bio.AlignIO.read(outfile, 'clustal')

    else:

        aln = None

    if not keep_outfile:

        os.remove(outfile)

    return aln