#!/usr/bin/env python# -*- coding: utf-8 -*-## This file is part of the `pypath` python module## Copyright 2014-2023# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University## Authors: see the file `README.rst`# Contact: Dénes Türei (turei.denes@gmail.com)## Distributed under the GPLv3 License.# See accompanying file LICENSE.txt or copy at# https://www.gnu.org/licenses/gpl-3.0.html## Website: https://pypath.omnipathdb.org/#fromfuture.utilsimportiteritemsimportsysimportreimportcollectionsimportitertoolsimportbs4importcsvimportpypath.inputs.commonasinputs_commonimportpypath.share.progressasprogressimportpypath.utils.taxonomyastaxonomyimportpypath.utils.mappingasmappingimportpypath.internals.interaasinteraimportpypath.share.curlascurlimportpypath.resources.urlsasurlsimportpypath.share.commonascommon
def signor_interactions(organism=9606, raw_records=False, expand_families=0):
    """
    Downloads the full dataset from SIGNOR (https://signor.uniroma2.it/).

    Returns the records with the most important fields. If ``raw_records``
    is `True`, the rows are returned as parsed by ``csv.DictReader``, with
    unchanged content.

    Args
        organism (int, str): The NCBI Taxonomy ID or name of the organism.
            Human (9606), mouse (10090) and rat (10116) are available.
            Integer IDs are translated to names via ``taxonomy.taxids``.
        raw_records (bool): Process the records or return them raw, as
            they are.
        expand_families (int): A protein family entity is replaced by its
            members only if it has at most this many members. Interactions
            involving a larger family yield no records.

    Return
        list: A list of ``SignorInteraction`` named tuples, or a list of
            dicts (one per row, keyed by column name) if ``raw_records``
            is True. An empty list if the organism is unknown or not
            available in SIGNOR.
    """

    # Validate the organism first, so that an unsupported organism does
    # not trigger any download.
    if isinstance(organism, int):
        if organism in taxonomy.taxids:
            _organism = taxonomy.taxids[organism]
        else:
            sys.stdout.write('\t:: Unknown organism: `%u`.\n' % organism)
            return []
    else:
        _organism = organism

    if _organism not in {'human', 'rat', 'mouse'}:
        return []

    url = urls.urls['signor']['all_url_new']
    binary_data = [
        (b'organism', _organism.encode('utf-8')),
        (b'format', b'csv'),
        (b'submit', b'Download'),
    ]

    c = curl.Curl(
        url,
        silent=False,
        large=True,
        follow=True,
        timeout=180,
        binary_data=binary_data,
        return_headers=True,
    )

    reader = csv.DictReader(c.result, delimiter='\t')

    if raw_records:
        # Raw output needs neither the family nor the complex lookups
        return list(reader)

    SignorInteraction = collections.namedtuple(
        'SignorInteraction',
        (
            'source',
            'target',
            'source_isoform',
            'target_isoform',
            'source_type',
            'target_type',
            'effect',
            'mechanism',
            'ncbi_tax_id',
            'pubmeds',
            'direct',
            'ptm_type',
            'ptm_residue',
            'ptm_motif',
        ),
    )

    families = signor_protein_families(organism=organism)
    complexes = signor_complexes(organism=organism)

    # One SIGNOR complex ID may correspond to several Complex objects
    complexes_by_id = collections.defaultdict(set)

    for cplex in complexes.values():
        for cplex_id in cplex.ids['SIGNOR']:
            complexes_by_id[cplex_id].add(cplex)

    def process_name(name):
        """
        Translates one SIGNOR entity ID to an iterable of interaction
        partners and an isoform (None for families and complexes).
        """

        isoform = None

        if name in families:
            members = families[name]
            # Families above the size limit give an empty tuple, hence
            # the interaction produces no records
            main = members if len(members) <= expand_families else ()
        elif name in complexes_by_id:
            main = complexes_by_id[name]
        else:
            main, isoform = inputs_common._try_isoform(name)
            main = (main,)

        return main, isoform

    result = []

    for line in reader:
        sources, source_isoform = process_name(line['IDA'])
        targets, target_isoform = process_name(line['IDB'])

        # One record for each combination of source and target entities
        for source, target in itertools.product(sources, targets):
            result.append(
                SignorInteraction(
                    source=source,
                    target=target,
                    source_isoform=source_isoform,
                    target_isoform=target_isoform,
                    source_type=line['TYPEA'],
                    target_type=line['TYPEB'],
                    effect=line['EFFECT'],
                    mechanism=line['MECHANISM'],
                    ncbi_tax_id=line['TAX_ID'],
                    pubmeds=line['PMID'],
                    direct=line['DIRECT'] == 'YES',
                    # `ptm_type` is read from the same column
                    # as `mechanism`
                    ptm_type=line['MECHANISM'],
                    ptm_residue=line['RESIDUE'],
                    ptm_motif=line['SEQUENCE'],
                )
            )

    return result
def signor_enzyme_substrate(organism=9606):
    """
    Loads and processes enzyme-substrate (PTM) interactions from SIGNOR.

    Only interactions whose residue field starts with a three letter
    amino acid code followed by a residue number are included.

    Args
        organism (int, str): Passed to ``signor_interactions``.

    Return
        list: A list of dicts, one for each enzyme-substrate interaction.
            The keys ``start`` and ``end`` are the residue number minus
            and plus 7, without clipping to the sequence boundaries.

    Raises
        KeyError: If the three letter code is not in ``common.aaletters``.
    """

    reres = re.compile(r'([A-Za-z]{3})([0-9]+)')
    # Three letter codes normalized to capitalized form, e.g. `Ser`
    aalet = dict(
        (k.lower().capitalize(), v)
        for k, v in iteritems(common.aaletters)
    )
    result = []

    for d in signor_interactions(organism=organism):
        # A missing field is treated the same as an empty one
        resm = reres.match(d.ptm_residue or '')

        if resm is None:
            continue

        aa_code, aa_pos = resm.groups()
        aa = aalet[aa_code.capitalize()]
        aanum = int(aa_pos)
        inst = (d.ptm_motif or '').upper()

        result.append({
            'typ': d.ptm_type,
            'resnum': aanum,
            'instance': inst,
            'substrate': d.target,
            'start': aanum - 7,
            'end': aanum + 7,
            'kinase': d.source,
            'resaa': aa,
            'motif': inst,
            'enzyme_isoform': d.source_isoform,
            'substrate_isoform': d.target_isoform,
            # `Other` in the reference field means no literature reference
            'references': {d.pubmeds} if d.pubmeds != 'Other' else set(),
        })

    return result
def signor_pathways(**kwargs):
    """
    Obtains pathway annotations from Signor.

    The list of pathways is read from the ``pathway_list`` drop-down of
    the SIGNOR download page, then the interactions of each pathway are
    downloaded one by one.

    Args
        kwargs: Ignored.

    Return
        tuple: Two dicts, both with the full names of the pathways as
            keys. The values of the first are sets of UniProt IDs, of the
            second are sets of tuples of UniProt ID pairs.
    """

    # Indices of the columns passed to the ID translation.
    # NOTE(review): presumably the IDs of entity A and B -- confirm
    # against the SIGNOR pathway table header.
    col_a = 4
    col_b = 8
    min_fields = max(col_a, col_b) + 1
    sep = '@#@#@'

    url = urls.urls['signor']['list_url']
    baseurl = urls.urls['signor']['all_url_new']

    proteins_pathways = {}
    interactions_pathways = {}

    c = curl.Curl(url, silent=True)
    soup = bs4.BeautifulSoup(c.result, 'html.parser')

    pathway_names = [
        (opt['value'], opt.text)
        for opt in soup.find(
            'select',
            {'name': 'pathway_list'},
        ).findAll('option')
    ]

    prg = progress.Progress(
        len(pathway_names),
        'Downloading data from Signor',
        1,
        percent=False,
    )

    for short, full in pathway_names:
        prg.step()

        # Options with empty value do not correspond to a pathway
        if not short:
            continue

        binary_data = [
            (b'pathway_list', short.encode('ascii')),
            (b'submit', b'Download'),
        ]

        c_pw = curl.Curl(
            baseurl,
            silent=True,
            binary_data=binary_data,
            encoding='utf-8',
        )

        # The first line is skipped; the tab separator is replaced by
        # a unique string before splitting to fields
        lines = inputs_common.csv_sep_change(
            c_pw.result,
            '\t',
            sep,
        ).split('\n')[1:]

        # Keep only rows long enough to contain both ID columns,
        # shorter rows would raise IndexError below
        data = [
            fields
            for fields in (line.strip().split(sep) for line in lines)
            if len(fields) >= min_fields
        ]

        proteins_pathways[full] = set()
        interactions_pathways[full] = set()

        for row in data:
            for uniprot1, uniprot2 in itertools.product(
                mapping.map_name(row[col_a], 'uniprot', 'uniprot'),
                mapping.map_name(row[col_b], 'uniprot', 'uniprot'),
            ):
                proteins_pathways[full].add(uniprot1)
                proteins_pathways[full].add(uniprot2)
                interactions_pathways[full].add((uniprot1, uniprot2))

    prg.terminate()

    return proteins_pathways, interactions_pathways
def signor_protein_families(organism=9606):
    """
    Retrieves the protein family definitions from SIGNOR.

    Args
        organism: Currently ignored.

    Return
        dict: The first field of each row as key, and the list of the
            comma separated items of the third field as value.
    """

    # TODO: implement organism

    download = curl.Curl(
        urls.urls['signor']['complexes'],
        binary_data=[(b'submit', b'Download protein family data')],
        large=True,
    )

    # The first line is discarded
    next(download.result)

    split_rows = (row.split(';') for row in download.result)

    return {
        fields[0]: [
            member.strip('\n\r" ')
            for member in fields[2].split(',')
        ]
        for fields in split_rows
    }
def signor_complexes(organism=9606):
    """
    Retrieves the protein complex definitions from SIGNOR.

    Protein families among the components are expanded: one complex is
    created for each combination of the family members. Complexes having
    other complexes as components (IDs starting with ``SIGNOR-C``) are
    put aside and resolved in repeated passes, until the number of the
    unresolved ones does not change any more. Entries still unresolved
    at that point are not included in the result.

    Args
        organism: Passed to ``signor_protein_families``, otherwise
            ignored.

    Return
        dict: ``intera.Complex`` objects as values, their string
            representations as keys.
    """

    # TODO: implement organism

    def register(name, members, signor_id):
        # Creates a complex and adds it to both indexes
        cplex = intera.Complex(
            name=name.replace('"', '').strip(),
            components=members,
            sources='SIGNOR',
            ids=signor_id,
        )
        complexes[cplex.__str__()] = cplex
        complexes_by_id[signor_id].add(cplex)

    def has_nested(members):
        # True if any member is itself a SIGNOR complex ID
        return any(member.startswith('SIGNOR-C') for member in members)

    def resolve_pending(pending):
        # One pass over the complexes on hold; returns those which
        # still contain unresolved complex IDs
        still_pending = []

        for name, members, signor_id in pending:
            # For each member, the alternative component sets: the ones
            # of the already known complexes, or the member itself
            alternatives = [
                (
                    [known.components for known in complexes_by_id[member]]
                    if member in complexes_by_id else
                    ((member,),)
                )
                for member in members
            ]

            for combination in itertools.product(*alternatives):
                flat = list(itertools.chain(*combination))

                if has_nested(flat):
                    still_pending.append((name, flat, signor_id))
                else:
                    register(name, flat, signor_id)

        return still_pending

    complexes = {}
    pending = []

    families = signor_protein_families(organism=organism)

    download = curl.Curl(
        urls.urls['signor']['complexes'],
        binary_data=[(b'submit', b'Download complex data')],
        large=True,
    )

    # The first line is discarded
    next(download.result)

    complexes_by_id = collections.defaultdict(set)

    for row in download.result:
        fields = row.split(';')
        members = [item.strip('\n\r" ') for item in fields[2].split(',')]
        # Families are replaced by the list of their members
        expanded = [families.get(member, [member]) for member in members]

        for combination in itertools.product(*expanded):
            if has_nested(combination):
                # some complexes contain other complexes
                pending.append((fields[1], combination, fields[0]))
            else:
                register(fields[1], combination, fields[0])

    # complexes are defined recursively
    while True:
        n_before = len(pending)
        pending = resolve_pending(pending)

        if len(pending) == n_before:
            break

    return complexes