#!/usr/bin/env python# -*- coding: utf-8 -*-## This file is part of the `pypath` python module## Copyright 2014-2023# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University## Authors: see the file `README.rst`# Contact: Dénes Türei (turei.denes@gmail.com)## Distributed under the GPLv3 License.# See accompanying file LICENSE.txt or copy at# https://www.gnu.org/licenses/gpl-3.0.html## Website: https://pypath.omnipathdb.org/#fromfuture.utilsimportiteritemsimportosimportcollectionsimportbs4importpandasaspdimportpypath.resources.urlsasurlsimportpypath.share.curlascurlimportpypath.share.progressasprogressimportpypath.share.cacheascache_modimportpypath.share.sessionassession_logger=session.Logger(name='pepcyber_input')_log=_logger._log
def pepcyber_interactions(dataframe=False, cache=True):
    """
    Downloads phosphoprotein binding protein interactions from the PepCyber
    database (http://www.pepcyber.org/).

    Args
        dataframe (bool): Return a pandas data frame instead of list of
            tuples.
        cache (bool): Read the data from an intermediate cache file,
            if available.

    Returns
        A list of `PepcyberInteraction` named tuples, or, if `dataframe`
        is True, a `pandas.DataFrame` with the same fields as columns.
        One record is created for each reference of each interaction;
        an interaction without references yields a single record with
        `evidence` and `pmid` set to None.

    Note
        Whenever the data is downloaded (no cache file exists or `cache`
        is False), the processed table is written to the cache file,
        replacing any previous one.
    """

    PepcyberInteraction = collections.namedtuple(
        'PepcyberInteraction',
        (
            'ppdb_class',
            'ppdb_genesymbol',
            'substrate_genesymbol',
            'binding_seq',
            'binding_pos',
            'all_evidences',
            'n_records',
            'category',
            'substrate_residue',
            'ppdb_uniprot',
            'ppdb_refseq',
            'substrate_uniprot',
            'substrate_refseq',
            'evidence',
            'pmid',
        ),
    )

    def get_cells(row):
        """
        Returns the cells of a table row if it has exactly 10 cells and
        the 5th one contains a `span` of class `sequence`; otherwise None.
        """

        cells = row.find_all('td')

        if len(cells) != 10:
            return None

        sp = cells[4].find('span')

        if (
            sp is not None
            and 'class' in sp.attrs
            and 'sequence' in sp.attrs['class']
        ):
            return cells

        return None

    cachefile = os.path.join(
        cache_mod.get_cachedir(),
        'pepcyber_details.tsv',
    )

    if cache and os.path.exists(cachefile):

        _log('Reading data from cache file `%s`.' % cachefile)
        tbl = pd.read_csv(cachefile, sep='\t', dtype={'pmid': 'string'})
        # missing values come back from the file as NaN/NA: map them to None
        result = [
            PepcyberInteraction(
                *(f if pd.notna(f) else None for f in row)
            )
            for row in tbl.itertuples(index=False)
        ]

    else:

        _log('Downloading PepCyber data.')
        url = urls.urls['pepcyber']['rescued']
        # this is huge, takes a few minutes!
        c = curl.Curl(
            url,
            silent=False,
            timeout=600,
            encoding='iso-8859-1',
        )
        data = c.result
        soup = bs4.BeautifulSoup(data, 'html.parser')
        rows = soup.find_all('tr')
        result = []
        prg = progress.Progress(
            len(rows),
            'Retrieving and processing PepCyber data',
            7,
        )

        for row in rows:

            prg.step()
            cells = get_cells(row)

            if cells is None:
                continue

            # `get_cells` guarantees exactly 10 cells here
            row_txt = [cell.text.strip() for cell in cells]

            # the binding position must be a number
            if not row_txt[5].isdigit():
                continue

            # the name of the first anchor in the row is the numeric
            # identifier used to query the details page
            inum = int(row.find('a')['name'])

            # the substrate residue is the character following the first
            # 'p' in the binding sequence; None if there is no 'p' or
            # nothing follows it (the latter used to raise IndexError)
            row_txt[9] = row_txt[4].partition('p')[2][:1] or None

            details = pepcyber_details(inum)
            # UniProt and RefSeq IDs of the binding protein and the substrate
            row_txt.extend(details.get(row_txt[2], [None, None]))
            row_txt.extend(details.get(row_txt[3], [None, None]))

            # the first cell is dropped; the numeric fields are converted
            # once per row instead of once per reference
            record = row_txt[1:]
            record[4] = int(record[4])
            record[6] = int(record[6])

            # only the evidence and the reference are taken from each
            # reference; a placeholder keeps interactions without
            # references in the output
            refs = details['_refs'] or [(None,) * 3]

            result.extend(
                PepcyberInteraction(*(record + list(ref[1:])))
                for ref in refs
            )

        tbl = pd.DataFrame.from_records(
            result,
            columns=PepcyberInteraction._fields,
        )
        _log('Saving data to `%s`.' % cachefile)
        tbl.to_csv(cachefile, sep='\t', index=False)

    return tbl if dataframe else result


def pepcyber_details(num):
    """
    Retrieves detailed information about an interaction from the PepCyber
    database.

    Args
        num: Identifier of the interaction; it is substituted into the
            URL template of the details page.

    Returns
        Dict with gene symbols as keys and lists of length 2 as values, with
        UniProt ID and RefSeq protein ID (the latter is None if no RefSeq
        cell was found for the protein). A special key `_refs` holds a
        list of `PepcyberReference` named tuples, each with category,
        evidence type and PubMed reference information. If the download
        yields no data, only the empty `_refs` list is returned.
    """

    PepcyberReference = collections.namedtuple(
        'PepcyberReference',
        ('category', 'evidence', 'reference'),
    )

    result = {'_refs': []}

    url = urls.urls['pepcyber']['details_rescued'] % num
    c = curl.Curl(url, encoding='iso-8859-1')
    data = c.result

    if not data:
        return result

    soup = bs4.BeautifulSoup(data, 'html.parser')

    gname = None
    # initialised so that a `SwissProt` cell arriving before any `RefSeq`
    # cell does not raise UnboundLocalError
    refseq = None
    prev = ''

    # each value cell is recognised by the text of the cell preceding it
    for td in soup.find_all('td'):

        text = td.text.strip()

        if prev.startswith('Gene name'):
            # anything from the first parenthesis on is dropped
            gname = text.split('(')[0]

        if prev.startswith('RefSeq'):
            refseq = text

        if prev.startswith('SwissProt') and gname is not None:

            # NOTE(review): names starting with U+00CE are skipped,
            # presumably mis-decoded symbols -- confirm
            if gname and gname[0] != u'\xce':
                result[gname] = [text, refseq]

            # start afresh for the next protein, so that a RefSeq ID is
            # never carried over to a protein that has none
            gname = None
            refseq = None

        prev = text

    records_label = soup.find(text='Records:')

    if records_label:

        # the table of references is in the element following the
        # great-grandparent of the label; its first row is skipped
        refs = (
            records_label
            .parent.parent.parent
            .next_sibling
            .find('table')
            .find_all('tr')
        )[1:]

        result['_refs'] = [
            PepcyberReference(
                *(
                    cell.a.a.text if cell.find('a') else cell.text
                    for cell in tr.find_all('td')
                )
            )
            for tr in refs
        ]

    return result