#!/usr/bin/env python# -*- coding: utf-8 -*-## This file is part of the `pypath` python module## Copyright 2014-2023# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University## Authors: see the file `README.rst`# Contact: Dénes Türei (turei.denes@gmail.com)## Distributed under the GPLv3 License.# See accompanying file LICENSE.txt or copy at# https://www.gnu.org/licenses/gpl-3.0.html## Website: https://pypath.omnipathdb.org/#fromfuture.utilsimportiteritemsimportosimportsysimportwebbrowserimportpandasaspdtry:importcPickleaspickleexcept:importpickleimportpypath.share.curlascurlimportpypath.share.commonascommonimportpypath.resources.urlsasurlsfrompypath.inputsimportpubmedaspubmed_inputimportpypath.share.cacheascacheimportpypath.inputs.pubmedaspubmed
def _frame_from_records(records, columns):
    """
    Builds a data frame from a list of row tuples and labels its columns.

    An empty list of records yields an empty frame that still carries
    the requested column labels, instead of failing on a column count
    mismatch.

    :param records: List of equal length tuples, one per row.
    :param columns: List of column labels; may contain duplicates.
    """

    if records:
        frame = pd.DataFrame.from_records(records)
    else:
        # placeholder labels; the real ones (possibly duplicated) are
        # assigned below exactly as for the non-empty case
        frame = pd.DataFrame(columns = range(len(columns)))

    frame.columns = columns

    return frame


def get_pubmed_data(pp, cachefile = None, htp_threshold = 20):
    """
    For one PyPath object, obtains metadata for all PubMed IDs
    through NCBI E-utils.

    Records already present in the pickle cache are reused; only the
    missing ones are requested and the cache is rewritten only if it
    has grown.

    :param pp: ``pypath.PyPath`` object
    :param cachefile: Path of the pickle file used as cache for the
        PubMed records. If ``None``, the path is taken from
        ``cache.cache_item('pubmed_cache')``.
    :param htp_threshold: The number of interactions for one reference
        above the study considered to be high-throughput. References
        listed as high-throughput at this threshold are ignored. If
        ``None``, no reference is excluded.

    :return: A tuple of two ``pandas.DataFrame`` objects. The first has
        one row for each unique (database, reference, edge) combination,
        with columns `database`, `pmid`, `year`, `journal` and `eid`.
        The second has one row for each unique (database, edge) pair
        carrying the earliest publication year, with columns
        `database`, `none`, `year`, `none` and `eid`.
    """

    if cachefile is None:

        cachefile = cache.cache_item('pubmed_cache')

    if htp_threshold is None:

        htrefs = frozenset()

    else:

        pp.htp_stats()
        # looked up once here instead of once for every reference below
        htrefs = pp.htp[htp_threshold]['htrefs']

    # always a set, so the membership tests below are constant time
    pubmeds = set(
        r.pmid
        for e in pp.graph.es
        for r in e['references']
    ).difference(htrefs)

    notpmid = [i for i in pubmeds if not i.isdigit()]

    sys.stdout.write(
        '\t:: Number of non PubMed ID references: %u\n' % len(notpmid)
    )

    pmdata = {}

    if os.path.exists(cachefile):

        sys.stdout.write(
            '\t:: Loading data previously downloaded '
            'from PubMed, from file `%s`\n' % cachefile
        )

        # NOTE(review): unpickling executes arbitrary code; the cache
        # file must come from a trusted location.
        with open(cachefile, 'rb') as fp:

            pmdata = pickle.load(fp)

    cached_pubmeds_len = len(pmdata)
    missing = list(pubmeds - set(pmdata.keys()))

    if missing:

        sys.stdout.write(
            '\t:: Downloading data from PubMed about %s papers\n' %
            len(missing)
        )
        pmdata.update(pubmed_input.get_pubmeds(missing))

    if len(pmdata) > cached_pubmeds_len:

        sys.stdout.write(
            '\t:: Saving PubMed data to file `%s`\n' % cachefile
        )

        with open(cachefile, 'wb') as fp:

            pickle.dump(pmdata, fp)

    # the cache may contain records of other networks
    pmdata = dict(i for i in pmdata.items() if i[0] in pubmeds)

    points = []
    earliest = []

    for e in pp.graph.es:

        for s, rs in iteritems(e['refs_by_source']):

            pms = [
                r.pmid
                for r in rs
                if (
                    r.pmid not in htrefs and
                    r.pmid in pmdata and
                    'pubdate' in pmdata[r.pmid]
                )
            ]

            if not pms:

                continue

            # the year is the first four characters of the `pubdate` field
            yrs = [int(pmdata[pm]['pubdate'][:4]) for pm in pms]

            earliest.append((s, 0, min(yrs), '', e.index))

            for pm, yr in zip(pms, yrs):

                points.append((s, pm, yr, pmdata[pm]['source'], e.index))

    points = _frame_from_records(
        common.unique_list(points),
        ['database', 'pmid', 'year', 'journal', 'eid'],
    )
    earliest = _frame_from_records(
        common.unique_list(earliest),
        ['database', 'none', 'year', 'none', 'eid'],
    )

    return points, earliest