#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of the `pypath` python module
#
# Copyright 2014-2023
# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
# Authors: see the file `README.rst`
# Contact: Dénes Türei (turei.denes@gmail.com)
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# https://www.gnu.org/licenses/gpl-3.0.html
#
# Website: https://pypath.omnipathdb.org/
#
from future.utils import iteritems

import os
import sys
import webbrowser

import pandas as pd

try:
    import cPickle as pickle
except ImportError:
    import pickle

import pypath.share.curl as curl
import pypath.share.common as common
import pypath.resources.urls as urls
from pypath.inputs import pubmed as pubmed_input
import pypath.share.cache as cache
class Reference(object):
    """
    Represents a single literature reference by its PubMed ID (PMID).
    """

    __slots__ = ['pmid']

    def __init__(self, pmid):
        self.pmid = str(pmid).strip()

    def __eq__(self, other):
        return self.pmid == other.pmid

    def __hash__(self):
        return hash(self.pmid)

    def open(self):
        # open the PubMed page of this reference in the default browser
        pubmed_input.open_pubmed(self.pmid)

    def __str__(self):
        return self.pmid

    def info(self):
        # retrieve the metadata of this reference from PubMed
        return pubmed_input.get_pubmeds([self.pmid])

    def __repr__(self):
        return '<Reference: %s>' % self.pmid
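# A minimal usage sketch (the PMID is illustrative; `open` needs a browser
# and `info` needs network access, hence it is left commented out):
#
#     >>> ref = Reference(' 12345678 ')    # PMIDs are normalised to strings
#     >>> ref == Reference('12345678')
#     True
#     >>> str(ref)
#     '12345678'
#     >>> ref.info()    # metadata from NCBI E-utils, keyed by PMID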
def get_pubmed_data(
        pp,
        cachefile = None,
        htp_threshold = 20
    ):
    """
    For one PyPath object, obtains metadata for all PubMed IDs
    through NCBI E-utils.

    :param pp:
        A ``pypath.PyPath`` object.
    :param cachefile:
        Path to a pickle file caching the data downloaded from PubMed.
        If ``None``, the default cache item ``pubmed_cache`` is used.
    :param htp_threshold:
        The number of interactions per reference above which a study is
        considered high-throughput; references from such studies are
        discarded. If ``None``, no filtering is applied.
    :return:
        Two ``pandas.DataFrame`` objects: one record for each
        (database, reference, edge) combination, and the earliest
        publication year for each (database, edge) combination.
    """
    if cachefile is None:
        cachefile = cache.cache_item('pubmed_cache')

    if htp_threshold is not None:
        pp.htp_stats()

    # all PMIDs occurring on the edges of the network
    pubmeds = common.unique_list(
        common.flat_list([
            [r.pmid for r in e['references']]
            for e in pp.graph.es
        ])
    )

    if htp_threshold is not None:
        # discard references from high-throughput studies
        pubmeds = set(pubmeds) - pp.htp[htp_threshold]['htrefs']

    notpmid = [i for i in pubmeds if not i.isdigit()]
    sys.stdout.write(
        '\t:: Number of non PubMed ID references: %u\n' % len(notpmid)
    )

    pmdata = {}

    if os.path.exists(cachefile):
        sys.stdout.write(
            '\t:: Loading data previously downloaded '
            'from PubMed, from file `%s`\n' % cachefile
        )
        with open(cachefile, 'rb') as fp:
            pmdata = pickle.load(fp)

    # download metadata only for the PMIDs not in the cache yet
    missing = list(set(pubmeds) - set(pmdata.keys()))
    sys.stdout.write(
        '\t:: Downloading data from PubMed about %u papers\n' % len(missing)
    )
    cached_pubmeds_len = len(pmdata)
    pmdata.update(pubmed_input.get_pubmeds(missing))

    # save the cache only if anything new has been downloaded
    if len(pmdata) > cached_pubmeds_len:
        sys.stdout.write('\t:: Saving PubMed data to file `%s`\n' % cachefile)
        with open(cachefile, 'wb') as fp:
            pickle.dump(pmdata, fp)

    # keep only the entries belonging to the current set of PMIDs
    pmdata = dict(i for i in pmdata.items() if i[0] in pubmeds)
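    # Two kinds of records are assembled below, matching the DataFrame
    # columns assigned at the end of this function:
    #   points:   (database, pmid, publication year, journal, edge index)
    #   earliest: (database, 0, earliest year, '', edge index)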
    points = []
    earliest = []

    for e in pp.graph.es:
        for s, rs in iteritems(e['refs_by_source']):
            pms = [
                r.pmid for r in rs
                if (
                    htp_threshold is None or
                    r.pmid not in pp.htp[htp_threshold]['htrefs']
                ) and
                r.pmid in pmdata and
                'pubdate' in pmdata[r.pmid]
            ]

            if len(pms) > 0:
                yrs = [int(pmdata[pm]['pubdate'][:4]) for pm in pms]
                earliest.append((s, 0, min(yrs), '', e.index))

                for pm in pms:
                    points.append((
                        s,
                        pm,
                        int(pmdata[pm]['pubdate'][:4]),
                        pmdata[pm]['source'],
                        e.index,
                    ))

    points = common.unique_list(points)
    earliest = common.unique_list(earliest)

    points = pd.DataFrame.from_records(points)
    earliest = pd.DataFrame.from_records(earliest)
    points.columns = ['database', 'pmid', 'year', 'journal', 'eid']
    earliest.columns = ['database', 'none', 'year', 'none', 'eid']

    return points, earliest
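# A hedged usage sketch (assumes an already built ``pypath.PyPath`` network
# called ``pp``; the variable name and the grouping step are illustrative):
#
#     >>> points, earliest = get_pubmed_data(pp, htp_threshold = 20)
#     >>> points.columns.tolist()
#     ['database', 'pmid', 'year', 'journal', 'eid']
#     >>> earliest.groupby('database')['year'].min()    # first year per resource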