Source code for pypath.internals.refs

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#  This file is part of the `pypath` python module
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt for the full license text.
#  Website:

from future.utils import iteritems
import os
import sys
import webbrowser
import pandas as pd
    import cPickle as pickle
    import pickle

import pypath.share.curl as curl
import pypath.share.common as common
import pypath.resources.urls as urls
from pypath.inputs import pubmed as pubmed_input
import pypath.share.cache as cache
import pypath.inputs.pubmed as pubmed

class Reference(object):
    """
    A literature reference identified by a PubMed ID.

    Two references are equal, and hash identically, when their ``pmid``
    strings are equal, so instances can be used in sets and as dict keys.
    """

    __slots__ = ['pmid']


    def __init__(self, pmid):
        """
        :param pmid:
            The identifier of the reference. It is converted to ``str``
            and stripped of leading and trailing whitespace.
        """

        self.pmid = str(pmid).strip()


    def __eq__(self, other):
        """
        Compares by ``pmid``. Objects without a ``pmid`` attribute are
        not comparable: ``NotImplemented`` is returned, so ``==``
        evaluates to ``False`` instead of raising ``AttributeError``.
        """

        try:
            return self.pmid == other.pmid

        except AttributeError:
            return NotImplemented


    def __ne__(self, other):
        """
        Inverse of ``__eq__``. Defined explicitly because Python 2 does
        not derive ``!=`` from ``__eq__``.
        """

        equal = self.__eq__(other)

        return equal if equal is NotImplemented else not equal


    def __hash__(self):

        return hash(self.pmid)


    def open(self):
        """
        Passes the ``pmid`` to ``pubmed_input.open_pubmed``.
        """

        pubmed_input.open_pubmed(self.pmid)


    def __str__(self):

        return self.pmid


    def info(self):
        """
        Returns the result of ``pubmed_input.get_pubmeds`` called with
        this single ``pmid``.
        """

        return pubmed_input.get_pubmeds([self.pmid])


    def __repr__(self):

        return '<Reference: %s>' % self.pmid
def get_pubmed_data(
        pp,
        cachefile = None,
        htp_threshold = 20
    ):
    """
    For one PyPath object, obtains metadata for all PubMed IDs through
    NCBI E-utils.

    Previously downloaded records are read from a pickle cache file; only
    the records missing from it are requested, and the cache file is
    rewritten only if new records arrived.

    :param pp:
        ``pypath.PyPath`` object
    :param cachefile:
        Path of the pickle cache file. If ``None``, the path is obtained
        from ``cache.cache_item('pubmed_cache')``.
    :param htp_threshold:
        The number of interactions for one reference above the study
        considered to be high-throughput. References listed in
        ``pp.htp[htp_threshold]['htrefs']`` are left out. If ``None``,
        no reference is excluded and ``pp.htp_stats()`` is not called.

    :return:
        A tuple of two ``pandas.DataFrame`` objects. The first has one
        row for each unique (database, pmid, year, journal, edge index)
        record, with columns ``database``, ``pmid``, ``year``,
        ``journal``, ``eid``. The second has one row for each unique
        (database, edge index) pair with the earliest publication year,
        with columns ``database``, ``none``, ``year``, ``none``, ``eid``.
        Both frames are empty, but carry these columns, if no record
        was collected.
    """

    if cachefile is None:

        cachefile = cache.cache_item('pubmed_cache')

    if htp_threshold is not None:

        pp.htp_stats()

    # NOTE(review): the iterable of the two edge loops was missing from
    # the text this block was recovered from. `pp.graph.es` is assumed
    # because the loop bodies read edge attributes (`e['references']`,
    # `e['refs_by_source']`) and `e.index` -- confirm against the
    # PyPath class.
    edges = pp.graph.es

    # always a set: it is only used for counting, set difference and
    # membership tests
    pubmeds = set(
        common.unique_list(
            common.flat_list(
                [[r.pmid for r in e['references']] for e in edges]
            )
        )
    )

    htrefs = (
        pp.htp[htp_threshold]['htrefs']
            if htp_threshold is not None else
        frozenset()
    )
    pubmeds = pubmeds - htrefs

    notpmid = [i for i in pubmeds if not i.isdigit()]

    sys.stdout.write(
        '\t:: Number of non PubMed ID references: %u\n' % len(notpmid)
    )

    pmdata = {}

    if os.path.exists(cachefile):

        sys.stdout.write(
            '\t:: Loading data previously downloaded '
            'from PubMed, from file `%s`\n' % cachefile
        )

        # NOTE: unpickling can execute arbitrary code, the cache file
        # must come from a trusted location
        with open(cachefile, 'rb') as fp:

            pmdata = pickle.load(fp)

    missing = list(pubmeds - set(pmdata.keys()))

    sys.stdout.write(
        '\t:: Downloading data from PubMed about %s papers\n' % len(missing)
    )

    cached_pubmeds_len = len(pmdata)
    pmdata.update(pubmed_input.get_pubmeds(missing))

    # write (and report writing) only if the download added something
    if len(pmdata) > cached_pubmeds_len:

        sys.stdout.write(
            '\t:: Saving PubMed data to file `%s`\n' % cachefile
        )

        with open(cachefile, 'wb') as fp:

            pickle.dump(pmdata, fp)

    # the cache may hold records unrelated to this network
    pmdata = dict(
        (pmid, record)
        for pmid, record in pmdata.items()
        if pmid in pubmeds
    )

    points = []
    earliest = []

    for e in edges:

        for s, rs in iteritems(e['refs_by_source']):

            pms = [
                r.pmid
                for r in rs
                if (
                    r.pmid not in htrefs and
                    r.pmid in pmdata and
                    'pubdate' in pmdata[r.pmid]
                )
            ]

            if not pms:

                continue

            # the first four characters of `pubdate` are read as the year
            years = [int(pmdata[pm]['pubdate'][:4]) for pm in pms]

            earliest.append((s, 0, min(years), '', e.index))

            for pm, year in zip(pms, years):

                points.append(
                    (s, pm, year, pmdata[pm]['source'], e.index)
                )

    points = _records_to_frame(
        common.unique_list(points),
        ['database', 'pmid', 'year', 'journal', 'eid'],
    )
    earliest = _records_to_frame(
        common.unique_list(earliest),
        ['database', 'none', 'year', 'none', 'eid'],
    )

    return points, earliest


def _records_to_frame(records, columns):
    """
    Builds a data frame from a list of tuples and labels its columns.

    Without records ``DataFrame.from_records`` yields a frame with no
    columns, and assigning the labels to it would raise ``ValueError``;
    in that case an empty frame with the requested columns is returned.
    """

    if not len(records):

        return pd.DataFrame(columns = columns)

    frame = pd.DataFrame.from_records(records)
    frame.columns = columns

    return frame