{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# The pypath book"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "toc": true
   },
   "source": [
    "<h1>Contents<span class=\"tocSkip\"></span></h1>\n",
    "<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Introduction\" data-toc-modified-id=\"Introduction-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Introduction</a></span></li><li><span><a href=\"#Build,-load-and-save-databases\" data-toc-modified-id=\"Build,-load-and-save-databases-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Build, load and save databases</a></span><ul class=\"toc-item\"><li><span><a href=\"#The-OmniPath-app\" data-toc-modified-id=\"The-OmniPath-app-2.1\"><span class=\"toc-item-num\">2.1&nbsp;&nbsp;</span>The OmniPath app</a></span></li><li><span><a href=\"#Built-in-database-definitions\" data-toc-modified-id=\"Built-in-database-definitions-2.2\"><span class=\"toc-item-num\">2.2&nbsp;&nbsp;</span>Built-in database definitions</a></span></li><li><span><a href=\"#Networks\" data-toc-modified-id=\"Networks-2.3\"><span class=\"toc-item-num\">2.3&nbsp;&nbsp;</span>Networks</a></span><ul class=\"toc-item\"><li><span><a href=\"#Strictly-literature-curated-network\" data-toc-modified-id=\"Strictly-literature-curated-network-2.3.1\"><span class=\"toc-item-num\">2.3.1&nbsp;&nbsp;</span>Strictly literature curated network</a></span></li><li><span><a href=\"#The-OmniPath-network-with-extra-activity-flow,-enzyme-substrate-and-ligand-receptor-interactions\" data-toc-modified-id=\"The-OmniPath-network-with-extra-activity-flow,-enzyme-substrate-and-ligand-receptor-interactions-2.3.2\"><span class=\"toc-item-num\">2.3.2&nbsp;&nbsp;</span>The OmniPath network with extra activity flow, enzyme-substrate and ligand-receptor interactions</a></span></li><li><span><a href=\"#Transcriptional-regulation-network-from-DoRothEA-and-other-resources\" data-toc-modified-id=\"Transcriptional-regulation-network-from-DoRothEA-and-other-resources-2.3.3\"><span class=\"toc-item-num\">2.3.3&nbsp;&nbsp;</span>Transcriptional regulation network from DoRothEA and other resources</a></span></li><li><span><a 
href=\"#Literature-curated-miRNA-post-transcriptional-regulation-network\" data-toc-modified-id=\"Literature-curated-miRNA-post-transcriptional-regulation-network-2.3.4\"><span class=\"toc-item-num\">2.3.4&nbsp;&nbsp;</span>Literature curated miRNA post-transcriptional regulation network</a></span></li><li><span><a href=\"#Transcriptional-regulation-of-miRNA\" data-toc-modified-id=\"Transcriptional-regulation-of-miRNA-2.3.5\"><span class=\"toc-item-num\">2.3.5&nbsp;&nbsp;</span>Transcriptional regulation of miRNA</a></span></li><li><span><a href=\"#lncRNA-mRNA-interactions\" data-toc-modified-id=\"lncRNA-mRNA-interactions-2.3.6\"><span class=\"toc-item-num\">2.3.6&nbsp;&nbsp;</span>lncRNA-mRNA interactions</a></span></li><li><span><a href=\"#Small-molecule-protein-interactions\" data-toc-modified-id=\"Small-molecule-protein-interactions-2.3.7\"><span class=\"toc-item-num\">2.3.7&nbsp;&nbsp;</span>Small molecule-protein interactions</a></span></li></ul></li><li><span><a href=\"#Enzyme-substrate-relationships\" data-toc-modified-id=\"Enzyme-substrate-relationships-2.4\"><span class=\"toc-item-num\">2.4&nbsp;&nbsp;</span>Enzyme-substrate relationships</a></span></li><li><span><a href=\"#Protein-complexes\" data-toc-modified-id=\"Protein-complexes-2.5\"><span class=\"toc-item-num\">2.5&nbsp;&nbsp;</span>Protein complexes</a></span></li><li><span><a href=\"#Annotations\" data-toc-modified-id=\"Annotations-2.6\"><span class=\"toc-item-num\">2.6&nbsp;&nbsp;</span>Annotations</a></span></li><li><span><a href=\"#Inter-cellular-communication-roles\" data-toc-modified-id=\"Inter-cellular-communication-roles-2.7\"><span class=\"toc-item-num\">2.7&nbsp;&nbsp;</span>Inter-cellular communication roles</a></span></li></ul></li><li><span><a href=\"#Data-directly-from-the-original-resources\" data-toc-modified-id=\"Data-directly-from-the-original-resources-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Data directly from the original resources</a></span></li><li><span><a 
href=\"#Interesting-resources\" data-toc-modified-id=\"Interesting-resources-4\"><span class=\"toc-item-num\">4&nbsp;&nbsp;</span>Interesting resources</a></span><ul class=\"toc-item\"><li><span><a href=\"#RaMP\" data-toc-modified-id=\"RaMP-4.1\"><span class=\"toc-item-num\">4.1&nbsp;&nbsp;</span>RaMP</a></span><ul class=\"toc-item\"><li><span><a href=\"#TL;DR\" data-toc-modified-id=\"TL;DR-4.1.1\"><span class=\"toc-item-num\">4.1.1&nbsp;&nbsp;</span>TL;DR</a></span></li></ul></li><li><span><a href=\"#HMDB-(Human-Metabolome-Database)\" data-toc-modified-id=\"HMDB-(Human-Metabolome-Database)-4.2\"><span class=\"toc-item-num\">4.2&nbsp;&nbsp;</span>HMDB (Human Metabolome Database)</a></span><ul class=\"toc-item\"><li><span><a href=\"#Direct-access-to-HMDB-data\" data-toc-modified-id=\"Direct-access-to-HMDB-data-4.2.1\"><span class=\"toc-item-num\">4.2.1&nbsp;&nbsp;</span>Direct access to HMDB data</a></span></li><li><span><a href=\"#Higher-level-access-to-HMDB-data\" data-toc-modified-id=\"Higher-level-access-to-HMDB-data-4.2.2\"><span class=\"toc-item-num\">4.2.2&nbsp;&nbsp;</span>Higher level access to HMDB data</a></span></li><li><span><a href=\"#ID-translation-with-HMDB\" data-toc-modified-id=\"ID-translation-with-HMDB-4.2.3\"><span class=\"toc-item-num\">4.2.3&nbsp;&nbsp;</span>ID translation with HMDB</a></span></li></ul></li><li><span><a href=\"#NCBI-E-Utils\" data-toc-modified-id=\"NCBI-E-Utils-4.3\"><span class=\"toc-item-num\">4.3&nbsp;&nbsp;</span>NCBI E-Utils</a></span></li></ul></li><li><span><a href=\"#Download-management\" data-toc-modified-id=\"Download-management-5\"><span class=\"toc-item-num\">5&nbsp;&nbsp;</span>Download management</a></span><ul class=\"toc-item\"><li><span><a href=\"#Cache-management-and-customization\" data-toc-modified-id=\"Cache-management-and-customization-5.1\"><span class=\"toc-item-num\">5.1&nbsp;&nbsp;</span>Cache management and customization</a></span></li><li><span><a href=\"#Download-failures\" 
data-toc-modified-id=\"Download-failures-5.2\"><span class=\"toc-item-num\">5.2&nbsp;&nbsp;</span>Download failures</a></span><ul class=\"toc-item\"><li><span><a href=\"#Corrupted-cache-content\" data-toc-modified-id=\"Corrupted-cache-content-5.2.1\"><span class=\"toc-item-num\">5.2.1&nbsp;&nbsp;</span>Corrupted cache content</a></span></li><li><span><a href=\"#Network-communication-issues:-look-into-the-curl-debug-log\" data-toc-modified-id=\"Network-communication-issues:-look-into-the-curl-debug-log-5.2.2\"><span class=\"toc-item-num\">5.2.2&nbsp;&nbsp;</span>Network communication issues: look into the curl debug log</a></span></li><li><span><a href=\"#Timeouts\" data-toc-modified-id=\"Timeouts-5.2.3\"><span class=\"toc-item-num\">5.2.3&nbsp;&nbsp;</span>Timeouts</a></span></li><li><span><a href=\"#Access-and-inspect-the-Curl-object\" data-toc-modified-id=\"Access-and-inspect-the-Curl-object-5.2.4\"><span class=\"toc-item-num\">5.2.4&nbsp;&nbsp;</span>Access and inspect the <code>Curl</code> object</a></span></li><li><span><a href=\"#Is-it-failing-only-for-you?\" data-toc-modified-id=\"Is-it-failing-only-for-you?-5.2.5\"><span class=\"toc-item-num\">5.2.5&nbsp;&nbsp;</span>Is it failing only for you?</a></span></li><li><span><a href=\"#Read-the-log\" data-toc-modified-id=\"Read-the-log-5.2.6\"><span class=\"toc-item-num\">5.2.6&nbsp;&nbsp;</span>Read the log</a></span></li><li><span><a href=\"#TLS-(SSL,-HTTPS)-errors\" data-toc-modified-id=\"TLS-(SSL,-HTTPS)-errors-5.2.7\"><span class=\"toc-item-num\">5.2.7&nbsp;&nbsp;</span>TLS (SSL, HTTPS) errors</a></span></li></ul></li></ul></li><li><span><a href=\"#Resources\" data-toc-modified-id=\"Resources-6\"><span class=\"toc-item-num\">6&nbsp;&nbsp;</span>Resources</a></span><ul class=\"toc-item\"><li><span><a href=\"#Licenses\" data-toc-modified-id=\"Licenses-6.1\"><span class=\"toc-item-num\">6.1&nbsp;&nbsp;</span>Licenses</a></span><ul class=\"toc-item\"><li><span><a 
href=\"#Example:-build-a-network-for-commercial-use\" data-toc-modified-id=\"Example:-build-a-network-for-commercial-use-6.1.1\"><span class=\"toc-item-num\">6.1.1&nbsp;&nbsp;</span>Example: build a network for commercial use</a></span></li></ul></li><li><span><a href=\"#Resource-information\" data-toc-modified-id=\"Resource-information-6.2\"><span class=\"toc-item-num\">6.2&nbsp;&nbsp;</span>Resource information</a></span></li><li><span><a href=\"#Resource-definitions-for-a-certain-database-or-dataset\" data-toc-modified-id=\"Resource-definitions-for-a-certain-database-or-dataset-6.3\"><span class=\"toc-item-num\">6.3&nbsp;&nbsp;</span>Resource definitions for a certain database or dataset</a></span></li></ul></li><li><span><a href=\"#Building-networks-\" data-toc-modified-id=\"Building-networks--7\"><span class=\"toc-item-num\">7&nbsp;&nbsp;</span>Building networks <a class=\"anchor\" id=\"building-networks\" rel=\"nofollow\"></a></a></span><ul class=\"toc-item\"><li><span><a href=\"#Which-network-datasets-are-pre-defined-in-pypath?-\" data-toc-modified-id=\"Which-network-datasets-are-pre-defined-in-pypath?--7.1\"><span class=\"toc-item-num\">7.1&nbsp;&nbsp;</span>Which network datasets are pre-defined in pypath? 
<a class=\"anchor\" id=\"network-resources\" rel=\"nofollow\"></a></a></span></li><li><span><a href=\"#The-Network-object\" data-toc-modified-id=\"The-Network-object-7.2\"><span class=\"toc-item-num\">7.2&nbsp;&nbsp;</span>The <code>Network</code> object</a></span></li><li><span><a href=\"#Network-in-pandas.DataFrame-\" data-toc-modified-id=\"Network-in-pandas.DataFrame--7.3\"><span class=\"toc-item-num\">7.3&nbsp;&nbsp;</span>Network in <em>pandas.DataFrame</em> <a class=\"anchor\" id=\"network-pandas\" rel=\"nofollow\"></a></a></span></li><li><span><a href=\"#Self-interactions-(loop-edges)-in-the-network\" data-toc-modified-id=\"Self-interactions-(loop-edges)-in-the-network-7.4\"><span class=\"toc-item-num\">7.4&nbsp;&nbsp;</span>Self interactions (loop edges) in the network</a></span></li><li><span><a href=\"#Molecular-complexes-in-the-network\" data-toc-modified-id=\"Molecular-complexes-in-the-network-7.5\"><span class=\"toc-item-num\">7.5&nbsp;&nbsp;</span>Molecular complexes in the network</a></span></li></ul></li><li><span><a href=\"#Translating-identifiers-\" data-toc-modified-id=\"Translating-identifiers--8\"><span class=\"toc-item-num\">8&nbsp;&nbsp;</span>Translating identifiers <a class=\"anchor\" id=\"mapping\" rel=\"nofollow\"></a></a></span><ul class=\"toc-item\"><li><span><a href=\"#Pre-defined-ID-translation-tables\" data-toc-modified-id=\"Pre-defined-ID-translation-tables-8.1\"><span class=\"toc-item-num\">8.1&nbsp;&nbsp;</span>Pre-defined ID translation tables</a></span></li><li><span><a href=\"#Direct-access-to-ID-translation-tables\" data-toc-modified-id=\"Direct-access-to-ID-translation-tables-8.2\"><span class=\"toc-item-num\">8.2&nbsp;&nbsp;</span>Direct access to ID translation tables</a></span></li></ul></li><li><span><a href=\"#Orthology-translation\" data-toc-modified-id=\"Orthology-translation-9\"><span class=\"toc-item-num\">9&nbsp;&nbsp;</span>Orthology translation</a></span><ul class=\"toc-item\"><li><span><a 
href=\"#Orthology-translation-tables-as-dictionaries\" data-toc-modified-id=\"Orthology-translation-tables-as-dictionaries-9.1\"><span class=\"toc-item-num\">9.1&nbsp;&nbsp;</span>Orthology translation tables as dictionaries</a></span></li><li><span><a href=\"#Orthology-translation-data-frames\" data-toc-modified-id=\"Orthology-translation-data-frames-9.2\"><span class=\"toc-item-num\">9.2&nbsp;&nbsp;</span>Orthology translation data frames</a></span></li></ul></li><li><span><a href=\"#Taxonomy\" data-toc-modified-id=\"Taxonomy-10\"><span class=\"toc-item-num\">10&nbsp;&nbsp;</span>Taxonomy</a></span><ul class=\"toc-item\"><li><span><a href=\"#Translating-to-NCBI-Taxonomy,-scientific-names-and-common-names\" data-toc-modified-id=\"Translating-to-NCBI-Taxonomy,-scientific-names-and-common-names-10.1\"><span class=\"toc-item-num\">10.1&nbsp;&nbsp;</span>Translating to NCBI Taxonomy, scientific names and common names</a></span></li><li><span><a href=\"#Organism-from-UniProt-ID\" data-toc-modified-id=\"Organism-from-UniProt-ID-10.2\"><span class=\"toc-item-num\">10.2&nbsp;&nbsp;</span>Organism from UniProt ID</a></span></li></ul></li><li><span><a href=\"#UniProt\" data-toc-modified-id=\"UniProt-11\"><span class=\"toc-item-num\">11&nbsp;&nbsp;</span>UniProt</a></span><ul class=\"toc-item\"><li><span><a href=\"#The-UniProt-input-module\" data-toc-modified-id=\"The-UniProt-input-module-11.1\"><span class=\"toc-item-num\">11.1&nbsp;&nbsp;</span>The UniProt input module</a></span><ul class=\"toc-item\"><li><span><a href=\"#All-UniProt-IDs-for-one-organism\" data-toc-modified-id=\"All-UniProt-IDs-for-one-organism-11.1.1\"><span class=\"toc-item-num\">11.1.1&nbsp;&nbsp;</span>All UniProt IDs for one organism</a></span></li><li><span><a href=\"#UniProt-ID-format-validation\" data-toc-modified-id=\"UniProt-ID-format-validation-11.1.2\"><span class=\"toc-item-num\">11.1.2&nbsp;&nbsp;</span>UniProt ID format validation</a></span></li><li><span><a href=\"#UniProt-ID-validation\" 
data-toc-modified-id=\"UniProt-ID-validation-11.1.3\"><span class=\"toc-item-num\">11.1.3&nbsp;&nbsp;</span>UniProt ID validation</a></span></li><li><span><a href=\"#Single-UniProt-protein-datasheet\" data-toc-modified-id=\"Single-UniProt-protein-datasheet-11.1.4\"><span class=\"toc-item-num\">11.1.4&nbsp;&nbsp;</span>Single UniProt protein datasheet</a></span></li><li><span><a href=\"#History-of-UniProt-records\" data-toc-modified-id=\"History-of-UniProt-records-11.1.5\"><span class=\"toc-item-num\">11.1.5&nbsp;&nbsp;</span>History of UniProt records</a></span></li><li><span><a href=\"#UniProt-REST-API\" data-toc-modified-id=\"UniProt-REST-API-11.1.6\"><span class=\"toc-item-num\">11.1.6&nbsp;&nbsp;</span>UniProt REST API</a></span></li><li><span><a href=\"#Processed-UniProt-annotations\" data-toc-modified-id=\"Processed-UniProt-annotations-11.1.7\"><span class=\"toc-item-num\">11.1.7&nbsp;&nbsp;</span>Processed UniProt annotations</a></span></li></ul></li><li><span><a href=\"#The-UniProt-utils-module\" data-toc-modified-id=\"The-UniProt-utils-module-11.2\"><span class=\"toc-item-num\">11.2&nbsp;&nbsp;</span>The UniProt utils module</a></span><ul class=\"toc-item\"><li><span><a href=\"#Datasheets\" data-toc-modified-id=\"Datasheets-11.2.1\"><span class=\"toc-item-num\">11.2.1&nbsp;&nbsp;</span>Datasheets</a></span></li><li><span><a href=\"#Tables\" data-toc-modified-id=\"Tables-11.2.2\"><span class=\"toc-item-num\">11.2.2&nbsp;&nbsp;</span>Tables</a></span></li></ul></li><li><span><a href=\"#Sanitizing-UniProt-IDs\" data-toc-modified-id=\"Sanitizing-UniProt-IDs-11.3\"><span class=\"toc-item-num\">11.3&nbsp;&nbsp;</span>Sanitizing UniProt IDs</a></span></li></ul></li><li><span><a href=\"#Enzyme-substrate-interactions-\" data-toc-modified-id=\"Enzyme-substrate-interactions--12\"><span class=\"toc-item-num\">12&nbsp;&nbsp;</span>Enzyme-substrate interactions <a class=\"anchor\" id=\"enz-sub\" rel=\"nofollow\"></a></a></span><ul class=\"toc-item\"><li><span><a 
href=\"#Enzyme-substrate-objects\" data-toc-modified-id=\"Enzyme-substrate-objects-12.1\"><span class=\"toc-item-num\">12.1&nbsp;&nbsp;</span>Enzyme-substrate objects</a></span></li><li><span><a href=\"#Enzyme-substrate-data-frame\" data-toc-modified-id=\"Enzyme-substrate-data-frame-12.2\"><span class=\"toc-item-num\">12.2&nbsp;&nbsp;</span>Enzyme-substrate data frame</a></span></li></ul></li><li><span><a href=\"#Protein-sequences\" data-toc-modified-id=\"Protein-sequences-13\"><span class=\"toc-item-num\">13&nbsp;&nbsp;</span>Protein sequences</a></span></li><li><span><a href=\"#Annotations-\" data-toc-modified-id=\"Annotations--14\"><span class=\"toc-item-num\">14&nbsp;&nbsp;</span>Annotations <a class=\"anchor\" id=\"annotations\" rel=\"nofollow\"></a></a></span><ul class=\"toc-item\"><li><span><a href=\"#Load-a-single-annotation-resource\" data-toc-modified-id=\"Load-a-single-annotation-resource-14.1\"><span class=\"toc-item-num\">14.1&nbsp;&nbsp;</span>Load a single annotation resource</a></span></li><li><span><a href=\"#Load-the-full-annotations-database-by-the-database-manager\" data-toc-modified-id=\"Load-the-full-annotations-database-by-the-database-manager-14.2\"><span class=\"toc-item-num\">14.2&nbsp;&nbsp;</span>Load the full annotations database by the database manager</a></span></li><li><span><a href=\"#Load-only-selected-annotations\" data-toc-modified-id=\"Load-only-selected-annotations-14.3\"><span class=\"toc-item-num\">14.3&nbsp;&nbsp;</span>Load only selected annotations</a></span></li><li><span><a href=\"#Data-frames-of-annotations\" data-toc-modified-id=\"Data-frames-of-annotations-14.4\"><span class=\"toc-item-num\">14.4&nbsp;&nbsp;</span>Data frames of annotations</a></span></li></ul></li><li><span><a href=\"#Inter-cellular-signaling-roles-\" data-toc-modified-id=\"Inter-cellular-signaling-roles--15\"><span class=\"toc-item-num\">15&nbsp;&nbsp;</span>Inter-cellular signaling roles <a class=\"anchor\" id=\"intercell\" 
rel=\"nofollow\"></a></a></span><ul class=\"toc-item\"><li><span><a href=\"#Build-an-intercellular-communication-network\" data-toc-modified-id=\"Build-an-intercellular-communication-network-15.1\"><span class=\"toc-item-num\">15.1&nbsp;&nbsp;</span>Build an intercellular communication network</a></span></li><li><span><a href=\"#Quantitative-overview-of-intercell-annotations\" data-toc-modified-id=\"Quantitative-overview-of-intercell-annotations-15.2\"><span class=\"toc-item-num\">15.2&nbsp;&nbsp;</span>Quantitative overview of intercell annotations</a></span></li><li><span><a href=\"#Intercell-database-as-data-frame\" data-toc-modified-id=\"Intercell-database-as-data-frame-15.3\"><span class=\"toc-item-num\">15.3&nbsp;&nbsp;</span>Intercell database as data frame</a></span></li><li><span><a href=\"#Browse-intercell-categories\" data-toc-modified-id=\"Browse-intercell-categories-15.4\"><span class=\"toc-item-num\">15.4&nbsp;&nbsp;</span>Browse intercell categories</a></span></li></ul></li><li><span><a href=\"#Gene-Ontology-\" data-toc-modified-id=\"Gene-Ontology--16\"><span class=\"toc-item-num\">16&nbsp;&nbsp;</span>Gene Ontology <a class=\"anchor\" id=\"gene-ontology\" rel=\"nofollow\"></a></a></span></li><li><span><a href=\"#Protein-complexes-\" data-toc-modified-id=\"Protein-complexes--17\"><span class=\"toc-item-num\">17&nbsp;&nbsp;</span>Protein complexes <a class=\"anchor\" id=\"complexes\" rel=\"nofollow\"></a></a></span><ul class=\"toc-item\"><li><span><a href=\"#Protein-complex-objects\" data-toc-modified-id=\"Protein-complex-objects-17.1\"><span class=\"toc-item-num\">17.1&nbsp;&nbsp;</span>Protein complex objects</a></span></li><li><span><a href=\"#Protein-complex-data-frame\" data-toc-modified-id=\"Protein-complex-data-frame-17.2\"><span class=\"toc-item-num\">17.2&nbsp;&nbsp;</span>Protein complex data frame</a></span></li></ul></li><li><span><a href=\"#Saving-datasets-as-pickles-\" data-toc-modified-id=\"Saving-datasets-as-pickles--18\"><span 
class=\"toc-item-num\">18&nbsp;&nbsp;</span>Saving datasets as pickles <a class=\"anchor\" id=\"pickle\" rel=\"nofollow\"></a></a></span></li><li><span><a href=\"#Log-messages-and-sessions-\" data-toc-modified-id=\"Log-messages-and-sessions--19\"><span class=\"toc-item-num\">19&nbsp;&nbsp;</span>Log messages and sessions <a class=\"anchor\" id=\"log-session\" rel=\"nofollow\"></a></a></span><ul class=\"toc-item\"><li><span><a href=\"#Basic-info-about-the-session\" data-toc-modified-id=\"Basic-info-about-the-session-19.1\"><span class=\"toc-item-num\">19.1&nbsp;&nbsp;</span>Basic info about the session</a></span></li><li><span><a href=\"#Read-the-log-file\" data-toc-modified-id=\"Read-the-log-file-19.2\"><span class=\"toc-item-num\">19.2&nbsp;&nbsp;</span>Read the log file</a></span></li><li><span><a href=\"#Logging-to-the-console\" data-toc-modified-id=\"Logging-to-the-console-19.3\"><span class=\"toc-item-num\">19.3&nbsp;&nbsp;</span>Logging to the console</a></span></li><li><span><a href=\"#Disable-logging\" data-toc-modified-id=\"Disable-logging-19.4\"><span class=\"toc-item-num\">19.4&nbsp;&nbsp;</span>Disable logging</a></span></li><li><span><a href=\"#Write-to-the-log\" data-toc-modified-id=\"Write-to-the-log-19.5\"><span class=\"toc-item-num\">19.5&nbsp;&nbsp;</span>Write to the log</a></span><ul class=\"toc-item\"><li><span><a href=\"#Sending-a-single-message\" data-toc-modified-id=\"Sending-a-single-message-19.5.1\"><span class=\"toc-item-num\">19.5.1&nbsp;&nbsp;</span>Sending a single message</a></span></li><li><span><a href=\"#Connect-a-module-or-class-to-the-pypath-logger\" data-toc-modified-id=\"Connect-a-module-or-class-to-the-pypath-logger-19.5.2\"><span class=\"toc-item-num\">19.5.2&nbsp;&nbsp;</span>Connect a module or class to the pypath logger</a></span></li></ul></li></ul></li><li><span><a href=\"#BEL-export-\" data-toc-modified-id=\"BEL-export--20\"><span class=\"toc-item-num\">20&nbsp;&nbsp;</span>BEL export <a class=\"anchor\" id=\"bel\" 
rel=\"nofollow\"></a></a></span></li><li><span><a href=\"#CellPhoneDB-export-\" data-toc-modified-id=\"CellPhoneDB-export--21\"><span class=\"toc-item-num\">21&nbsp;&nbsp;</span>CellPhoneDB export <a class=\"anchor\" id=\"cellphonedb\" rel=\"nofollow\"></a></a></span></li><li><span><a href=\"#The-legacy-igraph-based-network-object-\" data-toc-modified-id=\"The-legacy-igraph-based-network-object--22\"><span class=\"toc-item-num\">22&nbsp;&nbsp;</span>The legacy <em>igraph</em>-based network object <a class=\"anchor\" id=\"legacy\" rel=\"nofollow\"></a></a></span><ul class=\"toc-item\"><li><span><a href=\"#I-just-want-a-network-quickly-and-play-around-with-pypath-\" data-toc-modified-id=\"I-just-want-a-network-quickly-and-play-around-with-pypath--22.1\"><span class=\"toc-item-num\">22.1&nbsp;&nbsp;</span>I just want a network quickly and play around with <em>pypath</em> <a class=\"anchor\" id=\"legacy-quick-start\" rel=\"nofollow\"></a></a></span></li><li><span><a href=\"#How-do-I-build-networks-from-any-data-with-pypath?-\" data-toc-modified-id=\"How-do-I-build-networks-from-any-data-with-pypath?--22.2\"><span class=\"toc-item-num\">22.2&nbsp;&nbsp;</span>How do I build networks from any data with <em>pypath</em>? 
<a class=\"anchor\" id=\"legacy-quick-start-2\" rel=\"nofollow\"></a></a></span><ul class=\"toc-item\"><li><span><a href=\"#Defining-input-formats-\" data-toc-modified-id=\"Defining-input-formats--22.2.1\"><span class=\"toc-item-num\">22.2.1&nbsp;&nbsp;</span>Defining input formats <a class=\"anchor\" id=\"input-formats\" rel=\"nofollow\"></a></a></span></li><li><span><a href=\"#Creating-PyPath-object-and-loading-the-2-test-files-\" data-toc-modified-id=\"Creating-PyPath-object-and-loading-the-2-test-files--22.2.2\"><span class=\"toc-item-num\">22.2.2&nbsp;&nbsp;</span>Creating PyPath object and loading the 2 test files <a class=\"anchor\" id=\"toy-example\" rel=\"nofollow\"></a></a></span></li></ul></li><li><span><a href=\"#Structure-of-the-legacy-network-object\" data-toc-modified-id=\"Structure-of-the-legacy-network-object-22.3\"><span class=\"toc-item-num\">22.3&nbsp;&nbsp;</span>Structure of the legacy network object</a></span><ul class=\"toc-item\"><li><span><a href=\"#Directions-and-signs-\" data-toc-modified-id=\"Directions-and-signs--22.3.1\"><span class=\"toc-item-num\">22.3.1&nbsp;&nbsp;</span>Directions and signs <a class=\"anchor\" id=\"directions\" rel=\"nofollow\"></a></a></span></li><li><span><a href=\"#Accessing-nodes-in-the-network-\" data-toc-modified-id=\"Accessing-nodes-in-the-network--22.3.2\"><span class=\"toc-item-num\">22.3.2&nbsp;&nbsp;</span>Accessing nodes in the network <a class=\"anchor\" id=\"nodes\" rel=\"nofollow\"></a></a></span></li></ul></li><li><span><a href=\"#Querying-relationships-with-our-without-causality-\" data-toc-modified-id=\"Querying-relationships-with-our-without-causality--22.4\"><span class=\"toc-item-num\">22.4&nbsp;&nbsp;</span>Querying relationships with our without causality <a class=\"anchor\" id=\"causality\" rel=\"nofollow\"></a></a></span></li><li><span><a href=\"#Accessing-edges-by-identifiers-\" data-toc-modified-id=\"Accessing-edges-by-identifiers--22.5\"><span 
class=\"toc-item-num\">22.5&nbsp;&nbsp;</span>Accessing edges by identifiers <a class=\"anchor\" id=\"edge-lookup\" rel=\"nofollow\"></a></a></span></li><li><span><a href=\"#Literature-references-\" data-toc-modified-id=\"Literature-references--22.6\"><span class=\"toc-item-num\">22.6&nbsp;&nbsp;</span>Literature references <a class=\"anchor\" id=\"references\" rel=\"nofollow\"></a></a></span></li><li><span><a href=\"#Plotting-the-network-with-igraph-\" data-toc-modified-id=\"Plotting-the-network-with-igraph--22.7\"><span class=\"toc-item-num\">22.7&nbsp;&nbsp;</span>Plotting the network with <em>igraph</em> <a class=\"anchor\" id=\"plot\" rel=\"nofollow\"></a></a></span></li></ul></li></ul></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Introduction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "OmniPath consists of 5 main database segments: network *(interactions)*, enzyme-substrate interactions *(enz_sub or ptms)*, protein complexes *(complexes)*, molecular entity annotations *(annotations)* and intercellular communication roles *(intercell)*. You can access all of these through the web service at https://omnipathdb.org/ and the <a href=\"https://r.omnipathdb.org/\">R/Bioconductor package *OmnipathR*</a>; the network and some of the annotations are also available through the <a href=\"http://apps.cytoscape.org/apps/omnipath\">Cytoscape app</a>. However, only *pypath* is able to build these databases directly from the original sources, with various options for customization, and to provide a rich and versatile API for each database, enjoying the almost unlimited flexibility of Python. This book attempts to be a guided tour around *pypath*; however, almost all objects, modules and APIs presented here have many more methods, options and features than we have a chance to cover. If you feel like there might be something useful for you, don't hesitate to ask us on [GitHub](https://github.com/saezlab/pypath/issues)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This document has been run with the following *pypath* version:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-09T16:49:47.104939Z",
     "start_time": "2023-03-09T16:49:47.057801Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'0.14.36'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pypath\n",
    "pypath.__version__"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build, load and save databases"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We provide a high-level interface in the module *pypath.omnipath.app*. This is the easiest way to build, manage and access the OmniPath databases, hence this is what we present in this section. In further sections we show the lower level modules in more detail."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### The OmniPath app"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*pypath.omnipath* is an application which contains a database manager at *omnipath.db*. This manager is empty by default. It builds and loads the databases on demand. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T14:11:27.592956Z",
     "start_time": "2022-12-03T14:11:26.254734Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pypath.omnipath.app.DatabaseManager at 0x602fb851cd90>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "\n",
    "omnipath.db"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Built-in database definitions\n",
    "\n",
    "The databases presented below are [pre-defined in pypath](https://github.com/saezlab/pypath/blob/master/pypath/omnipath/databases/builtins.json). You can also list them by:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T14:11:32.347480Z",
     "start_time": "2022-12-03T14:11:32.343892Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['omnipath',\n",
       " 'curated',\n",
       " 'complex',\n",
       " 'annotations',\n",
       " 'intercell',\n",
       " 'tf_target',\n",
       " 'dorothea',\n",
       " 'small_molecule',\n",
       " 'tf_mirna',\n",
       " 'mirna_mrna',\n",
       " 'lncrna_mrna',\n",
       " 'enz_sub']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "omnipath.db.datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Networks\n",
    "<a id=\"nw-dbmanager\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "OmniPath offers multiple built-in network datasets: the OmniPath PPI network, the stricter literature curated PPI network, the special ligand-receptor PPI network and various other PPI datasets, the transcriptional regulation network from DoRothEA and other resources, the miRNA post-transcriptional regulation network and also the transcriptional regulation network for miRNAs."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Strictly literature curated network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:17:13.442238Z",
     "start_time": "2022-12-02T13:16:56.607485Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Network: 7980 nodes, 35551 interactions>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "cu = omnipath.db.get_db('curated')\n",
    "cu"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### The OmniPath network with extra activity flow, enzyme-substrate and ligand-receptor interactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:18:55.433036Z",
     "start_time": "2022-12-02T13:17:55.113111Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Network: 18558 nodes, 94358 interactions>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "op = omnipath.db.get_db('omnipath')\n",
    "op"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Transcriptional regulation network from DoRothEA and other resources"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note: according to the default settings, DoRothEA confidence levels A-D and all original resources will be loaded. To load only DoRothEA, use the key `\"dorothea\"` instead of `\"tf_target\"`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:21:54.401697Z",
     "start_time": "2022-12-02T13:19:41.686583Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Network: 18986 nodes, 326708 interactions>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "tft = omnipath.db.get_db('tf_target')\n",
    "tft"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Literature curated miRNA post-transcriptional regulation network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:31:55.086766Z",
     "start_time": "2022-12-02T13:31:52.810485Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Network: 1264 nodes, 3288 interactions>"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "mi = omnipath.db.get_db('mirna_mrna')\n",
    "mi"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Transcriptional regulation of miRNA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:32:41.408391Z",
     "start_time": "2022-12-02T13:32:41.403314Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Network: 1032 nodes, 4960 interactions>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "tmi = omnipath.db.get_db('tf_mirna')\n",
    "tmi"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### lncRNA-mRNA interactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:33:03.630975Z",
     "start_time": "2022-12-02T13:33:03.627353Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Network: 243 nodes, 217 interactions>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "lnc = omnipath.db.get_db('lncrna_mrna')\n",
    "lnc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Small molecule-protein interactions\n",
    "\n",
    "These interactions are either ligand-receptor connections, enzyme inhibitions, allosteric regulations or enzyme-metabolite interactions. Currently it is a small, experimental dataset, but will be largely extended in the future."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:57:17.189127Z",
     "start_time": "2022-12-02T13:57:09.252382Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Network: 1980 nodes, 3147 interactions>"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "smol = omnipath.db.get_db('small_molecule')\n",
    "smol"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Enzyme-substrate relationships\n",
    "<a id=\"es-dbmanager\"></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:33:26.176882Z",
     "start_time": "2022-12-02T13:33:20.037879Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Enzyme-substrate database: 41426 relationships>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "es = omnipath.db.get_db('enz_sub')\n",
    "es"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Protein complexes\n",
    "<a id=\"co-dbmanager\"></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:33:31.101976Z",
     "start_time": "2022-12-02T13:33:30.950463Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Complex database: 28173 complexes>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "co = omnipath.db.get_db('complex')\n",
    "co"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Annotations\n",
    "\n",
    "<a id=\"an-dbmanager\"></a>\n",
    "The annotations database is huge, building or even loading it takes long time and requires quite some memory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:36:28.124454Z",
     "start_time": "2022-12-02T13:33:44.524615Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Annotation database: 5490653 records about 50872 entities from 68 resources>"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "an = omnipath.db.get_db('annotations')\n",
    "an"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Inter-cellular communication roles\n",
    "\n",
    "<a id=\"ic-dbmanager\"></a>\n",
    "This database is quick to build, but it requires the *annotations* database, which is really heavy."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:37:12.358451Z",
     "start_time": "2022-12-02T13:36:49.019034Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Intercell annotations: 301527 records about 48570 entities>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "ic = omnipath.db.get_db('intercell')\n",
    "ic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data directly from the original resources\n",
    "\n",
    "The `pypath.inputs` module contains clients for more than 150 molecular biology and biomedical resources, and overall almost 500 functions that download data directly from these resources. Maintaining such a large number of clients is troublesome, hence at any time some of them are broken, you can check them in our [daily status report](https://status.omnipathdb.org/). Each submodule of `pypath.inputs` is named after its corresponding resource, all lowercase, e.g. \"depod\" *(DEPOD)* or \"cytosig\" *(CytoSig)*. Within these modules each function name starts with the name of the resource, and ends with the kind of data it retrieves. For example, `pypath.inputs.signor.signor_interactions` downloads interactions from *SIGNOR.* The labels *\"_interactions\"*, *\"_enz_sub\"*, *\"_complexes\"* and *\"_annotations\"* retrieve records intended to these respective databases. However, the records at this stage are not fully processed yet. Some functions have different postfixes, e.g. *\"_raw\"* means the data is close to the format provided by the resource itself; *\"_mapping\"* means it is intended for a translation table. The purpose of the input functions is to 1) handle the download; 2) read the raw data; 3) extract the relevant parts; 4) do the specific part of processing, i.e. bring the data to a state when it is suitable for the generic database classes for further processing. The outputs of these functions is not standard in any ways, though you may observ repeated patterns. The input functions typically return lists or dictionaries. These are arbitrarily designed towards the aims of selecting the relevant fields and give straightforward, accessible Python data structures for processing within or outside of *pypath*. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We use SIGNOR as an example because this resource provides data for almost all OmniPath databases. The `signor_complexes` function returns a set of `pypath.internals.intera.Complex` objects, ready to be added to the OmniPath complexes database (built by `pypath.core.complex.ComplexAggregator`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T15:24:43.157542Z",
     "start_time": "2022-12-03T15:24:43.144593Z"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'COMPLEX:P23511_P25208_Q13952': Complex NFY: COMPLEX:P23511_P25208_Q13952,\n",
       " 'COMPLEX:P68104_P85299_Q6R327_Q8TB45_Q9BVC4': Complex mTORC2: COMPLEX:P68104_P85299_Q6R327_Q8TB45_Q9BVC4,\n",
       " 'COMPLEX:P42345_Q8N122_Q8TB45_Q96B36_Q9BVC4': Complex mTORC1: COMPLEX:P42345_Q8N122_Q8TB45_Q96B36_Q9BVC4,\n",
       " 'COMPLEX:P63208_Q13616_Q9Y297': Complex SCF-betaTRCP: COMPLEX:P63208_Q13616_Q9Y297,\n",
       " 'COMPLEX:Q09472_Q92793': Complex CBP/p300: COMPLEX:Q09472_Q92793,\n",
       " 'COMPLEX:Q09472_Q92793_Q92831': Complex P300/PCAF: COMPLEX:Q09472_Q92793_Q92831,\n",
       " 'COMPLEX:Q13485_Q15796': Complex SMAD2/SMAD4: COMPLEX:Q13485_Q15796,\n",
       " 'COMPLEX:P84022_Q13485': Complex SMAD3/SMAD4: COMPLEX:P84022_Q13485,\n",
       " 'COMPLEX:P05412_Q13485': Complex SMAD4/JUN: COMPLEX:P05412_Q13485,\n",
       " 'COMPLEX:Q15796_Q9HAU4': Complex SMAD2/SMURF2: COMPLEX:Q15796_Q9HAU4,\n",
       " 'COMPLEX:O15105_Q01094_Q13547': Complex SMAD7/HDAC1/E2F-1: COMPLEX:O15105_Q01094_Q13547,\n",
       " 'COMPLEX:P19838_Q04206': Complex NfKb-p65/p50: COMPLEX:P19838_Q04206,\n",
       " 'COMPLEX:O14920_O15111': Complex IK"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 17699 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from pypath.inputs import signor\n",
    "signor.signor_complexes()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `signor_interactions` function returns a list of arbitrary tuples that represent the most important properties of SIGNOR interaction records in a human readable way, and ready to be processed by the `pypath.core.network.Network` object."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T14:11:52.195363Z",
     "start_time": "2022-12-03T14:11:51.895162Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[SignorInteraction(source='O15530', target='O15530', source_isoform=None, target_isoform=None, source_type='protein', target_type='protein', effect='unknown', mechanism='phosphorylation', ncbi_tax_id='9606', pubmeds='10455013', direct=True, ptm_type='phosphorylation', ptm_residue='Ser396', ptm_motif='SSSSSSHsLSASDTG'),\n",
       " SignorInteraction(source='Q9NQ66', target='CHEBI:18035', source_isoform=None, target_isoform=None, source_type='protein', target_type='smallmolecule', effect='up-regulates quantity', mechanism='', ncbi_tax_id='-1', pubmeds='23880553', direct=True, ptm_type='', ptm_residue='Small molecule catalysis', ptm_motif=''),\n",
       " SignorInteraction(source='P62136', target='O15169', source_isoform=None, target_isoform=None, source_type='protein', target_type='protein', effect='down-regulates activity', mechanism='dephosphorylation', ncbi_tax_id='9606', pubmeds='17318175', direct=True, ptm_type='dephosphorylation', ptm_residue='Ser77', ptm_motif='YEPEGSAsPTPPYLK'),\n",
       " SignorInteraction(sou"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 3285 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "signor.signor_interactions()[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note, the records above contain also enzyme-PTM data, hence the `signor.signor_enzyme_substrate` function only converts them to an intermediate format to make it easier to process for `pypath.core.enz_sub.EnzymeSubstrateAggregator`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:58:20.642926Z",
     "start_time": "2022-12-02T13:58:20.324789Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'typ': 'phosphorylation',\n",
       "  'resnum': 396,\n",
       "  'instance': 'SSSSSSHSLSASDTG',\n",
       "  'substrate': 'O15530',\n",
       "  'start': 389,\n",
       "  'end': 403,\n",
       "  'kinase': 'O15530',\n",
       "  'resaa': 'S',\n",
       "  'motif': 'SSSSSSHSLSASDTG',\n",
       "  'enzyme_isoform': None,\n",
       "  'substrate_isoform': None,\n",
       "  'references': {'10455013'}},\n",
       " {'typ': 'dephosphorylation',\n",
       "  'resnum': 77,\n",
       "  'instance': 'YEPEGSASPTPPYLK',\n",
       "  'substrate': 'O15169',\n",
       "  'start': 70,\n",
       "  'end': 84,\n",
       "  'kinase': 'P62136',\n",
       "  'resaa': 'S',\n",
       "  'motif': 'YEPEGSASPTPPYLK',\n",
       "  'enzyme_isoform': None,\n",
       "  'substrate_isoform': None,\n",
       "  'references': {'17318175'}}]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "signor.signor_enzyme_substrate()[:2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, SIGNOR also assigns proteins to pathways. This information is intended for the OmniPath annotations database, and retrieved by the `signor.signor_pathway_annotations` function. This function returns a dict of sets which is typical for *_annotation* functions. This format requires practically no further processing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:58:28.574530Z",
     "start_time": "2022-12-02T13:58:27.092583Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{SignorPathway(pathway='TNF alpha'),\n",
       " SignorPathway(pathway='Toll like receptors')}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "signor.signor_pathway_annotations()['O14733']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We haven't mention all functions in the `inputs.signor` module. The rest of the functions retrieve additional information needed by the four functions above, and are of limited direct use for users. For example, `signor_protein_families` returns a dict with the internal ID and members of protein families; this data is used to process the interactions and complexes, but not too interesting on its own."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:58:53.069909Z",
     "start_time": "2022-12-02T13:58:53.064319Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Q9HBW0', 'Q92633', 'Q9UBY5']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "signor.signor_protein_families()['SIGNOR-PF2']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Interesting resources\n",
    "\n",
    "Here we showcase a few potentially useful features in `pypath.inputs`.\n",
    "\n",
    "### RaMP\n",
    "\n",
    "[RaMP](https://rampdb.nih.gov/) is a human metabolite and metabolic network database providing ID translation, annotations and enzymatic reactions of metabolites. Let's take a closer look first at the full database contents. It is available as a MySQL database, below we list the tables and their column names:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-09T16:51:14.317326Z",
     "start_time": "2023-03-09T16:51:12.118178Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'analyte': ['rampId', 'type'],\n",
       " 'analytehasontology': ['rampCompoundId', 'rampOntologyId'],\n",
       " 'analytehaspathway': ['rampId', 'pathwayRampId', 'pathwaySource'],\n",
       " 'analytesynonym': ['Synonym', 'rampId', 'geneOrCompound', 'source'],\n",
       " 'catalyzed': ['rampCompoundId', 'rampGeneId'],\n",
       " 'chem_props': ['ramp_id',\n",
       "  'chem_data_source',\n",
       "  'chem_source_id',\n",
       "  'iso_smiles',\n",
       "  'inchi_key_prefix',\n",
       "  'inchi_key',\n",
       "  'inchi',\n",
       "  'mw',\n",
       "  'monoisotop_mass',\n",
       "  'common_name',\n",
       "  'mol_formula'],\n",
       " 'db_version': ['ramp_version',\n",
       "  'load_timestamp',\n",
       "  'version_notes',\n",
       "  'met_intersects_json',\n",
       "  'gene_intersects_json',\n",
       "  'met_intersects_json_pw_mapped',\n",
       "  'gene_intersects_json_pw_mapped',\n",
       "  'db_sql_url'],\n",
       " 'entity_status_info': ['status_category',\n",
       "  'entity_source_id',\n",
       "  'entity_source_name',\n",
       "  'entity_count'],\n",
       " 'metabolite_class': ['ramp_id',\n",
       "  'class_source_id',\n",
       "  'class_level_name',\n",
       "  'class_name',\n",
       "  'source'],\n",
       " 'ontology': ['rampOntologyId', 'commonName', 'HMDBOntologyType', 'metCount'],\n",
       " 'pathway': ['pathwayR"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 1368 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from pypath.inputs import ramp\n",
    "ramp.ramp_list_tables()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using the `ramp_raw` function, we can access these tables either as Python dicts, or `pandas.DataFrame`s, or loaded into an `SQLite` database. For further inspection, the data frames are the most convenient. Most of the ID translation data is contained in the `source` table:\n",
    "\n",
    "<div class=\"alert alert-block alert-success\"><b>Note:</b> At the very first time, retrieving these tables takes quite some time, not only due to the large download, but also a performance bottleneck when processing the MySQL dumps. Thanks to caching, loading the tables subsequently happens much faster.</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-09T16:54:17.752634Z",
     "start_time": "2023-03-09T16:54:13.502024Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sourceId</th>\n",
       "      <th>rampId</th>\n",
       "      <th>IDtype</th>\n",
       "      <th>geneOrCompound</th>\n",
       "      <th>commonName</th>\n",
       "      <th>priorityHMDBStatus</th>\n",
       "      <th>dataSource</th>\n",
       "      <th>pathwayCount</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>hmdb:HMDB0000001</td>\n",
       "      <td>RAMP_C_000000001</td>\n",
       "      <td>hmdb</td>\n",
       "      <td>compound</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>quantified</td>\n",
       "      <td>hmdb</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>hmdb:HMDB0000479</td>\n",
       "      <td>RAMP_C_000000001</td>\n",
       "      <td>hmdb</td>\n",
       "      <td>compound</td>\n",
       "      <td>3-Methylhistidine</td>\n",
       "      <td>quantified</td>\n",
       "      <td>hmdb</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>chebi:50599</td>\n",
       "      <td>RAMP_C_000000001</td>\n",
       "      <td>chebi</td>\n",
       "      <td>compound</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>quantified</td>\n",
       "      <td>hmdb</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>chemspider:83153</td>\n",
       "      <td>RAMP_C_000000001</td>\n",
       "      <td>chemspider</td>\n",
       "      <td>compound</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>quantified</td>\n",
       "      <td>hmdb</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>kegg:C01152</td>\n",
       "      <td>RAMP_C_000000001</td>\n",
       "      <td>kegg</td>\n",
       "      <td>compound</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>quantified</td>\n",
       "      <td>hmdb_kegg</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>756552</th>\n",
       "      <td>uniprot:H0YDB7</td>\n",
       "      <td>RAMP_G_000009307</td>\n",
       "      <td>uniprot</td>\n",
       "      <td>gene</td>\n",
       "      <td>RAB38</td>\n",
       "      <td>NULL</td>\n",
       "      <td>wiki</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>756553</th>\n",
       "      <td>uniprot:A0A024R191</td>\n",
       "      <td>RAMP_G_000009307</td>\n",
       "      <td>uniprot</td>\n",
       "      <td>gene</td>\n",
       "      <td>RAB38</td>\n",
       "      <td>NULL</td>\n",
       "      <td>wiki</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>756554</th>\n",
       "      <td>uniprot:H0YEA4</td>\n",
       "      <td>RAMP_G_000009307</td>\n",
       "      <td>uniprot</td>\n",
       "      <td>gene</td>\n",
       "      <td>RAB38</td>\n",
       "      <td>NULL</td>\n",
       "      <td>wiki</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>756555</th>\n",
       "      <td>entrez:23682</td>\n",
       "      <td>RAMP_G_000009307</td>\n",
       "      <td>entrez</td>\n",
       "      <td>gene</td>\n",
       "      <td>RAB38</td>\n",
       "      <td>NULL</td>\n",
       "      <td>wiki</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>756556</th>\n",
       "      <td>gene_symbol:RAB38</td>\n",
       "      <td>RAMP_G_000009307</td>\n",
       "      <td>gene_symbol</td>\n",
       "      <td>gene</td>\n",
       "      <td>RAB38</td>\n",
       "      <td>NULL</td>\n",
       "      <td>wiki</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>756557 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  sourceId            rampId       IDtype geneOrCompound  \\\n",
       "0         hmdb:HMDB0000001  RAMP_C_000000001         hmdb       compound   \n",
       "1         hmdb:HMDB0000479  RAMP_C_000000001         hmdb       compound   \n",
       "2              chebi:50599  RAMP_C_000000001        chebi       compound   \n",
       "3         chemspider:83153  RAMP_C_000000001   chemspider       compound   \n",
       "4              kegg:C01152  RAMP_C_000000001         kegg       compound   \n",
       "...                    ...               ...          ...            ...   \n",
       "756552      uniprot:H0YDB7  RAMP_G_000009307      uniprot           gene   \n",
       "756553  uniprot:A0A024R191  RAMP_G_000009307      uniprot           gene   \n",
       "756554      uniprot:H0YEA4  RAMP_G_000009307      uniprot           gene   \n",
       "756555        entrez:23682  RAMP_G_000009307       entrez           gene   \n",
       "756556   gene_symbol:RAB38  RAMP_G_000009307  gene_symbol           gene   \n",
       "\n",
       "               commonName priorityHMDBStatus dataSource pathwayCount  \n",
       "0       1-Methylhistidine         quantified       hmdb            2  \n",
       "1       3-Methylhistidine         quantified       hmdb            2  \n",
       "2       1-Methylhistidine         quantified       hmdb            2  \n",
       "3       1-Methylhistidine         quantified       hmdb            2  \n",
       "4       1-Methylhistidine         quantified  hmdb_kegg            2  \n",
       "...                   ...                ...        ...          ...  \n",
       "756552              RAB38               NULL       wiki           10  \n",
       "756553              RAB38               NULL       wiki           10  \n",
       "756554              RAB38               NULL       wiki           10  \n",
       "756555              RAB38               NULL       wiki           10  \n",
       "756556              RAB38               NULL       wiki           10  \n",
       "\n",
       "[756557 rows x 8 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tables = ramp.ramp_raw(['analytesynonym', 'chem_props', 'source'])\n",
    "tables['source']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Structural and physicochemical info is available in the `chem_props` table:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-09T17:00:46.429206Z",
     "start_time": "2023-03-09T17:00:46.414892Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ramp_id</th>\n",
       "      <th>chem_data_source</th>\n",
       "      <th>chem_source_id</th>\n",
       "      <th>iso_smiles</th>\n",
       "      <th>inchi_key_prefix</th>\n",
       "      <th>inchi_key</th>\n",
       "      <th>inchi</th>\n",
       "      <th>mw</th>\n",
       "      <th>monoisotop_mass</th>\n",
       "      <th>common_name</th>\n",
       "      <th>mol_formula</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>RAMP_C_000000001</td>\n",
       "      <td>hmdb</td>\n",
       "      <td>hmdb:HMDB0000001</td>\n",
       "      <td>[H]OC(=O)[C@@]([H])(N([H])[H])C([H])([H])C1=C(...</td>\n",
       "      <td>BRMWTNUJHUMWMS</td>\n",
       "      <td>BRMWTNUJHUMWMS-LURJTMIESA-N</td>\n",
       "      <td>InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11...</td>\n",
       "      <td>169.181</td>\n",
       "      <td>169.085</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>C7H11N3O2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>RAMP_C_000000001</td>\n",
       "      <td>hmdb</td>\n",
       "      <td>hmdb:HMDB0000479</td>\n",
       "      <td>[H][C@](N)(CC1=CN=CN1C)C(O)=O</td>\n",
       "      <td>JDHILDINMRGULE</td>\n",
       "      <td>JDHILDINMRGULE-LURJTMIESA-N</td>\n",
       "      <td>InChI=1S/C7H11N3O2/c1-10-4-9-3-5(10)2-6(8)7(11...</td>\n",
       "      <td>169.181</td>\n",
       "      <td>169.085</td>\n",
       "      <td>3-Methylhistidine</td>\n",
       "      <td>C7H11N3O2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>RAMP_C_000000001</td>\n",
       "      <td>chebi</td>\n",
       "      <td>chebi:27596</td>\n",
       "      <td>Cn1cncc1C[C@H](N)C(O)=O</td>\n",
       "      <td>JDHILDINMRGULE</td>\n",
       "      <td>JDHILDINMRGULE-LURJTMIESA-N</td>\n",
       "      <td>InChI=1S/C7H11N3O2/c1-10-4-9-3-5(10)2-6(8)7(11...</td>\n",
       "      <td>NULL</td>\n",
       "      <td>169.085</td>\n",
       "      <td>N(pros)-methyl-L-histidine</td>\n",
       "      <td>C7H11N3O2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>RAMP_C_000000001</td>\n",
       "      <td>chebi</td>\n",
       "      <td>chebi:50599</td>\n",
       "      <td>Cn1cnc(C[C@H](N)C(O)=O)c1</td>\n",
       "      <td>BRMWTNUJHUMWMS</td>\n",
       "      <td>BRMWTNUJHUMWMS-LURJTMIESA-N</td>\n",
       "      <td>InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11...</td>\n",
       "      <td>NULL</td>\n",
       "      <td>169.085</td>\n",
       "      <td>N(tele)-methyl-L-histidine</td>\n",
       "      <td>C7H11N3O2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>RAMP_C_000000002</td>\n",
       "      <td>hmdb</td>\n",
       "      <td>hmdb:HMDB0000002</td>\n",
       "      <td>NCCCN</td>\n",
       "      <td>XFNJVJPLKCPIBV</td>\n",
       "      <td>XFNJVJPLKCPIBV-UHFFFAOYSA-N</td>\n",
       "      <td>InChI=1S/C3H10N2/c4-2-1-3-5/h1-5H2</td>\n",
       "      <td>74.1249</td>\n",
       "      <td>74.0844</td>\n",
       "      <td>1,3-Diaminopropane</td>\n",
       "      <td>C3H10N2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>275898</th>\n",
       "      <td>RAMP_C_000258279</td>\n",
       "      <td>lipidmaps</td>\n",
       "      <td>LIPIDMAPS:LMPK15050003</td>\n",
       "      <td>C1(OC)C(=O)C(C[C@H](OC(C)=O)CCCCCCCCCCCCC)=C(O...</td>\n",
       "      <td>UXLMJHNFDRMGPW</td>\n",
       "      <td>UXLMJHNFDRMGPW-LJQANCHMSA-N</td>\n",
       "      <td>InChI=1S/C24H38O6/c1-4-5-6-7-8-9-10-11-12-13-1...</td>\n",
       "      <td>NULL</td>\n",
       "      <td>422.267</td>\n",
       "      <td>2-hydroxy-5-methoxy-3-(2R-acetoxy-pentadecyl)-...</td>\n",
       "      <td>C24H38O6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>275899</th>\n",
       "      <td>RAMP_C_000258280</td>\n",
       "      <td>lipidmaps</td>\n",
       "      <td>LIPIDMAPS:LMPK15050004</td>\n",
       "      <td>C1(OC)C(=O)C(C[C@H](OC(C)=O)CCCCCCCCCCCCC)=CC(...</td>\n",
       "      <td>CVZNKLNAHBTINT</td>\n",
       "      <td>CVZNKLNAHBTINT-JOCHJYFZSA-N</td>\n",
       "      <td>InChI=1S/C24H38O5/c1-4-5-6-7-8-9-10-11-12-13-1...</td>\n",
       "      <td>NULL</td>\n",
       "      <td>406.272</td>\n",
       "      <td>5-methoxy-3-(2R-acetoxy-pentadecyl)-1,4-benzoq...</td>\n",
       "      <td>C24H38O5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>275900</th>\n",
       "      <td>RAMP_C_000226089</td>\n",
       "      <td>lipidmaps</td>\n",
       "      <td>LIPIDMAPS:LMPK15050005</td>\n",
       "      <td>C1(OC)C(=O)C(C[C@H](OC(C)=O)CCCCCCCCCCC)=CC(=O...</td>\n",
       "      <td>JIUGZSYPFREDLG</td>\n",
       "      <td>JIUGZSYPFREDLG-HXUWFJFHSA-N</td>\n",
       "      <td>InChI=1S/C22H34O5/c1-4-5-6-7-8-9-10-11-12-13-2...</td>\n",
       "      <td>NULL</td>\n",
       "      <td>378.241</td>\n",
       "      <td>5-methoxy-3-(2R-acetoxy-tridecyl)-1,4-benzoqui...</td>\n",
       "      <td>C22H34O5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>275901</th>\n",
       "      <td>RAMP_C_000258283</td>\n",
       "      <td>lipidmaps</td>\n",
       "      <td>LIPIDMAPS:LMPK15050008</td>\n",
       "      <td>C1(O)C(=O)C(CCCCCCCCCCCCCCC)=C(O)C(=O)C=1</td>\n",
       "      <td>GXDURRGUXLDZKN</td>\n",
       "      <td>GXDURRGUXLDZKN-UHFFFAOYSA-N</td>\n",
       "      <td>InChI=1S/C21H34O4/c1-2-3-4-5-6-7-8-9-10-11-12-...</td>\n",
       "      <td>NULL</td>\n",
       "      <td>350.246</td>\n",
       "      <td>Suberonone</td>\n",
       "      <td>C21H34O4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>275902</th>\n",
       "      <td>RAMP_C_000258284</td>\n",
       "      <td>lipidmaps</td>\n",
       "      <td>LIPIDMAPS:LMPK15050009</td>\n",
       "      <td>C1(O)C(=O)C(CCCCCCCCCCCCC)=C(O)C(=O)C=1</td>\n",
       "      <td>AMKNOBHCKRZHIO</td>\n",
       "      <td>AMKNOBHCKRZHIO-UHFFFAOYSA-N</td>\n",
       "      <td>InChI=1S/C19H30O4/c1-2-3-4-5-6-7-8-9-10-11-12-...</td>\n",
       "      <td>NULL</td>\n",
       "      <td>322.214</td>\n",
       "      <td>Rapanone</td>\n",
       "      <td>C19H30O4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>275903 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 ramp_id chem_data_source          chem_source_id  \\\n",
       "0       RAMP_C_000000001             hmdb        hmdb:HMDB0000001   \n",
       "1       RAMP_C_000000001             hmdb        hmdb:HMDB0000479   \n",
       "2       RAMP_C_000000001            chebi             chebi:27596   \n",
       "3       RAMP_C_000000001            chebi             chebi:50599   \n",
       "4       RAMP_C_000000002             hmdb        hmdb:HMDB0000002   \n",
       "...                  ...              ...                     ...   \n",
       "275898  RAMP_C_000258279        lipidmaps  LIPIDMAPS:LMPK15050003   \n",
       "275899  RAMP_C_000258280        lipidmaps  LIPIDMAPS:LMPK15050004   \n",
       "275900  RAMP_C_000226089        lipidmaps  LIPIDMAPS:LMPK15050005   \n",
       "275901  RAMP_C_000258283        lipidmaps  LIPIDMAPS:LMPK15050008   \n",
       "275902  RAMP_C_000258284        lipidmaps  LIPIDMAPS:LMPK15050009   \n",
       "\n",
       "                                               iso_smiles inchi_key_prefix  \\\n",
       "0       [H]OC(=O)[C@@]([H])(N([H])[H])C([H])([H])C1=C(...   BRMWTNUJHUMWMS   \n",
       "1                           [H][C@](N)(CC1=CN=CN1C)C(O)=O   JDHILDINMRGULE   \n",
       "2                                 Cn1cncc1C[C@H](N)C(O)=O   JDHILDINMRGULE   \n",
       "3                               Cn1cnc(C[C@H](N)C(O)=O)c1   BRMWTNUJHUMWMS   \n",
       "4                                                   NCCCN   XFNJVJPLKCPIBV   \n",
       "...                                                   ...              ...   \n",
       "275898  C1(OC)C(=O)C(C[C@H](OC(C)=O)CCCCCCCCCCCCC)=C(O...   UXLMJHNFDRMGPW   \n",
       "275899  C1(OC)C(=O)C(C[C@H](OC(C)=O)CCCCCCCCCCCCC)=CC(...   CVZNKLNAHBTINT   \n",
       "275900  C1(OC)C(=O)C(C[C@H](OC(C)=O)CCCCCCCCCCC)=CC(=O...   JIUGZSYPFREDLG   \n",
       "275901          C1(O)C(=O)C(CCCCCCCCCCCCCCC)=C(O)C(=O)C=1   GXDURRGUXLDZKN   \n",
       "275902            C1(O)C(=O)C(CCCCCCCCCCCCC)=C(O)C(=O)C=1   AMKNOBHCKRZHIO   \n",
       "\n",
       "                          inchi_key  \\\n",
       "0       BRMWTNUJHUMWMS-LURJTMIESA-N   \n",
       "1       JDHILDINMRGULE-LURJTMIESA-N   \n",
       "2       JDHILDINMRGULE-LURJTMIESA-N   \n",
       "3       BRMWTNUJHUMWMS-LURJTMIESA-N   \n",
       "4       XFNJVJPLKCPIBV-UHFFFAOYSA-N   \n",
       "...                             ...   \n",
       "275898  UXLMJHNFDRMGPW-LJQANCHMSA-N   \n",
       "275899  CVZNKLNAHBTINT-JOCHJYFZSA-N   \n",
       "275900  JIUGZSYPFREDLG-HXUWFJFHSA-N   \n",
       "275901  GXDURRGUXLDZKN-UHFFFAOYSA-N   \n",
       "275902  AMKNOBHCKRZHIO-UHFFFAOYSA-N   \n",
       "\n",
       "                                                    inchi       mw  \\\n",
       "0       InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11...  169.181   \n",
       "1       InChI=1S/C7H11N3O2/c1-10-4-9-3-5(10)2-6(8)7(11...  169.181   \n",
       "2       InChI=1S/C7H11N3O2/c1-10-4-9-3-5(10)2-6(8)7(11...     NULL   \n",
       "3       InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11...     NULL   \n",
       "4                      InChI=1S/C3H10N2/c4-2-1-3-5/h1-5H2  74.1249   \n",
       "...                                                   ...      ...   \n",
       "275898  InChI=1S/C24H38O6/c1-4-5-6-7-8-9-10-11-12-13-1...     NULL   \n",
       "275899  InChI=1S/C24H38O5/c1-4-5-6-7-8-9-10-11-12-13-1...     NULL   \n",
       "275900  InChI=1S/C22H34O5/c1-4-5-6-7-8-9-10-11-12-13-2...     NULL   \n",
       "275901  InChI=1S/C21H34O4/c1-2-3-4-5-6-7-8-9-10-11-12-...     NULL   \n",
       "275902  InChI=1S/C19H30O4/c1-2-3-4-5-6-7-8-9-10-11-12-...     NULL   \n",
       "\n",
       "       monoisotop_mass                                        common_name  \\\n",
       "0              169.085                                  1-Methylhistidine   \n",
       "1              169.085                                  3-Methylhistidine   \n",
       "2              169.085                         N(pros)-methyl-L-histidine   \n",
       "3              169.085                         N(tele)-methyl-L-histidine   \n",
       "4              74.0844                                 1,3-Diaminopropane   \n",
       "...                ...                                                ...   \n",
       "275898         422.267  2-hydroxy-5-methoxy-3-(2R-acetoxy-pentadecyl)-...   \n",
       "275899         406.272  5-methoxy-3-(2R-acetoxy-pentadecyl)-1,4-benzoq...   \n",
       "275900         378.241  5-methoxy-3-(2R-acetoxy-tridecyl)-1,4-benzoqui...   \n",
       "275901         350.246                                         Suberonone   \n",
       "275902         322.214                                           Rapanone   \n",
       "\n",
       "       mol_formula  \n",
       "0        C7H11N3O2  \n",
       "1        C7H11N3O2  \n",
       "2        C7H11N3O2  \n",
       "3        C7H11N3O2  \n",
       "4          C3H10N2  \n",
       "...            ...  \n",
       "275898    C24H38O6  \n",
       "275899    C24H38O5  \n",
       "275900    C22H34O5  \n",
       "275901    C21H34O4  \n",
       "275902    C19H30O4  \n",
       "\n",
       "[275903 rows x 11 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tables['chem_props']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Raw RaMP data can also be accessed as an SQLite database. The advantage here is the high performance and flexibility of operations. Conversion to `pandas` and back is easy, so you can always get the result as a data frame. Below, `con` is a database connection ready to execute your queries. It is an in-memory database; alternatively, an on-disk database can be used. We use `pypath.formats.sqlite` to look into the SQLite database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-09T17:07:00.293203Z",
     "start_time": "2023-03-09T17:06:49.731521Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<sqlite3.Connection at 0x6fa1e9e4e940>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "con = ramp.ramp_raw(['source', 'chem_props', 'analytesynonym'], sqlite = True)\n",
    "con"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now that we have loaded these 3 big tables both as data frames and as SQLite tables, let's see how much memory they use (normally half of this is enough, and the tables should stay in memory only for short periods):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-09T17:07:44.743103Z",
     "start_time": "2023-03-09T17:07:44.738000Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'3.7 GB'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.share import common\n",
    "common.format_bytes(common.python_memory_usage())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Looking into the database, we see the 3 tables loaded, and their column names:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-09T17:13:01.939478Z",
     "start_time": "2023-03-09T17:13:01.933955Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'source': ['sourceId',\n",
       "  'rampId',\n",
       "  'IDtype',\n",
       "  'geneOrCompound',\n",
       "  'commonName',\n",
       "  'priorityHMDBStatus',\n",
       "  'dataSource',\n",
       "  'pathwayCount'],\n",
       " 'analytesynonym': ['Synonym', 'rampId', 'geneOrCompound', 'source'],\n",
       " 'chem_props': ['ramp_id',\n",
       "  'chem_data_source',\n",
       "  'chem_source_id',\n",
       "  'iso_smiles',\n",
       "  'inchi_key_prefix',\n",
       "  'inchi_key',\n",
       "  'inchi',\n",
       "  'mw',\n",
       "  'monoisotop_mass',\n",
       "  'common_name',\n",
       "  'mol_formula']}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.formats import sqlite\n",
    "sqlite.list_columns(con)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's see how to execute an SQL query and fetch the output into a data frame. This query takes the `source` table, selects the records with HMDB and ChEBI IDs in two subqueries, and joins the two by `rampId`, in order to obtain a `HMDB ←→ ChEBI` mapping table:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-09T17:18:37.483396Z",
     "start_time": "2023-03-09T17:18:36.760340Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>hmdb</th>\n",
       "      <th>chebi</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>hmdb:HMDB0000001</td>\n",
       "      <td>chebi:27596</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>hmdb:HMDB0000001</td>\n",
       "      <td>chebi:50599</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>hmdb:HMDB0000479</td>\n",
       "      <td>chebi:27596</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>hmdb:HMDB0000479</td>\n",
       "      <td>chebi:50599</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>hmdb:HMDB00001</td>\n",
       "      <td>chebi:27596</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104129</th>\n",
       "      <td>hmdb:HMDB0126033</td>\n",
       "      <td>chebi:25882</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104130</th>\n",
       "      <td>hmdb:HMDB0141947</td>\n",
       "      <td>chebi:180150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104131</th>\n",
       "      <td>hmdb:HMDB0128505</td>\n",
       "      <td>chebi:7870</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104132</th>\n",
       "      <td>hmdb:HMDB0130984</td>\n",
       "      <td>chebi:8227</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104133</th>\n",
       "      <td>hmdb:HMDB0130987</td>\n",
       "      <td>chebi:8630</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>104134 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    hmdb         chebi\n",
       "0       hmdb:HMDB0000001   chebi:27596\n",
       "1       hmdb:HMDB0000001   chebi:50599\n",
       "2       hmdb:HMDB0000479   chebi:27596\n",
       "3       hmdb:HMDB0000479   chebi:50599\n",
       "4         hmdb:HMDB00001   chebi:27596\n",
       "...                  ...           ...\n",
       "104129  hmdb:HMDB0126033   chebi:25882\n",
       "104130  hmdb:HMDB0141947  chebi:180150\n",
       "104131  hmdb:HMDB0128505    chebi:7870\n",
       "104132  hmdb:HMDB0130984    chebi:8227\n",
       "104133  hmdb:HMDB0130987    chebi:8630\n",
       "\n",
       "[104134 rows x 2 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Build an HMDB <-> ChEBI mapping: select the compound records carrying each\n",
    "# ID type from the `source` table and join the two selections on `rampId`.\n",
    "# SQL string literals are single-quoted: in SQLite, double quotes denote\n",
    "# identifiers and are read as strings only through a legacy fallback.\n",
    "query = '''\n",
    "    SELECT DISTINCT a.sourceId AS hmdb, b.sourceId AS chebi\n",
    "    FROM\n",
    "        (SELECT sourceId, rampId\n",
    "         FROM source\n",
    "         WHERE geneOrCompound = 'compound' AND IDtype = 'hmdb') a\n",
    "    JOIN\n",
    "        (SELECT sourceId, rampId\n",
    "         FROM source\n",
    "         WHERE geneOrCompound = 'compound' AND IDtype = 'chebi') b\n",
    "    ON a.rampId = b.rampId;\n",
    "'''\n",
    "df = pd.read_sql_query(query, con)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Such mapping tables can be easily obtained for any pair of identifier types using the `ramp_mapping` function. Before that, let's see the complete list of supported ID types:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-09T17:23:09.965922Z",
     "start_time": "2023-03-09T17:23:05.520835Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CAS',\n",
       " 'EN',\n",
       " 'LIPIDMAPS',\n",
       " 'brenda',\n",
       " 'chebi',\n",
       " 'chemspider',\n",
       " 'ensembl',\n",
       " 'entrez',\n",
       " 'gene_symbol',\n",
       " 'hmdb',\n",
       " 'kegg',\n",
       " 'kegg_glycan',\n",
       " 'lipidbank',\n",
       " 'ncbiprotein',\n",
       " 'plantfa',\n",
       " 'pubchem',\n",
       " 'swisslipids',\n",
       " 'uniprot',\n",
       " 'wikidata'}"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ramp.ramp_id_types()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-09T17:29:17.447027Z",
     "start_time": "2023-03-09T17:29:12.507465Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'LMFA00000008': {'SLM:000390048'},\n",
       " 'LMFA01010001': {'SLM:000000510'},\n",
       " 'LMFA01010002': {'SLM:000000449'},\n",
       " 'LMFA01010003': {'SLM:000001194'},\n",
       " 'LMFA01010004': {'SLM:000001195'},\n",
       " 'LMFA01010005': {'SLM:000389552'},\n",
       " 'LMFA01010006': {'SLM:000001196'},\n",
       " 'LMFA01010007': {'SLM:000389947'},\n",
       " 'LMFA01010008': {'SLM:000000853'},\n",
       " 'LMFA01010010': {'SLM:000000855'},\n",
       " 'LMFA01010011': {'SLM:000389946'},\n",
       " 'LMFA01010012': {'SLM:000000719'},\n",
       " 'LMFA01010013': {'SLM:000001198'},\n",
       " 'LMFA01010014': {'SLM:000000825'},\n",
       " 'LMFA01010015': {'SLM:000001199'},\n",
       " 'LMFA01010017': {'SLM:000001095'},\n",
       " 'LMFA01010019': {'SLM:000001205'},\n",
       " 'LMFA01010020': {'SLM:000000829'},\n",
       " 'LMFA01010021': {'SLM:000001207'},\n",
       " 'LMFA01010022': {'SLM:000000827'},\n",
       " 'LMFA01010023': {'SLM:000001128'},\n",
       " 'LMFA01010024': {'SLM:000000414'},\n",
       " 'LMFA01010026': {'SLM:000000539'},\n",
       " 'LMFA01010027': {'SLM:000000980'},\n",
       " 'LMFA01010028': {'SLM:000000540'},\n",
       " 'LMFA01010030': {'SLM:000000543'},\n",
       " 'LMFA01010032': {'SLM:000000544'},\n",
       " 'LMFA01010034': {'SLM:00000"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 44684 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ramp.ramp_mapping('LIPIDMAPS', 'swisslipids')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Above we got a dict of sets; alternatively, the mapping can be returned as a data frame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-09T17:30:27.994466Z",
     "start_time": "2023-03-09T17:30:23.362461Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id_type_a</th>\n",
       "      <th>id_type_b</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LMST02030086</td>\n",
       "      <td>SLM:000485328</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>LMST02030087</td>\n",
       "      <td>SLM:000485330</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>LMSP06020013</td>\n",
       "      <td>SLM:000000534</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>LMST02020001</td>\n",
       "      <td>SLM:000001055</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>LMST02020001</td>\n",
       "      <td>SLM:000485315</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35218</th>\n",
       "      <td>LMPR0104010007</td>\n",
       "      <td>SLM:000389242</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35219</th>\n",
       "      <td>LMPR0104030005</td>\n",
       "      <td>SLM:000390232</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35220</th>\n",
       "      <td>LMPR0104030006</td>\n",
       "      <td>SLM:000390227</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35221</th>\n",
       "      <td>LMPR01070626</td>\n",
       "      <td>SLM:000000432</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35222</th>\n",
       "      <td>LMPR01090015</td>\n",
       "      <td>SLM:000389419</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>35223 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            id_type_a      id_type_b\n",
       "0        LMST02030086  SLM:000485328\n",
       "1        LMST02030087  SLM:000485330\n",
       "2        LMSP06020013  SLM:000000534\n",
       "3        LMST02020001  SLM:000001055\n",
       "4        LMST02020001  SLM:000485315\n",
       "...               ...            ...\n",
       "35218  LMPR0104010007  SLM:000389242\n",
       "35219  LMPR0104030005  SLM:000390232\n",
       "35220  LMPR0104030006  SLM:000390227\n",
       "35221    LMPR01070626  SLM:000000432\n",
       "35222    LMPR01090015  SLM:000389419\n",
       "\n",
       "[35223 rows x 2 columns]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ramp.ramp_mapping('LIPIDMAPS', 'swisslipids', return_df = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "RaMP ID translation is also integrated into the higher level APIs in `pypath.utils.mapping`. Below, we first look into the available ID types and translation tables:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-09T17:38:25.801208Z",
     "start_time": "2023-03-09T17:38:25.794435Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{IdType(pypath='CAS', original='CAS'),\n",
       " IdType(pypath='LIPIDMAPS', original='LIPIDMAPS'),\n",
       " IdType(pypath='MedChemExpress', original='MedChemExpress'),\n",
       " IdType(pypath='actor', original='actor'),\n",
       " IdType(pypath='affy', original='affy'),\n",
       " IdType(pypath='affymetrix', original='affymetrix'),\n",
       " IdType(pypath='agilent', original='agilent'),\n",
       " IdType(pypath='alzforum', original='Alzforum_mut'),\n",
       " IdType(pypath='araport', original='Araport'),\n",
       " IdType(pypath='atlas', original='atlas'),\n",
       " IdType(pypath='bindingdb', original='bindingdb'),\n",
       " IdType(pypath='brenda', original='brenda'),\n",
       " IdType(pypath='carotenoiddb', original='carotenoiddb'),\n",
       " IdType(pypath='cas', original='CAS'),\n",
       " IdType(pypath='cas_id', original='CAS'),\n",
       " IdType(pypath='cgnc', original='CGNC'),\n",
       " IdType(pypath='chebi', original='chebi'),\n",
       " IdType(pypath='chembl', original='chembl'),\n",
       " IdType(pypath='chemicalbook', original='chemicalbook'),\n",
       " IdType(pypath='chemspider', original='chemspider'),\n",
       " IdType(pypath='clinicaltrials', original='clinic"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 7422 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from pypath.utils import mapping\n",
    "m = mapping.get_mapper()\n",
    "m.id_types()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "These ID types come not only from RaMP, but from all the supported resources. In the mapping table definitions, `id_type_b` is always `None`, because translation between any two ID types is supported:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-09T17:46:56.952034Z",
     "start_time": "2023-03-09T17:46:56.946851Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[MappingTableDefinition(id_type_a='kegg_glycan', id_type_b=None, resource='ramp', input_class='RampMapping', resource_id_type_a='kegg_glycan', resource_id_type_b=None),\n",
       " MappingTableDefinition(id_type_a='hmdb', id_type_b=None, resource='ramp', input_class='RampMapping', resource_id_type_a='hmdb', resource_id_type_b=None),\n",
       " MappingTableDefinition(id_type_a='wikidata', id_type_b=None, resource='ramp', input_class='RampMapping', resource_id_type_a='wikidata', resource_id_type_b=None),\n",
       " MappingTableDefinition(id_type_a='LIPIDMAPS', id_type_b=None, resource='ramp', input_class='RampMapping', resource_id_type_a='LIPIDMAPS', resource_id_type_b=None),\n",
       " MappingTableDefinition(id_type_a='kegg', id_type_b=None, resource='ramp', input_class='RampMapping', resource_id_type_a='kegg', resource_id_type_b=None),\n",
       " MappingTableDefinition(id_type_a='CAS', id_type_b=None, resource='ramp', input_class='RampMapping', resource_id_type_a='CAS', resource_id_type_b=None),\n",
       " MappingTableDefinition(id_type_a='chebi"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 3238 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "[t for t in m.mapping_tables() if t.resource == 'ramp']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### TL;DR\n",
    "\n",
    "Up to this point, this section has been about extra insights; what 99% of users will actually do looks like this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-09T17:53:38.365570Z",
     "start_time": "2023-03-09T17:53:38.194183Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'HMDB0094709'}"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.utils import mapping\n",
    "mapping.map_name('131431', 'chebi', 'hmdb')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### HMDB (Human Metabolome Database)\n",
    "\n",
    "#### Direct access to HMDB data\n",
    "\n",
    "The `inputs.hmdb` module processes metabolite and protein data using `lxml.etree` and some minimal utilities from `formats.xml`. The metabolite or protein records are available as `lxml.etree.Element` objects, or custom fields can be extracted into dicts or into data frames. To iterate through the XML elements, each representing a metabolite:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:23:11.890142Z",
     "start_time": "2023-04-24T12:23:11.011505Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Element {http://www.hmdb.ca}metabolite at 0x60b1846262c0>"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.inputs import hmdb\n",
    "next(hmdb.iter_metabolites())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "On the `Element` objects you can directly use `lxml.etree`'s methods to extract information. An easier and more flexible way to extract information from these XML records is to define a schema with instructions for `lxml`. A full schema for HMDB metabolites is available in `hmdb.METABOLITES_SCHEMA`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:24:03.684102Z",
     "start_time": "2023-04-24T12:24:03.676800Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'taxonomy': ('taxonomy',\n",
       "  {'description': ('description', None),\n",
       "   'direct_parent': ('direct_parent', None),\n",
       "   'kingdom': ('kingdom', None),\n",
       "   'class': ('class', None),\n",
       "   'sub_class': ('sub_class', None),\n",
       "   'molecular_framework': ('molecular_framework', None),\n",
       "   'alternative_parents': ('alternative_parents',\n",
       "    ('alternative_parent', 'findall'),\n",
       "    None),\n",
       "   'substituents': ('substituents', ('substituent', 'findall'), None)}),\n",
       " 'spectra': ('spectra', ('spectrum', 'findall'), {'spectrum_id', 'type'}),\n",
       " 'biological_properties': ('biological_properties',\n",
       "  {'cellular_locations': ('cellular_locations', ('cellular', 'findall'), None),\n",
       "   'biospecimen_locations': ('biospecimen_locations',\n",
       "    ('biospecimen', 'findall'),\n",
       "    None),\n",
       "   'tissue_locations': ('tissue_locations', ('tissue', 'findall'), None),\n",
       "   'pathways': ('pathways',\n",
       "    ('pathway', 'findall'),\n",
       "    {'kegg_map_id', 'name', 'smpdb_id'})}),\n",
       " 'experimental_properties': ('experimental_properties',\n",
       "  ('property', 'findall')"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 4037 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "hmdb.METABOLITES_SCHEMA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The schema for proteins is different:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:24:52.393898Z",
     "start_time": "2023-04-24T12:24:52.386469Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'gene_properties': ('gene_properties',\n",
       "  {'chromosome_location': ('chromosome_location', None),\n",
       "   'locus': ('locus', None),\n",
       "   'gene_sequence': ('gene_sequence', None)}),\n",
       " 'protein_properties': ('protein_properties',\n",
       "  {'residue_number': ('residue_number', None),\n",
       "   'molecular_weight': ('molecular_weight', None),\n",
       "   'theoretical_pi': ('theoretical_pi', None),\n",
       "   'polypeptide_sequence': ('polypeptide_sequence', None),\n",
       "   'transmembrane_regions': ('transmembrane_regions',\n",
       "    ('region', 'findall'),\n",
       "    None),\n",
       "   'signal_regions': ('signal_regions', ('region', 'findall'), None)}),\n",
       " 'pfams': ('pfams', ('pfam', 'findall'), {'name', 'pfam_id'}),\n",
       " 'metabolite_associations': ('metabolite_associations',\n",
       "  ('metabolite', 'findall'),\n",
       "  {'accession', 'name'}),\n",
       " 'go_classifications': ('go_classifications',\n",
       "  ('go_class', 'findall'),\n",
       "  {'category', 'description', 'go_id'}),\n",
       " 'pathways': ('pathways',\n",
       "  ('pathway', 'findall'),\n",
       "  {'kegg_map_id', 'name', 'smpdb_id'}),\n",
       " 'general_references': ('general_"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 2072 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "hmdb.PROTEINS_SCHEMA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "By default, the full schema is used by `hmdb.metabolites_raw` and `hmdb.proteins_raw`, but you can pass a smaller dict with only your fields of interest, which greatly reduces the processing time. Using the `head` argument, we peek into the first N records of the data:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:25:31.232024Z",
     "start_time": "2023-04-24T12:25:31.085690Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'taxonomy': {'description': ' belongs to the class of organic compounds known as histidine and derivatives. Histidine and derivatives are compounds containing cysteine or a derivative thereof resulting from reaction of cysteine at the amino group or the carboxy group, or from the replacement of any hydrogen of glycine by a heteroatom.',\n",
       "   'direct_parent': 'Histidine and derivatives',\n",
       "   'kingdom': 'Organic compounds',\n",
       "   'class': 'Carboxylic acids and derivatives',\n",
       "   'sub_class': 'Amino acids, peptides, and analogues',\n",
       "   'molecular_framework': 'Aromatic heteromonocyclic compounds',\n",
       "   'alternative_parents': ['Amino acids',\n",
       "    'Aralkylamines',\n",
       "    'Azacyclic compounds',\n",
       "    'Carbonyl compounds',\n",
       "    'Carboxylic acids',\n",
       "    'Heteroaromatic compounds',\n",
       "    'Hydrocarbon derivatives',\n",
       "    'Imidazolyl carboxylic acids and derivatives',\n",
       "    'L-alpha-amino acids',\n",
       "    'Monoalkylamines',\n",
       "    'Monocarboxylic acids and derivatives',\n",
       "    'N-substituted imidazoles',\n",
       "    'Organic oxides',\n",
       "    "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 132354 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "list(hmdb.metabolites_raw(head = 3))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The returned nested dict corresponds to the schema. Another example with a schema that extracts only the accession and name fields:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:25:55.565203Z",
     "start_time": "2023-04-24T12:25:55.527353Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'accession': 'HMDB0000001', 'name': '1-Methylhistidine'},\n",
       " {'accession': 'HMDB0000002', 'name': '1,3-Diaminopropane'},\n",
       " {'accession': 'HMDB0000005', 'name': '2-Ketobutyric acid'},\n",
       " {'accession': 'HMDB0000008', 'name': '2-Hydroxybutyric acid'},\n",
       " {'accession': 'HMDB0000010', 'name': '2-Methoxyestrone'},\n",
       " {'accession': 'HMDB0000011', 'name': '3-Hydroxybutyric acid'},\n",
       " {'accession': 'HMDB0000012', 'name': 'Deoxyuridine'},\n",
       " {'accession': 'HMDB0000014', 'name': 'Deoxycytidine'},\n",
       " {'accession': 'HMDB0000015', 'name': 'Cortexolone'},\n",
       " {'accession': 'HMDB0000016', 'name': 'Deoxycorticosterone'},\n",
       " {'accession': 'HMDB0000017', 'name': '4-Pyridoxic acid'},\n",
       " {'accession': 'HMDB0000019', 'name': 'alpha-Ketoisovaleric acid'},\n",
       " {'accession': 'HMDB0000020', 'name': 'p-Hydroxyphenylacetic acid'},\n",
       " {'accession': 'HMDB0000021', 'name': 'Iodotyrosine'},\n",
       " {'accession': 'HMDB0000022', 'name': '3-Methoxytyramine'},\n",
       " {'accession': 'HMDB0000023', 'name': '(S)-3-Hydroxyisobutyric acid'},\n",
       " {'accession': 'HMDB00"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 1291 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "list(hmdb.metabolites_raw(\n",
    "    schema = {\n",
    "        'accession': hmdb.METABOLITES_SCHEMA['accession'],\n",
    "        'name': hmdb.METABOLITES_SCHEMA['name'],\n",
    "    },\n",
    "    head = 20,\n",
    "))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It works in a similar way for proteins:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:29:23.500335Z",
     "start_time": "2023-04-24T12:29:23.476261Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'name': \"5'-nucleotidase\", 'genesymbol': 'NT5E'},\n",
       " {'name': 'Deoxycytidylate deaminase', 'genesymbol': 'DCTD'},\n",
       " {'name': 'UMP-CMP kinase', 'genesymbol': 'CMPK1'},\n",
       " {'name': \"Cytosolic 5'-nucleotidase 1B\", 'genesymbol': 'NT5C1B'},\n",
       " {'name': \"Cytosolic 5'-nucleotidase 1A\", 'genesymbol': 'NT5C1A'},\n",
       " {'name': \"5'(3')-deoxyribonucleotidase, cytosolic type\",\n",
       "  'genesymbol': 'NT5C'},\n",
       " {'name': 'Deoxycytidine kinase', 'genesymbol': 'DCK'},\n",
       " {'name': \"5'(3')-deoxyribonucleotidase, mitochondrial\", 'genesymbol': 'NT5M'},\n",
       " {'name': 'Hydroxymethylglutaryl-CoA lyase, mitochondrial',\n",
       "  'genesymbol': 'HMGCL'},\n",
       " {'name': 'ATP-citrate synthase', 'genesymbol': 'ACLY'},\n",
       " {'name': 'Histone acetyltransferase p300', 'genesymbol': 'EP300'},\n",
       " {'name': 'Pyruvate dehydrogenase E1 component subunit beta, mitochondrial',\n",
       "  'genesymbol': 'PDHB'},\n",
       " {'name': 'Acetyl-CoA acetyltransferase, cytosolic', 'genesymbol': 'ACAT2'},\n",
       " {'name': 'CREB-binding protein', 'genesymbol': 'CREBBP'},\n",
       " {'name': 'Diamine acetyltransfe"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 1478 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "list(hmdb.proteins_raw(\n",
    "    schema = {\n",
    "        'name': hmdb.PROTEINS_SCHEMA['name'],\n",
    "        'genesymbol': hmdb.PROTEINS_SCHEMA['gene_name'],\n",
    "    },\n",
    "    head = 20,\n",
    "))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Higher level access to HMDB data\n",
    "\n",
    "Using the `hmdb.metabolites_table` and `hmdb.proteins_table` functions, you can process the records into a `pandas` data frame. These functions accept a list of positional or keyword arguments using a simple notation (see their documentation). Instead of the simple tuple notation, `hmdb.Field` objects can alternatively be used to define the fields; the arguments for `Field` and the tuples or strings passed directly to `hmdb.*_table` follow the same format. Let's extract a data frame with SMILES, InChIKeys and HMDB accessions:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:32:01.806350Z",
     "start_time": "2023-04-24T12:32:01.776383Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>accession</th>\n",
       "      <th>smiles</th>\n",
       "      <th>inchikey</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>HMDB0000001</td>\n",
       "      <td>CN1C=NC(C[C@H](N)C(O)=O)=C1</td>\n",
       "      <td>BRMWTNUJHUMWMS-LURJTMIESA-N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>HMDB0000002</td>\n",
       "      <td>NCCCN</td>\n",
       "      <td>XFNJVJPLKCPIBV-UHFFFAOYSA-N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>HMDB0000005</td>\n",
       "      <td>CCC(=O)C(O)=O</td>\n",
       "      <td>TYEYBOSBBBHJIV-UHFFFAOYSA-N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>HMDB0000008</td>\n",
       "      <td>CC[C@H](O)C(O)=O</td>\n",
       "      <td>AFENDNXGAFYKQO-VKHMYHEASA-N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HMDB0000010</td>\n",
       "      <td>[H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...</td>\n",
       "      <td>WHEUWNKSCXYKBU-QPWUGHHJSA-N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>HMDB0000011</td>\n",
       "      <td>C[C@@H](O)CC(O)=O</td>\n",
       "      <td>WHBMMWSBFZVSSR-GSVOUGTGSA-N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>HMDB0000012</td>\n",
       "      <td>OC[C@H]1O[C@H](C[C@@H]1O)N1C=CC(=O)NC1=O</td>\n",
       "      <td>MXHRCPNRJAMMIM-SHYZEUOFSA-N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>HMDB0000014</td>\n",
       "      <td>NC1=NC(=O)N(C=C1)[C@H]1C[C@H](O)[C@@H](CO)O1</td>\n",
       "      <td>CKTSBUTUHBMZGZ-SHYZEUOFSA-N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>HMDB0000015</td>\n",
       "      <td>[H][C@@]12CC[C@](O)(C(=O)CO)[C@@]1(C)CC[C@@]1(...</td>\n",
       "      <td>WHBHBVVOGNECLV-OBQKJFGGSA-N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>HMDB0000016</td>\n",
       "      <td>[H][C@@]12CC[C@H](C(=O)CO)[C@@]1(C)CC[C@@]1([H...</td>\n",
       "      <td>ZESRJSPZRDMNHY-YFWFAHHUSA-N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>HMDB0000017</td>\n",
       "      <td>CC1=NC=C(CO)C(C(O)=O)=C1O</td>\n",
       "      <td>HXACOUQIXZGNBF-UHFFFAOYSA-N</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      accession                                             smiles   \n",
       "0   HMDB0000001                        CN1C=NC(C[C@H](N)C(O)=O)=C1  \\\n",
       "1   HMDB0000002                                              NCCCN   \n",
       "2   HMDB0000005                                      CCC(=O)C(O)=O   \n",
       "3   HMDB0000008                                   CC[C@H](O)C(O)=O   \n",
       "4   HMDB0000010  [H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...   \n",
       "5   HMDB0000011                                  C[C@@H](O)CC(O)=O   \n",
       "6   HMDB0000012           OC[C@H]1O[C@H](C[C@@H]1O)N1C=CC(=O)NC1=O   \n",
       "7   HMDB0000014       NC1=NC(=O)N(C=C1)[C@H]1C[C@H](O)[C@@H](CO)O1   \n",
       "8   HMDB0000015  [H][C@@]12CC[C@](O)(C(=O)CO)[C@@]1(C)CC[C@@]1(...   \n",
       "9   HMDB0000016  [H][C@@]12CC[C@H](C(=O)CO)[C@@]1(C)CC[C@@]1([H...   \n",
       "10  HMDB0000017                          CC1=NC=C(CO)C(C(O)=O)=C1O   \n",
       "\n",
       "                       inchikey  \n",
       "0   BRMWTNUJHUMWMS-LURJTMIESA-N  \n",
       "1   XFNJVJPLKCPIBV-UHFFFAOYSA-N  \n",
       "2   TYEYBOSBBBHJIV-UHFFFAOYSA-N  \n",
       "3   AFENDNXGAFYKQO-VKHMYHEASA-N  \n",
       "4   WHEUWNKSCXYKBU-QPWUGHHJSA-N  \n",
       "5   WHBMMWSBFZVSSR-GSVOUGTGSA-N  \n",
       "6   MXHRCPNRJAMMIM-SHYZEUOFSA-N  \n",
       "7   CKTSBUTUHBMZGZ-SHYZEUOFSA-N  \n",
       "8   WHBHBVVOGNECLV-OBQKJFGGSA-N  \n",
       "9   ZESRJSPZRDMNHY-YFWFAHHUSA-N  \n",
       "10  HXACOUQIXZGNBF-UHFFFAOYSA-N  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hmdb.metabolites_table('accession', 'smiles', 'inchikey', head = 10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The above example is simple, as each field has a simple string value. The `synonyms` field is an array within each record; below, we first process it as an array column, i.e. each row contains an array:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:32:13.770820Z",
     "start_time": "2023-04-24T12:32:13.737362Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>accession</th>\n",
       "      <th>name</th>\n",
       "      <th>synonyms</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>HMDB0000001</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>[(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)pro...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>HMDB0000002</td>\n",
       "      <td>1,3-Diaminopropane</td>\n",
       "      <td>[1,3-Propanediamine, 1,3-Propylenediamine, Pro...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>HMDB0000005</td>\n",
       "      <td>2-Ketobutyric acid</td>\n",
       "      <td>[2-Ketobutanoic acid, 2-Oxobutyric acid, 3-Met...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>HMDB0000008</td>\n",
       "      <td>2-Hydroxybutyric acid</td>\n",
       "      <td>[(S)-2-Hydroxybutanoic acid, 2-Hydroxybutyrate...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HMDB0000010</td>\n",
       "      <td>2-Methoxyestrone</td>\n",
       "      <td>[2-(8S,9S,13S,14S)-3-Hydroxy-2-methoxy-13-meth...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>HMDB0000011</td>\n",
       "      <td>3-Hydroxybutyric acid</td>\n",
       "      <td>[(R)-(-)-beta-Hydroxybutyric acid, (R)-3-Hydro...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>HMDB0000012</td>\n",
       "      <td>Deoxyuridine</td>\n",
       "      <td>[2-Deoxyuridine, dU, 2'-Deoxyuridine, 1-(2-Deo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>HMDB0000014</td>\n",
       "      <td>Deoxycytidine</td>\n",
       "      <td>[4-Amino-1-[(2R,4S,5R)-4-hydroxy-5-(hydroxymet...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>HMDB0000015</td>\n",
       "      <td>Cortexolone</td>\n",
       "      <td>[11-Desoxy-17-hydroxycorticosterone, Cortodoxo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>HMDB0000016</td>\n",
       "      <td>Deoxycorticosterone</td>\n",
       "      <td>[21-Hydroxy-4-pregnene-3,20-dione, 21-Hydroxyp...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>HMDB0000017</td>\n",
       "      <td>4-Pyridoxic acid</td>\n",
       "      <td>[2-Methyl-3-hydroxy-4-carboxy-5-hydroxymethylp...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      accession                   name   \n",
       "0   HMDB0000001      1-Methylhistidine  \\\n",
       "1   HMDB0000002     1,3-Diaminopropane   \n",
       "2   HMDB0000005     2-Ketobutyric acid   \n",
       "3   HMDB0000008  2-Hydroxybutyric acid   \n",
       "4   HMDB0000010       2-Methoxyestrone   \n",
       "5   HMDB0000011  3-Hydroxybutyric acid   \n",
       "6   HMDB0000012           Deoxyuridine   \n",
       "7   HMDB0000014          Deoxycytidine   \n",
       "8   HMDB0000015            Cortexolone   \n",
       "9   HMDB0000016    Deoxycorticosterone   \n",
       "10  HMDB0000017       4-Pyridoxic acid   \n",
       "\n",
       "                                             synonyms  \n",
       "0   [(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)pro...  \n",
       "1   [1,3-Propanediamine, 1,3-Propylenediamine, Pro...  \n",
       "2   [2-Ketobutanoic acid, 2-Oxobutyric acid, 3-Met...  \n",
       "3   [(S)-2-Hydroxybutanoic acid, 2-Hydroxybutyrate...  \n",
       "4   [2-(8S,9S,13S,14S)-3-Hydroxy-2-methoxy-13-meth...  \n",
       "5   [(R)-(-)-beta-Hydroxybutyric acid, (R)-3-Hydro...  \n",
       "6   [2-Deoxyuridine, dU, 2'-Deoxyuridine, 1-(2-Deo...  \n",
       "7   [4-Amino-1-[(2R,4S,5R)-4-hydroxy-5-(hydroxymet...  \n",
       "8   [11-Desoxy-17-hydroxycorticosterone, Cortodoxo...  \n",
       "9   [21-Hydroxy-4-pregnene-3,20-dione, 21-Hydroxyp...  \n",
       "10  [2-Methyl-3-hydroxy-4-carboxy-5-hydroxymethylp...  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hmdb.metabolites_table('accession', 'name', 'synonyms', head = 10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each element in the column is an array:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:32:19.643579Z",
     "start_time": "2023-04-24T12:32:19.611405Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)propanoic acid',\n",
       " 'Pi-methylhistidine',\n",
       " '(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)propanoate',\n",
       " '1 Methylhistidine',\n",
       " '1-Methyl histidine',\n",
       " '1-Methyl-histidine',\n",
       " '1-Methyl-L-histidine',\n",
       " '1-MHis',\n",
       " '1-N-Methyl-L-histidine',\n",
       " 'L-1-Methylhistidine',\n",
       " 'N1-Methyl-L-histidine',\n",
       " '1-Methylhistidine dihydrochloride',\n",
       " '1-Methylhistidine']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hmdb_synonyms = hmdb.metabolites_table('accession', 'name', 'synonyms', head = 10)\n",
    "hmdb_synonyms.synonyms[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using the `@` notation, the arrays can be expanded into multiple rows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:32:25.415442Z",
     "start_time": "2023-04-24T12:32:25.368277Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>accession</th>\n",
       "      <th>name</th>\n",
       "      <th>synonyms</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>HMDB0000001</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)prop...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>HMDB0000001</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>Pi-methylhistidine</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>HMDB0000001</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)prop...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>HMDB0000001</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>1 Methylhistidine</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HMDB0000001</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>1-Methyl histidine</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>291</th>\n",
       "      <td>HMDB0000017</td>\n",
       "      <td>4-Pyridoxic acid</td>\n",
       "      <td>3-Hydroxy-5-hydroxymethyl-2-methyl-isonicotins...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>292</th>\n",
       "      <td>HMDB0000017</td>\n",
       "      <td>4-Pyridoxic acid</td>\n",
       "      <td>4 Pyridoxinic acid</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>293</th>\n",
       "      <td>HMDB0000017</td>\n",
       "      <td>4-Pyridoxic acid</td>\n",
       "      <td>Pyridoxinecarboxylic acid</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>294</th>\n",
       "      <td>HMDB0000017</td>\n",
       "      <td>4-Pyridoxic acid</td>\n",
       "      <td>4 Pyridoxylic acid</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295</th>\n",
       "      <td>HMDB0000017</td>\n",
       "      <td>4-Pyridoxic acid</td>\n",
       "      <td>4 Pyridoxic acid</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>296 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       accession               name   \n",
       "0    HMDB0000001  1-Methylhistidine  \\\n",
       "1    HMDB0000001  1-Methylhistidine   \n",
       "2    HMDB0000001  1-Methylhistidine   \n",
       "3    HMDB0000001  1-Methylhistidine   \n",
       "4    HMDB0000001  1-Methylhistidine   \n",
       "..           ...                ...   \n",
       "291  HMDB0000017   4-Pyridoxic acid   \n",
       "292  HMDB0000017   4-Pyridoxic acid   \n",
       "293  HMDB0000017   4-Pyridoxic acid   \n",
       "294  HMDB0000017   4-Pyridoxic acid   \n",
       "295  HMDB0000017   4-Pyridoxic acid   \n",
       "\n",
       "                                              synonyms  \n",
       "0    (2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)prop...  \n",
       "1                                   Pi-methylhistidine  \n",
       "2    (2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)prop...  \n",
       "3                                    1 Methylhistidine  \n",
       "4                                   1-Methyl histidine  \n",
       "..                                                 ...  \n",
       "291  3-Hydroxy-5-hydroxymethyl-2-methyl-isonicotins...  \n",
       "292                                 4 Pyridoxinic acid  \n",
       "293                          Pyridoxinecarboxylic acid  \n",
       "294                                 4 Pyridoxylic acid  \n",
       "295                                   4 Pyridoxic acid  \n",
       "\n",
       "[296 rows x 3 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hmdb.metabolites_table('accession', 'name', ('synonyms', '@'), head = 10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This already resulted in almost 300 rows: be careful using `@` for multiple columns, as it yields rows in a combinatorial way, and the resulting data frames can easily grow huge. Another notation is `*`, which means extracting all elements from a dict into multiple columns. Below we apply it to the `taxonomy` column, which is a dict of multiple fields:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:32:30.272309Z",
     "start_time": "2023-04-24T12:32:30.230911Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>accession</th>\n",
       "      <th>name</th>\n",
       "      <th>taxonomy__alternative_parents</th>\n",
       "      <th>taxonomy__class</th>\n",
       "      <th>taxonomy__description</th>\n",
       "      <th>taxonomy__direct_parent</th>\n",
       "      <th>taxonomy__kingdom</th>\n",
       "      <th>taxonomy__molecular_framework</th>\n",
       "      <th>taxonomy__sub_class</th>\n",
       "      <th>taxonomy__substituents</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>HMDB0000001</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>[Amino acids, Aralkylamines, Azacyclic compoun...</td>\n",
       "      <td>Carboxylic acids and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Histidine and derivatives</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic heteromonocyclic compounds</td>\n",
       "      <td>Amino acids, peptides, and analogues</td>\n",
       "      <td>[Alpha-amino acid, Amine, Amino acid, Aralkyla...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>HMDB0000002</td>\n",
       "      <td>1,3-Diaminopropane</td>\n",
       "      <td>[Hydrocarbon derivatives, Organopnictogen comp...</td>\n",
       "      <td>Organonitrogen compounds</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Monoalkylamines</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aliphatic acyclic compounds</td>\n",
       "      <td>Amines</td>\n",
       "      <td>[Aliphatic acyclic compound, Hydrocarbon deriv...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>HMDB0000005</td>\n",
       "      <td>2-Ketobutyric acid</td>\n",
       "      <td>[Alpha-hydroxy ketones, Alpha-keto acids and d...</td>\n",
       "      <td>Keto acids and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Short-chain keto acids and derivatives</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aliphatic acyclic compounds</td>\n",
       "      <td>Short-chain keto acids and derivatives</td>\n",
       "      <td>[Aliphatic acyclic compound, Alpha-hydroxy ket...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>HMDB0000008</td>\n",
       "      <td>2-Hydroxybutyric acid</td>\n",
       "      <td>[Carbonyl compounds, Carboxylic acids, Fatty a...</td>\n",
       "      <td>Hydroxy acids and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Alpha hydroxy acids and derivatives</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aliphatic acyclic compounds</td>\n",
       "      <td>Alpha hydroxy acids and derivatives</td>\n",
       "      <td>[Alcohol, Aliphatic acyclic compound, Alpha-hy...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HMDB0000010</td>\n",
       "      <td>2-Methoxyestrone</td>\n",
       "      <td>[1-hydroxy-2-unsubstituted benzenoids, 17-oxos...</td>\n",
       "      <td>Steroids and steroid derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Estrogens and derivatives</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic homopolycyclic compounds</td>\n",
       "      <td>Estrane steroids</td>\n",
       "      <td>[1-hydroxy-2-unsubstituted benzenoid, 17-oxost...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>HMDB0000011</td>\n",
       "      <td>3-Hydroxybutyric acid</td>\n",
       "      <td>[Carbonyl compounds, Carboxylic acids, Fatty a...</td>\n",
       "      <td>Hydroxy acids and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Beta hydroxy acids and derivatives</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aliphatic acyclic compounds</td>\n",
       "      <td>Beta hydroxy acids and derivatives</td>\n",
       "      <td>[Alcohol, Aliphatic acyclic compound, Beta-hyd...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>HMDB0000012</td>\n",
       "      <td>Deoxyuridine</td>\n",
       "      <td>[Azacyclic compounds, Heteroaromatic compounds...</td>\n",
       "      <td>Pyrimidine nucleosides</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Pyrimidine 2'-deoxyribonucleosides</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic heteromonocyclic compounds</td>\n",
       "      <td>Pyrimidine 2'-deoxyribonucleosides</td>\n",
       "      <td>[Alcohol, Aromatic heteromonocyclic compound, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>HMDB0000014</td>\n",
       "      <td>Deoxycytidine</td>\n",
       "      <td>[Aminopyrimidines and derivatives, Azacyclic c...</td>\n",
       "      <td>Pyrimidine nucleosides</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Pyrimidine 2'-deoxyribonucleosides</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic heteromonocyclic compounds</td>\n",
       "      <td>Pyrimidine 2'-deoxyribonucleosides</td>\n",
       "      <td>[Alcohol, Amine, Aminopyrimidine, Aromatic het...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>HMDB0000015</td>\n",
       "      <td>Cortexolone</td>\n",
       "      <td>[17-hydroxysteroids, 20-oxosteroids, 3-oxo del...</td>\n",
       "      <td>Steroids and steroid derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>21-hydroxysteroids</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aliphatic homopolycyclic compounds</td>\n",
       "      <td>Hydroxysteroids</td>\n",
       "      <td>[17-hydroxysteroid, 20-oxosteroid, 21-hydroxys...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>HMDB0000016</td>\n",
       "      <td>Deoxycorticosterone</td>\n",
       "      <td>[20-oxosteroids, 3-oxo delta-4-steroids, Alpha...</td>\n",
       "      <td>Steroids and steroid derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>21-hydroxysteroids</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aliphatic homopolycyclic compounds</td>\n",
       "      <td>Hydroxysteroids</td>\n",
       "      <td>[20-oxosteroid, 21-hydroxysteroid, 3-oxo-delta...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>HMDB0000017</td>\n",
       "      <td>4-Pyridoxic acid</td>\n",
       "      <td>[Aromatic alcohols, Azacyclic compounds, Carbo...</td>\n",
       "      <td>Pyridines and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Pyridinecarboxylic acids</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic heteromonocyclic compounds</td>\n",
       "      <td>Pyridinecarboxylic acids and derivatives</td>\n",
       "      <td>[Alcohol, Aromatic alcohol, Aromatic heteromon...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      accession                   name   \n",
       "0   HMDB0000001      1-Methylhistidine  \\\n",
       "1   HMDB0000002     1,3-Diaminopropane   \n",
       "2   HMDB0000005     2-Ketobutyric acid   \n",
       "3   HMDB0000008  2-Hydroxybutyric acid   \n",
       "4   HMDB0000010       2-Methoxyestrone   \n",
       "5   HMDB0000011  3-Hydroxybutyric acid   \n",
       "6   HMDB0000012           Deoxyuridine   \n",
       "7   HMDB0000014          Deoxycytidine   \n",
       "8   HMDB0000015            Cortexolone   \n",
       "9   HMDB0000016    Deoxycorticosterone   \n",
       "10  HMDB0000017       4-Pyridoxic acid   \n",
       "\n",
       "                        taxonomy__alternative_parents   \n",
       "0   [Amino acids, Aralkylamines, Azacyclic compoun...  \\\n",
       "1   [Hydrocarbon derivatives, Organopnictogen comp...   \n",
       "2   [Alpha-hydroxy ketones, Alpha-keto acids and d...   \n",
       "3   [Carbonyl compounds, Carboxylic acids, Fatty a...   \n",
       "4   [1-hydroxy-2-unsubstituted benzenoids, 17-oxos...   \n",
       "5   [Carbonyl compounds, Carboxylic acids, Fatty a...   \n",
       "6   [Azacyclic compounds, Heteroaromatic compounds...   \n",
       "7   [Aminopyrimidines and derivatives, Azacyclic c...   \n",
       "8   [17-hydroxysteroids, 20-oxosteroids, 3-oxo del...   \n",
       "9   [20-oxosteroids, 3-oxo delta-4-steroids, Alpha...   \n",
       "10  [Aromatic alcohols, Azacyclic compounds, Carbo...   \n",
       "\n",
       "                     taxonomy__class   \n",
       "0   Carboxylic acids and derivatives  \\\n",
       "1           Organonitrogen compounds   \n",
       "2         Keto acids and derivatives   \n",
       "3      Hydroxy acids and derivatives   \n",
       "4   Steroids and steroid derivatives   \n",
       "5      Hydroxy acids and derivatives   \n",
       "6             Pyrimidine nucleosides   \n",
       "7             Pyrimidine nucleosides   \n",
       "8   Steroids and steroid derivatives   \n",
       "9   Steroids and steroid derivatives   \n",
       "10         Pyridines and derivatives   \n",
       "\n",
       "                                taxonomy__description   \n",
       "0    belongs to the class of organic compounds kno...  \\\n",
       "1    belongs to the class of organic compounds kno...   \n",
       "2    belongs to the class of organic compounds kno...   \n",
       "3    belongs to the class of organic compounds kno...   \n",
       "4    belongs to the class of organic compounds kno...   \n",
       "5    belongs to the class of organic compounds kno...   \n",
       "6    belongs to the class of organic compounds kno...   \n",
       "7    belongs to the class of organic compounds kno...   \n",
       "8    belongs to the class of organic compounds kno...   \n",
       "9    belongs to the class of organic compounds kno...   \n",
       "10   belongs to the class of organic compounds kno...   \n",
       "\n",
       "                   taxonomy__direct_parent  taxonomy__kingdom   \n",
       "0                Histidine and derivatives  Organic compounds  \\\n",
       "1                          Monoalkylamines  Organic compounds   \n",
       "2   Short-chain keto acids and derivatives  Organic compounds   \n",
       "3      Alpha hydroxy acids and derivatives  Organic compounds   \n",
       "4                Estrogens and derivatives  Organic compounds   \n",
       "5       Beta hydroxy acids and derivatives  Organic compounds   \n",
       "6       Pyrimidine 2'-deoxyribonucleosides  Organic compounds   \n",
       "7       Pyrimidine 2'-deoxyribonucleosides  Organic compounds   \n",
       "8                       21-hydroxysteroids  Organic compounds   \n",
       "9                       21-hydroxysteroids  Organic compounds   \n",
       "10                Pyridinecarboxylic acids  Organic compounds   \n",
       "\n",
       "          taxonomy__molecular_framework   \n",
       "0   Aromatic heteromonocyclic compounds  \\\n",
       "1           Aliphatic acyclic compounds   \n",
       "2           Aliphatic acyclic compounds   \n",
       "3           Aliphatic acyclic compounds   \n",
       "4     Aromatic homopolycyclic compounds   \n",
       "5           Aliphatic acyclic compounds   \n",
       "6   Aromatic heteromonocyclic compounds   \n",
       "7   Aromatic heteromonocyclic compounds   \n",
       "8    Aliphatic homopolycyclic compounds   \n",
       "9    Aliphatic homopolycyclic compounds   \n",
       "10  Aromatic heteromonocyclic compounds   \n",
       "\n",
       "                         taxonomy__sub_class   \n",
       "0       Amino acids, peptides, and analogues  \\\n",
       "1                                     Amines   \n",
       "2     Short-chain keto acids and derivatives   \n",
       "3        Alpha hydroxy acids and derivatives   \n",
       "4                           Estrane steroids   \n",
       "5         Beta hydroxy acids and derivatives   \n",
       "6         Pyrimidine 2'-deoxyribonucleosides   \n",
       "7         Pyrimidine 2'-deoxyribonucleosides   \n",
       "8                            Hydroxysteroids   \n",
       "9                            Hydroxysteroids   \n",
       "10  Pyridinecarboxylic acids and derivatives   \n",
       "\n",
       "                               taxonomy__substituents  \n",
       "0   [Alpha-amino acid, Amine, Amino acid, Aralkyla...  \n",
       "1   [Aliphatic acyclic compound, Hydrocarbon deriv...  \n",
       "2   [Aliphatic acyclic compound, Alpha-hydroxy ket...  \n",
       "3   [Alcohol, Aliphatic acyclic compound, Alpha-hy...  \n",
       "4   [1-hydroxy-2-unsubstituted benzenoid, 17-oxost...  \n",
       "5   [Alcohol, Aliphatic acyclic compound, Beta-hyd...  \n",
       "6   [Alcohol, Aromatic heteromonocyclic compound, ...  \n",
       "7   [Alcohol, Amine, Aminopyrimidine, Aromatic het...  \n",
       "8   [17-hydroxysteroid, 20-oxosteroid, 21-hydroxys...  \n",
       "9   [20-oxosteroid, 21-hydroxysteroid, 3-oxo-delta...  \n",
       "10  [Alcohol, Aromatic alcohol, Aromatic heteromon...  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hmdb.metabolites_table('accession', 'name', ('taxonomy', '*'), head = 10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We see that `taxonomy` was expanded into 8 columns. If we expand all those columns, we get a data frame of more than 2,000 rows from the first 10 records alone:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:32:37.220921Z",
     "start_time": "2023-04-24T12:32:37.176017Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>accession</th>\n",
       "      <th>name</th>\n",
       "      <th>taxonomy__alternative_parents</th>\n",
       "      <th>taxonomy__class</th>\n",
       "      <th>taxonomy__description</th>\n",
       "      <th>taxonomy__direct_parent</th>\n",
       "      <th>taxonomy__kingdom</th>\n",
       "      <th>taxonomy__molecular_framework</th>\n",
       "      <th>taxonomy__sub_class</th>\n",
       "      <th>taxonomy__substituents</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>HMDB0000001</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>Amino acids</td>\n",
       "      <td>Carboxylic acids and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Histidine and derivatives</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic heteromonocyclic compounds</td>\n",
       "      <td>Amino acids, peptides, and analogues</td>\n",
       "      <td>Alpha-amino acid</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>HMDB0000001</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>Amino acids</td>\n",
       "      <td>Carboxylic acids and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Histidine and derivatives</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic heteromonocyclic compounds</td>\n",
       "      <td>Amino acids, peptides, and analogues</td>\n",
       "      <td>Amine</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>HMDB0000001</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>Amino acids</td>\n",
       "      <td>Carboxylic acids and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Histidine and derivatives</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic heteromonocyclic compounds</td>\n",
       "      <td>Amino acids, peptides, and analogues</td>\n",
       "      <td>Amino acid</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>HMDB0000001</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>Amino acids</td>\n",
       "      <td>Carboxylic acids and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Histidine and derivatives</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic heteromonocyclic compounds</td>\n",
       "      <td>Amino acids, peptides, and analogues</td>\n",
       "      <td>Aralkylamine</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HMDB0000001</td>\n",
       "      <td>1-Methylhistidine</td>\n",
       "      <td>Amino acids</td>\n",
       "      <td>Carboxylic acids and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Histidine and derivatives</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic heteromonocyclic compounds</td>\n",
       "      <td>Amino acids, peptides, and analogues</td>\n",
       "      <td>Aromatic heteromonocyclic compound</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2235</th>\n",
       "      <td>HMDB0000017</td>\n",
       "      <td>4-Pyridoxic acid</td>\n",
       "      <td>Vinylogous acids</td>\n",
       "      <td>Pyridines and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Pyridinecarboxylic acids</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic heteromonocyclic compounds</td>\n",
       "      <td>Pyridinecarboxylic acids and derivatives</td>\n",
       "      <td>Organooxygen compound</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2236</th>\n",
       "      <td>HMDB0000017</td>\n",
       "      <td>4-Pyridoxic acid</td>\n",
       "      <td>Vinylogous acids</td>\n",
       "      <td>Pyridines and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Pyridinecarboxylic acids</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic heteromonocyclic compounds</td>\n",
       "      <td>Pyridinecarboxylic acids and derivatives</td>\n",
       "      <td>Organopnictogen compound</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2237</th>\n",
       "      <td>HMDB0000017</td>\n",
       "      <td>4-Pyridoxic acid</td>\n",
       "      <td>Vinylogous acids</td>\n",
       "      <td>Pyridines and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Pyridinecarboxylic acids</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic heteromonocyclic compounds</td>\n",
       "      <td>Pyridinecarboxylic acids and derivatives</td>\n",
       "      <td>Primary alcohol</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2238</th>\n",
       "      <td>HMDB0000017</td>\n",
       "      <td>4-Pyridoxic acid</td>\n",
       "      <td>Vinylogous acids</td>\n",
       "      <td>Pyridines and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Pyridinecarboxylic acids</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic heteromonocyclic compounds</td>\n",
       "      <td>Pyridinecarboxylic acids and derivatives</td>\n",
       "      <td>Pyridine carboxylic acid</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2239</th>\n",
       "      <td>HMDB0000017</td>\n",
       "      <td>4-Pyridoxic acid</td>\n",
       "      <td>Vinylogous acids</td>\n",
       "      <td>Pyridines and derivatives</td>\n",
       "      <td>belongs to the class of organic compounds kno...</td>\n",
       "      <td>Pyridinecarboxylic acids</td>\n",
       "      <td>Organic compounds</td>\n",
       "      <td>Aromatic heteromonocyclic compounds</td>\n",
       "      <td>Pyridinecarboxylic acids and derivatives</td>\n",
       "      <td>Vinylogous acid</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2240 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        accession               name taxonomy__alternative_parents   \n",
       "0     HMDB0000001  1-Methylhistidine                   Amino acids  \\\n",
       "1     HMDB0000001  1-Methylhistidine                   Amino acids   \n",
       "2     HMDB0000001  1-Methylhistidine                   Amino acids   \n",
       "3     HMDB0000001  1-Methylhistidine                   Amino acids   \n",
       "4     HMDB0000001  1-Methylhistidine                   Amino acids   \n",
       "...           ...                ...                           ...   \n",
       "2235  HMDB0000017   4-Pyridoxic acid              Vinylogous acids   \n",
       "2236  HMDB0000017   4-Pyridoxic acid              Vinylogous acids   \n",
       "2237  HMDB0000017   4-Pyridoxic acid              Vinylogous acids   \n",
       "2238  HMDB0000017   4-Pyridoxic acid              Vinylogous acids   \n",
       "2239  HMDB0000017   4-Pyridoxic acid              Vinylogous acids   \n",
       "\n",
       "                       taxonomy__class   \n",
       "0     Carboxylic acids and derivatives  \\\n",
       "1     Carboxylic acids and derivatives   \n",
       "2     Carboxylic acids and derivatives   \n",
       "3     Carboxylic acids and derivatives   \n",
       "4     Carboxylic acids and derivatives   \n",
       "...                                ...   \n",
       "2235         Pyridines and derivatives   \n",
       "2236         Pyridines and derivatives   \n",
       "2237         Pyridines and derivatives   \n",
       "2238         Pyridines and derivatives   \n",
       "2239         Pyridines and derivatives   \n",
       "\n",
       "                                  taxonomy__description   \n",
       "0      belongs to the class of organic compounds kno...  \\\n",
       "1      belongs to the class of organic compounds kno...   \n",
       "2      belongs to the class of organic compounds kno...   \n",
       "3      belongs to the class of organic compounds kno...   \n",
       "4      belongs to the class of organic compounds kno...   \n",
       "...                                                 ...   \n",
       "2235   belongs to the class of organic compounds kno...   \n",
       "2236   belongs to the class of organic compounds kno...   \n",
       "2237   belongs to the class of organic compounds kno...   \n",
       "2238   belongs to the class of organic compounds kno...   \n",
       "2239   belongs to the class of organic compounds kno...   \n",
       "\n",
       "        taxonomy__direct_parent  taxonomy__kingdom   \n",
       "0     Histidine and derivatives  Organic compounds  \\\n",
       "1     Histidine and derivatives  Organic compounds   \n",
       "2     Histidine and derivatives  Organic compounds   \n",
       "3     Histidine and derivatives  Organic compounds   \n",
       "4     Histidine and derivatives  Organic compounds   \n",
       "...                         ...                ...   \n",
       "2235   Pyridinecarboxylic acids  Organic compounds   \n",
       "2236   Pyridinecarboxylic acids  Organic compounds   \n",
       "2237   Pyridinecarboxylic acids  Organic compounds   \n",
       "2238   Pyridinecarboxylic acids  Organic compounds   \n",
       "2239   Pyridinecarboxylic acids  Organic compounds   \n",
       "\n",
       "            taxonomy__molecular_framework   \n",
       "0     Aromatic heteromonocyclic compounds  \\\n",
       "1     Aromatic heteromonocyclic compounds   \n",
       "2     Aromatic heteromonocyclic compounds   \n",
       "3     Aromatic heteromonocyclic compounds   \n",
       "4     Aromatic heteromonocyclic compounds   \n",
       "...                                   ...   \n",
       "2235  Aromatic heteromonocyclic compounds   \n",
       "2236  Aromatic heteromonocyclic compounds   \n",
       "2237  Aromatic heteromonocyclic compounds   \n",
       "2238  Aromatic heteromonocyclic compounds   \n",
       "2239  Aromatic heteromonocyclic compounds   \n",
       "\n",
       "                           taxonomy__sub_class   \n",
       "0         Amino acids, peptides, and analogues  \\\n",
       "1         Amino acids, peptides, and analogues   \n",
       "2         Amino acids, peptides, and analogues   \n",
       "3         Amino acids, peptides, and analogues   \n",
       "4         Amino acids, peptides, and analogues   \n",
       "...                                        ...   \n",
       "2235  Pyridinecarboxylic acids and derivatives   \n",
       "2236  Pyridinecarboxylic acids and derivatives   \n",
       "2237  Pyridinecarboxylic acids and derivatives   \n",
       "2238  Pyridinecarboxylic acids and derivatives   \n",
       "2239  Pyridinecarboxylic acids and derivatives   \n",
       "\n",
       "                  taxonomy__substituents  \n",
       "0                       Alpha-amino acid  \n",
       "1                                  Amine  \n",
       "2                             Amino acid  \n",
       "3                           Aralkylamine  \n",
       "4     Aromatic heteromonocyclic compound  \n",
       "...                                  ...  \n",
       "2235               Organooxygen compound  \n",
       "2236            Organopnictogen compound  \n",
       "2237                     Primary alcohol  \n",
       "2238            Pyridine carboxylic acid  \n",
       "2239                     Vinylogous acid  \n",
       "\n",
       "[2240 rows x 10 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hmdb.metabolites_table('accession', 'name', ('taxonomy', '*', '@'), head = 10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `hmdb.metabolites_mapping` and `hmdb.proteins_mapping` functions provide data frames or dicts for translation between a pair of identifier types. For example, to translate KEGG Compound IDs to SMILES (the default output is a dict of sets):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:33:27.432669Z",
     "start_time": "2023-04-24T12:33:27.392854Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'C00109': {'CCC(=O)C(O)=O'},\n",
       " 'C00526': {'OC[C@H]1O[C@H](C[C@@H]1O)N1C=CC(=O)NC1=O'},\n",
       " 'C00847': {'CC1=NC=C(CO)C(C(O)=O)=C1O'},\n",
       " 'C00881': {'NC1=NC(=O)N(C=C1)[C@H]1C[C@H](O)[C@@H](CO)O1'},\n",
       " 'C00986': {'NCCCN'},\n",
       " 'C01089': {'C[C@@H](O)CC(O)=O'},\n",
       " 'C01152': {'CN1C=NC(C[C@H](N)C(O)=O)=C1'},\n",
       " 'C03205': {'[H][C@@]12CC[C@H](C(=O)CO)[C@@]1(C)CC[C@@]1([H])[C@@]2([H])CCC2=CC(=O)CC[C@]12C'},\n",
       " 'C05299': {'[H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[C@@]21[H])C=C(O)C(OC)=C3'},\n",
       " 'C05488': {'[H][C@@]12CC[C@](O)(C(=O)CO)[C@@]1(C)CC[C@@]1([H])[C@@]2([H])CCC2=CC(=O)CC[C@]12C'},\n",
       " 'C05984': {'CC[C@H](O)C(O)=O'}}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hmdb.metabolites_mapping('kegg', 'smiles', head = 10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The same data in a data frame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:33:31.913985Z",
     "start_time": "2023-04-24T12:33:31.872434Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id_a</th>\n",
       "      <th>id_b</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>C01152</td>\n",
       "      <td>CN1C=NC(C[C@H](N)C(O)=O)=C1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>C00986</td>\n",
       "      <td>NCCCN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>C00109</td>\n",
       "      <td>CCC(=O)C(O)=O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>C05984</td>\n",
       "      <td>CC[C@H](O)C(O)=O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>C05299</td>\n",
       "      <td>[H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>C01089</td>\n",
       "      <td>C[C@@H](O)CC(O)=O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>C00526</td>\n",
       "      <td>OC[C@H]1O[C@H](C[C@@H]1O)N1C=CC(=O)NC1=O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>C00881</td>\n",
       "      <td>NC1=NC(=O)N(C=C1)[C@H]1C[C@H](O)[C@@H](CO)O1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>C05488</td>\n",
       "      <td>[H][C@@]12CC[C@](O)(C(=O)CO)[C@@]1(C)CC[C@@]1(...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>C03205</td>\n",
       "      <td>[H][C@@]12CC[C@H](C(=O)CO)[C@@]1(C)CC[C@@]1([H...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>C00847</td>\n",
       "      <td>CC1=NC=C(CO)C(C(O)=O)=C1O</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      id_a                                               id_b\n",
       "0   C01152                        CN1C=NC(C[C@H](N)C(O)=O)=C1\n",
       "1   C00986                                              NCCCN\n",
       "2   C00109                                      CCC(=O)C(O)=O\n",
       "3   C05984                                   CC[C@H](O)C(O)=O\n",
       "4   C05299  [H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...\n",
       "5   C01089                                  C[C@@H](O)CC(O)=O\n",
       "6   C00526           OC[C@H]1O[C@H](C[C@@H]1O)N1C=CC(=O)NC1=O\n",
       "7   C00881       NC1=NC(=O)N(C=C1)[C@H]1C[C@H](O)[C@@H](CO)O1\n",
       "8   C05488  [H][C@@]12CC[C@](O)(C(=O)CO)[C@@]1(C)CC[C@@]1(...\n",
       "9   C03205  [H][C@@]12CC[C@H](C(=O)CO)[C@@]1(C)CC[C@@]1([H...\n",
       "10  C00847                          CC1=NC=C(CO)C(C(O)=O)=C1O"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hmdb.metabolites_mapping('kegg', 'smiles', head = 10, return_df = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### ID translation with HMDB\n",
    "\n",
    "HMDB is also integrated into the ID translation service. Thanks to the multiple levels of caching, only the first call takes a long time; subsequent calls are pretty fast:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:33:39.082322Z",
     "start_time": "2023-04-24T12:33:38.913124Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11)12/h3-4,6H,2,8H2,1H3,(H,11,12)/t6-/m0/s1',\n",
       " 'InChI=1S/C7H11N3O2/c1-10-4-9-3-5(10)2-6(8)7(11)12/h3-4,6H,2,8H2,1H3,(H,11,12)/t6-/m0/s1'}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.utils import mapping\n",
    "mapping.map_name('C01152', 'kegg', 'inchi')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The two InChI strings correspond to the two constitutional isomers included in the KEGG ID: 1- and 3-Methylhistidine. A useful feature of HMDB is that it has many synonyms and IUPAC names, making it possible to parse a large variety of metabolite names:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:33:41.763796Z",
     "start_time": "2023-04-24T12:33:41.727916Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)propanoate',\n",
       " '(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)propanoic acid',\n",
       " '(2S)-2-Amino-3-(1-methyl-1H-imidazol-5-yl)propanoate',\n",
       " '(2S)-2-Amino-3-(1-methyl-1H-imidazol-5-yl)propanoic acid',\n",
       " '1 Methylhistidine',\n",
       " '1-MHis',\n",
       " '1-Methyl histidine',\n",
       " '1-Methyl-L-histidine',\n",
       " '1-Methyl-histidine',\n",
       " '1-Methylhistidine',\n",
       " '1-Methylhistidine dihydrochloride',\n",
       " '1-N-Methyl-L-histidine',\n",
       " '3-Methyl-L-histidine',\n",
       " '3-Methylhistidine',\n",
       " '3-Methylhistidine dihydrochloride',\n",
       " '3-Methylhistidine hydride',\n",
       " '3-N-Methyl-L-histidine',\n",
       " 'L-1-Methylhistidine',\n",
       " 'L-3-Methylhistidine',\n",
       " 'N Tau-methylhistidine',\n",
       " 'N(Tau)-methylhistidine',\n",
       " 'N(pros)-Methyl-L-histidine',\n",
       " 'N-pros-Methyl-L-histidine',\n",
       " 'N1-Methyl-L-histidine',\n",
       " 'N3-Methyl-L-histidine',\n",
       " 'Pi-methylhistidine',\n",
       " 'Tau-methyl-L-histidine',\n",
       " 'Tau-methylhistidine'}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapping.map_name('C01152', 'kegg', 'hmdb_synonym')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:33:46.320639Z",
     "start_time": "2023-04-24T12:33:44.509286Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'InChI=1S/C7H11N3O2/c1-10-4-9-3-5(10)2-6(8)7(11)12/h3-4,6H,2,8H2,1H3,(H,11,12)/t6-/m0/s1'}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapping.map_name('N(pros)-Methyl-L-histidine', 'hmdb_synonym', 'inchi')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `name` provided by HMDB is typically the best human readable name, hence it can be used as labels in figures or tables:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-24T12:33:47.028648Z",
     "start_time": "2023-04-24T12:33:46.531782Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'1-Methylhistidine'}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapping.map_name('HMDB0000001', 'hmdb', 'hmdb_name')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SwissLipids\n",
    "\n",
    "The `pypath.inputs.swisslipids` module provides access to the [datasets available from SwissLipids for download](https://swisslipids.org/#/downloads). Each function returns a `csv.DictReader`, which is a generator that yields rows as dicts:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T19:38:03.138600Z",
     "start_time": "2024-10-06T19:38:03.136178Z"
    }
   },
   "outputs": [],
   "source": [
    "from pypath.inputs import swisslipids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T19:37:12.595396Z",
     "start_time": "2024-10-06T19:37:12.374918Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<csv.DictReader at 0x6a4f241d0230>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tissues = swisslipids.swisslipids_tissues()\n",
    "tissues"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T19:37:38.091410Z",
     "start_time": "2024-10-06T19:37:38.087363Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Lipid ID': 'SLM:000056561',\n",
       " 'Lipid name': 'Phosphatidylcholine (40:6)',\n",
       " 'Tissue/Cell ID': 'UBERON:0001969',\n",
       " 'Tissue/Cell name': 'blood plasma',\n",
       " 'Taxon ID': '9606',\n",
       " 'Taxon scientific name': 'Homo sapiens',\n",
       " 'Evidence tag ID': '6814'}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "next(tissues)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Alternatively, the datasets can be retrieved as data frames by the `return_df` argument. The \"lipids\" and \"lipids2uniprot\" datasets use a large amount of memory if loaded this way."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T19:40:23.152555Z",
     "start_time": "2024-10-06T19:40:23.036048Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Lipid ID</th>\n",
       "      <th>Lipid name</th>\n",
       "      <th>Tissue/Cell ID</th>\n",
       "      <th>Tissue/Cell name</th>\n",
       "      <th>Taxon ID</th>\n",
       "      <th>Taxon scientific name</th>\n",
       "      <th>Evidence tag ID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>SLM:000056561</td>\n",
       "      <td>Phosphatidylcholine (40:6)</td>\n",
       "      <td>UBERON:0001969</td>\n",
       "      <td>blood plasma</td>\n",
       "      <td>9606</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>6814</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>SLM:000056510</td>\n",
       "      <td>Phosphatidylcholine (34:3)</td>\n",
       "      <td>UBERON:0001969</td>\n",
       "      <td>blood plasma</td>\n",
       "      <td>9606</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>6806</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>SLM:000056525</td>\n",
       "      <td>Phosphatidylcholine (36:4)</td>\n",
       "      <td>UBERON:0001969</td>\n",
       "      <td>blood plasma</td>\n",
       "      <td>9606</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>6809</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>SLM:000056524</td>\n",
       "      <td>Phosphatidylcholine (36:3)</td>\n",
       "      <td>UBERON:0001969</td>\n",
       "      <td>blood plasma</td>\n",
       "      <td>9606</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>6808</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>SLM:000056509</td>\n",
       "      <td>Phosphatidylcholine (34:2)</td>\n",
       "      <td>UBERON:0001969</td>\n",
       "      <td>blood plasma</td>\n",
       "      <td>9606</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>6805</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>934</th>\n",
       "      <td>SLM:000098542</td>\n",
       "      <td>Phosphatidylethanolamine (O-18:0/16:0)</td>\n",
       "      <td>UBERON:0000468</td>\n",
       "      <td>multi-cellular organism</td>\n",
       "      <td>6239</td>\n",
       "      <td>Caenorhabditis elegans</td>\n",
       "      <td>15918</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>935</th>\n",
       "      <td>SLM:000098543</td>\n",
       "      <td>Phosphatidylethanolamine (O-18:0/16:1)</td>\n",
       "      <td>UBERON:0000468</td>\n",
       "      <td>multi-cellular organism</td>\n",
       "      <td>6239</td>\n",
       "      <td>Caenorhabditis elegans</td>\n",
       "      <td>15917</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>936</th>\n",
       "      <td>SLM:000098546</td>\n",
       "      <td>Phosphatidylethanolamine (O-18:0/18:0)</td>\n",
       "      <td>UBERON:0000468</td>\n",
       "      <td>multi-cellular organism</td>\n",
       "      <td>6239</td>\n",
       "      <td>Caenorhabditis elegans</td>\n",
       "      <td>15916</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>937</th>\n",
       "      <td>SLM:000098549</td>\n",
       "      <td>Phosphatidylethanolamine (O-18:0/18:3)</td>\n",
       "      <td>UBERON:0000468</td>\n",
       "      <td>multi-cellular organism</td>\n",
       "      <td>6239</td>\n",
       "      <td>Caenorhabditis elegans</td>\n",
       "      <td>15913</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>938</th>\n",
       "      <td>SLM:000098557</td>\n",
       "      <td>Phosphatidylethanolamine (O-18:0/20:5)</td>\n",
       "      <td>UBERON:0000468</td>\n",
       "      <td>multi-cellular organism</td>\n",
       "      <td>6239</td>\n",
       "      <td>Caenorhabditis elegans</td>\n",
       "      <td>15910</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>939 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          Lipid ID                              Lipid name  Tissue/Cell ID  \\\n",
       "0    SLM:000056561              Phosphatidylcholine (40:6)  UBERON:0001969   \n",
       "1    SLM:000056510              Phosphatidylcholine (34:3)  UBERON:0001969   \n",
       "2    SLM:000056525              Phosphatidylcholine (36:4)  UBERON:0001969   \n",
       "3    SLM:000056524              Phosphatidylcholine (36:3)  UBERON:0001969   \n",
       "4    SLM:000056509              Phosphatidylcholine (34:2)  UBERON:0001969   \n",
       "..             ...                                     ...             ...   \n",
       "934  SLM:000098542  Phosphatidylethanolamine (O-18:0/16:0)  UBERON:0000468   \n",
       "935  SLM:000098543  Phosphatidylethanolamine (O-18:0/16:1)  UBERON:0000468   \n",
       "936  SLM:000098546  Phosphatidylethanolamine (O-18:0/18:0)  UBERON:0000468   \n",
       "937  SLM:000098549  Phosphatidylethanolamine (O-18:0/18:3)  UBERON:0000468   \n",
       "938  SLM:000098557  Phosphatidylethanolamine (O-18:0/20:5)  UBERON:0000468   \n",
       "\n",
       "            Tissue/Cell name  Taxon ID   Taxon scientific name Evidence tag ID  \n",
       "0               blood plasma      9606            Homo sapiens            6814  \n",
       "1               blood plasma      9606            Homo sapiens            6806  \n",
       "2               blood plasma      9606            Homo sapiens            6809  \n",
       "3               blood plasma      9606            Homo sapiens            6808  \n",
       "4               blood plasma      9606            Homo sapiens            6805  \n",
       "..                       ...       ...                     ...             ...  \n",
       "934  multi-cellular organism      6239  Caenorhabditis elegans           15918  \n",
       "935  multi-cellular organism      6239  Caenorhabditis elegans           15917  \n",
       "936  multi-cellular organism      6239  Caenorhabditis elegans           15916  \n",
       "937  multi-cellular organism      6239  Caenorhabditis elegans           15913  \n",
       "938  multi-cellular organism      6239  Caenorhabditis elegans           15910  \n",
       "\n",
       "[939 rows x 7 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "swisslipids.swisslipids_tissues(return_df = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### LIPID MAPS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "LIPID MAPS is an international non-profit consortium that develops and maintains standards and tools for lipid research. Currently `pypath` features a client for its Structure Database, called LMSD. Pypath uses the SDF format, which includes all fields available in the database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T19:47:28.860188Z",
     "start_time": "2024-10-06T19:47:28.845499Z"
    }
   },
   "outputs": [],
   "source": [
    "from pypath.inputs import lipidmaps"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When the function returns, the file is already downloaded and opened, but not parsed yet, hence the object reports 0 records:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T19:47:47.382607Z",
     "start_time": "2024-10-06T19:47:46.094224Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<SDF file `structures.sdf`: 0 records>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lmsd = lipidmaps.lmsd_sdf()\n",
    "lmsd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One option to retrieve the records is to simply iterate the object:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T19:51:42.042957Z",
     "start_time": "2024-10-06T19:51:42.038605Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': 'LMFA00000001',\n",
       " 'source': 'LIPID_MAPS_STRUCTURE_DATABASE',\n",
       " 'comment': '',\n",
       " 'mol': '',\n",
       " 'name': {'LM_ID': 'LMFA00000001',\n",
       "  'SYSTEMATIC_NAME': '2-methoxy-12-methyloctadec-17-en-5-ynoyl anhydride',\n",
       "  'FORMULA': 'C40H66O5',\n",
       "  'INCHI_KEY': 'VOGBKCAANIAXCI-UHFFFAOYSA-N',\n",
       "  'INCHI': 'InChI=1S/C40H66O5/c1-7-9-11-23-29-35(3)31-25-19-15-13-17-21-27-33-37(43-5)39(41)45-40(42)38(44-6)34-28-22-18-14-16-20-26-32-36(4)30-24-12-10-8-2/h7-8,35-38H,1-2,9-16,19-20,23-34H2,3-6H3',\n",
       "  'SMILES': 'C(C(OC)CCC#CCCCCCC(C)CCCCC=C)(=O)OC(C(OC)CCC#CCCCCCC(C)CCCCC=C)=O',\n",
       "  'ABBREVIATION': 'FA 40:7;O3',\n",
       "  'SYNONYMS': 'Acetylenic acids',\n",
       "  'PUBCHEM_CID': '10930192',\n",
       "  'CHEBI_ID': '178363'},\n",
       " 'annot': {'NAME': '2-methoxy-12-methyloctadec-17-en-5-ynoyl anhydride',\n",
       "  'CATEGORY': 'Fatty Acyls [FA]',\n",
       "  'MAIN_CLASS': 'Other Fatty Acyls [FA00]',\n",
       "  'EXACT_MASS': '626.491025'}}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for lipid in lmsd:\n",
    "    break\n",
    "lipid"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The same object is able to index the SDF file, and retrieve records on demand. The indexing covers all names, synonyms and identifiers used in the database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T19:54:26.241245Z",
     "start_time": "2024-10-06T19:54:01.933610Z"
    }
   },
   "outputs": [],
   "source": [
    "lmsd.index()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After indexing, the database shows its size:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T19:55:54.257578Z",
     "start_time": "2024-10-06T19:55:54.254127Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<SDF file `structures.sdf`: 48116 records>"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lmsd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T19:56:03.624333Z",
     "start_time": "2024-10-06T19:56:03.620934Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "48116"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(lmsd)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The records can be retrieved by any of their names or identifiers:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T19:54:52.568216Z",
     "start_time": "2024-10-06T19:54:52.557740Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[({'id': 'LMFA00000001',\n",
       "   'source': 'LIPID_MAPS_STRUCTURE_DATABASE',\n",
       "   'comment': '',\n",
       "   'mol': '',\n",
       "   'name': {'LM_ID': 'LMFA00000001',\n",
       "    'SYSTEMATIC_NAME': '2-methoxy-12-methyloctadec-17-en-5-ynoyl anhydride',\n",
       "    'FORMULA': 'C40H66O5',\n",
       "    'INCHI_KEY': 'VOGBKCAANIAXCI-UHFFFAOYSA-N',\n",
       "    'INCHI': 'InChI=1S/C40H66O5/c1-7-9-11-23-29-35(3)31-25-19-15-13-17-21-27-33-37(43-5)39(41)45-40(42)38(44-6)34-28-22-18-14-16-20-26-32-36(4)30-24-12-10-8-2/h7-8,35-38H,1-2,9-16,19-20,23-34H2,3-6H3',\n",
       "    'SMILES': 'C(C(OC)CCC#CCCCCCC(C)CCCCC=C)(=O)OC(C(OC)CCC#CCCCCCC(C)CCCCC=C)=O',\n",
       "    'ABBREVIATION': 'FA 40:7;O3',\n",
       "    'SYNONYMS': 'Acetylenic acids',\n",
       "    'PUBCHEM_CID': '10930192',\n",
       "    'CHEBI_ID': '178363'},\n",
       "   'annot': {'NAME': '2-methoxy-12-methyloctadec-17-en-5-ynoyl anhydride',\n",
       "    'CATEGORY': 'Fatty Acyls [FA]',\n",
       "    'MAIN_CLASS': 'Other Fatty Acyls [FA00]',\n",
       "    'EXACT_MASS': '626.491025'}},\n",
       "  0),\n",
       " ({'id': 'LMFA00000001',\n",
       "   'source': 'LIPID_MAPS_STRUCTURE_DATABASE',\n",
       "   'comment': '',\n",
       "   'mol':"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 1803 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "lmsd['LMFA00000001']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And it also supports the `in` operator:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T19:57:02.634280Z",
     "start_time": "2024-10-06T19:57:02.630626Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'PC(18:1/18:0)' in lmsd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T19:57:28.157218Z",
     "start_time": "2024-10-06T19:57:27.206190Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[({'id': 'LMGP01010888',\n",
       "   'source': 'LIPID_MAPS_STRUCTURE_DATABASE',\n",
       "   'comment': '',\n",
       "   'mol': '',\n",
       "   'name': {'LM_ID': 'LMGP01010888',\n",
       "    'SYSTEMATIC_NAME': '1-(9Z-octadecenoyl)-2-octadecanoyl-sn-glycero-3-phosphocholine',\n",
       "    'FORMULA': 'C44H86NO8P',\n",
       "    'INCHI_KEY': 'NMJCSTNQFYPVOR-VHONOUADSA-N',\n",
       "    'INCHI': 'InChI=1S/C44H86NO8P/c1-6-8-10-12-14-16-18-20-22-24-26-28-30-32-34-36-43(46)50-40-42(41-52-54(48,49)51-39-38-45(3,4)5)53-44(47)37-35-33-31-29-27-25-23-21-19-17-15-13-11-9-7-2/h20,22,42H,6-19,21,23-41H2,1-5H3/b22-20-/t42-/m1/s1',\n",
       "    'SMILES': '[C@](COP(=O)([O-])OCC[N+](C)(C)C)([H])(OC(CCCCCCCCCCCCCCCCC)=O)COC(CCCCCCC/C=C\\\\CCCCCCCC)=O',\n",
       "    'ABBREVIATION': 'PC 36:1',\n",
       "    'SYNONYMS': 'Choline phosphate, 3-ester with L-1-oleo-2-stearin; L-1-Oleoyl-2-stearoyl lecithin; L-1-Oleoyl-2-stearoyl-3-phosphatidylcholine; OSPC; PC(18:1/18:0); PC(36:1); PC(18:0_18:1)',\n",
       "    'PUBCHEM_CID': '24778936',\n",
       "    'HMDB_ID': 'HMDB0008102',\n",
       "    'CHEBI_ID': '76073',\n",
       "    'SWISSLIPIDS_ID': 'SLM:000012"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 2352 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "lmsd['PC(18:1/18:0)']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, the records can be loaded into memory, in this case their retrieval is faster:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T20:02:08.827985Z",
     "start_time": "2024-10-06T20:02:08.825444Z"
    }
   },
   "outputs": [],
   "source": [
    "lmsd.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-06T20:02:51.708427Z",
     "start_time": "2024-10-06T20:02:51.165683Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'id': 'LMGP01010888',\n",
       "  'source': 'LIPID_MAPS_STRUCTURE_DATABASE',\n",
       "  'comment': '',\n",
       "  'mol': '',\n",
       "  'name': {'LM_ID': 'LMGP01010888',\n",
       "   'SYSTEMATIC_NAME': '1-(9Z-octadecenoyl)-2-octadecanoyl-sn-glycero-3-phosphocholine',\n",
       "   'FORMULA': 'C44H86NO8P',\n",
       "   'INCHI_KEY': 'NMJCSTNQFYPVOR-VHONOUADSA-N',\n",
       "   'INCHI': 'InChI=1S/C44H86NO8P/c1-6-8-10-12-14-16-18-20-22-24-26-28-30-32-34-36-43(46)50-40-42(41-52-54(48,49)51-39-38-45(3,4)5)53-44(47)37-35-33-31-29-27-25-23-21-19-17-15-13-11-9-7-2/h20,22,42H,6-19,21,23-41H2,1-5H3/b22-20-/t42-/m1/s1',\n",
       "   'SMILES': '[C@](COP(=O)([O-])OCC[N+](C)(C)C)([H])(OC(CCCCCCCCCCCCCCCCC)=O)COC(CCCCCCC/C=C\\\\CCCCCCCC)=O',\n",
       "   'ABBREVIATION': 'PC 36:1',\n",
       "   'SYNONYMS': 'L-1-Oleoyl-2-stearoyl-3-phosphatidylcholine;PC(36:1);PC(18:0_18:1);PC(18:1/18:0);Choline phosphate, 3-ester with L-1-oleo-2-stearin;OSPC;L-1-Oleoyl-2-stearoyl lecithin',\n",
       "   'PUBCHEM_CID': '24778936',\n",
       "   'HMDB_ID': 'HMDB0008102',\n",
       "   'CHEBI_ID': '76073',\n",
       "   'SWISSLIPIDS_ID': 'SLM:000012332'},\n",
       "  'annot': {'NA"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 2290 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "lmsd['PC(18:1/18:0)']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### NCBI E-Utils\n",
    "\n",
    "The [ESummary endpoint](https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESummary) of the NCBI E-Utils API provides metadata about records in NCBI databases. A client to this API endpoint is available in the `pypath.inputs.eutils` module. The parameter `ids` can be one integer, or a list of integers or strings:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-14T22:43:56.559547Z",
     "start_time": "2023-11-14T22:43:56.545275Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'uids': ['6063'],\n",
       " '6063': {'uid': '6063',\n",
       "  'gds': '5',\n",
       "  'gpl': '13',\n",
       "  'erank': '8eSiQ',\n",
       "  'evalue': 'joAzE',\n",
       "  'title': 'Diurnal and circadian-regulated genes (I)',\n",
       "  'taxon': 'Arabidopsis thaliana',\n",
       "  'gdstype': 'Expression profiling by array',\n",
       "  'valtype': 'log ratio',\n",
       "  'idref': '6063',\n",
       "  'genename': '',\n",
       "  'genedesc': '',\n",
       "  'ugname': 'AT4G11560',\n",
       "  'ugdesc': 'Bromo-adjacent homology (BAH) domain-containing protein',\n",
       "  'nucdesc': '9366 Lambda-PRL2 Arabidopsis thaliana cDNA clone 135J10T7, mRNA sequence',\n",
       "  'entrez_gene_id': '',\n",
       "  'gbacc': 'T46103',\n",
       "  'ptacc': '',\n",
       "  'cloneid': '135J10T7',\n",
       "  'orf': '',\n",
       "  'spotid': '',\n",
       "  'vmin': '-0.395000',\n",
       "  'vmax': '0.201000',\n",
       "  'groups': 'A1B3C1',\n",
       "  'abscall': '',\n",
       "  'aflag': 20,\n",
       "  'aoutl': '',\n",
       "  'rstd': 31,\n",
       "  'rmean': 50}}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.inputs import eutils\n",
    "\n",
    "eutils.esummary(ids = 6063, db = 'geoprofiles')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A simple wrapper for PubMed is available in the `pypath.inputs.pubmed` module:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-14T22:42:02.366626Z",
     "start_time": "2023-11-14T22:42:02.310376Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'uids': ['33209674'],\n",
       " '33209674': {'uid': '33209674',\n",
       "  'pubdate': '2020 Oct',\n",
       "  'epubdate': '',\n",
       "  'source': 'Transl Androl Urol',\n",
       "  'authors': [{'name': 'Kim H', 'authtype': 'Author', 'clusterid': ''},\n",
       "   {'name': 'Lee SH', 'authtype': 'Author', 'clusterid': ''},\n",
       "   {'name': 'Kim DH', 'authtype': 'Author', 'clusterid': ''},\n",
       "   {'name': 'Lee JY', 'authtype': 'Author', 'clusterid': ''},\n",
       "   {'name': 'Hong SH', 'authtype': 'Author', 'clusterid': ''},\n",
       "   {'name': 'Ha US', 'authtype': 'Author', 'clusterid': ''},\n",
       "   {'name': 'Kim IH', 'authtype': 'Author', 'clusterid': ''}],\n",
       "  'lastauthor': 'Kim IH',\n",
       "  'title': 'Gemcitabine maintenance versus observation after first-line chemotherapy in patients with metastatic urothelial carcinoma: a retrospective study.',\n",
       "  'sorttitle': 'gemcitabine maintenance versus observation after first line chemotherapy in patients with metastatic urothelial carcinoma a retrospective study',\n",
       "  'volume': '9',\n",
       "  'issue': '5',\n",
       "  'pages': '2113-2121',\n",
       "  'lang': ['eng']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 2263 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from pypath.inputs import pubmed\n",
    "\n",
    "pubmed.get_pubmeds('33209674')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One last example, querying the Entrez Gene database:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-14T22:48:09.354813Z",
     "start_time": "2023-11-14T22:48:09.338962Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'uids': ['1956'],\n",
       " '1956': {'uid': '1956',\n",
       "  'name': 'EGFR',\n",
       "  'description': 'epidermal growth factor receptor',\n",
       "  'status': '',\n",
       "  'currentid': '',\n",
       "  'chromosome': '7',\n",
       "  'geneticsource': 'genomic',\n",
       "  'maplocation': '7p11.2',\n",
       "  'otheraliases': 'ERBB, ERBB1, ERRP, HER1, NISBD2, PIG61, mENA',\n",
       "  'otherdesignations': 'epidermal growth factor receptor|EGFR vIII|avian erythroblastic leukemia viral (v-erb-b) oncogene homolog|cell growth inhibiting protein 40|cell proliferation-inducing protein 61|epidermal growth factor receptor tyrosine kinase domain|erb-b2 receptor tyrosine kinase 1|proto-oncogene c-ErbB-1|receptor tyrosine-protein kinase erbB-1',\n",
       "  'nomenclaturesymbol': 'EGFR',\n",
       "  'nomenclaturename': 'epidermal growth factor receptor',\n",
       "  'nomenclaturestatus': 'Official',\n",
       "  'mim': ['131550'],\n",
       "  'genomicinfo': [{'chrloc': '7',\n",
       "    'chraccver': 'NC_000007.14',\n",
       "    'chrstart': 55019016,\n",
       "    'chrstop': 55211627,\n",
       "    'exoncount': 32}],\n",
       "  'geneweight': 580393,\n",
       "  'summary': 'The protein encoded b"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 5417 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from pypath.inputs import eutils\n",
    "\n",
    "eutils.esummary(ids = 1956, db = 'gene')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download management\n",
    "\n",
    "### Cache management and customization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `pypath.omnipath.app` saves the databases to pickle dumps by default under the `~/.pypath/pickles/` directory and after the first build loads them from there. The very first build of each database might take quite long time (up to >90 min in case of the OmniPath network or annotations) because of the large number of downloads. Subsequent builds will be much faster because `pypath` stores all the downloaded data in a local cache and downloads again only upon request from the user. Loading the databases from pickle dumps takes only seconds. However if you want to build with different settings you should be aware to set up a different cache file name."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download failures\n",
    "\n",
    "Issuing hundreds of requests to dozens of servers sooner or later comes with failures. These might happen just by accident, especially on slow networks, it is always recommended to try again. The \n",
    "\n",
    "#### Corrupted cache content\n",
    "\n",
    "Sometimes a truncated or corrupted file remains in the cache, in this case you can use the context managers in ``pypath.share.curl`` to control the cache. E.g. if the download of the *DEPOD* database failed and keeps failing due to a corrupted file, use the ``cache_delete_on`` context:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:59:07.927339Z",
     "start_time": "2022-12-02T13:59:02.319604Z"
    }
   },
   "outputs": [],
   "source": [
    "from pypath.share import curl\n",
    "from pypath.inputs import depod\n",
    "\n",
    "with curl.cache_delete_on():\n",
    "    depod = depod.depod_enzyme_substrate()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The ``cache_off`` context forces download even if a cache item is available; the ``cache_print_on`` context prints paths to the accessed cache files to the terminal, though the paths can always be found in the log; the ``dry_run_on`` context sets up the ``pypath.share.curl.Curl`` object and stops just before the actual download.\n",
    "\n",
    "#### Network communication issues: look into the curl debug log\n",
    "\n",
    "Downloads might fail also due to TLS or HTTP errors, wrong headers or parameters, and many other reasons. In this case a full debug output from `curl` might be very useful. The ``debug_on`` context writes curl debug into the logfile:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:59:12.632324Z",
     "start_time": "2022-12-02T13:59:12.592543Z"
    }
   },
   "outputs": [],
   "source": [
    "from pypath.share import curl\n",
    "from pypath.inputs import depod\n",
    "\n",
    "with curl.debug_on():\n",
    "    depod = depod.depod_enzyme_substrate()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Timeouts\n",
    "\n",
    "From the log we can find out if the download fails due to a timeout. In this case, the timeout parameters can be altered by a settings context. Apart from a timeout for the completion of the download, there is ``curl_connect_timeout`` (timeout for establishing connection to the server), and ``curl_extended_timeout``, that is used for servers that are known to be exceptionally slow. Another parameter, ``curl_retries`` is the number of attempts before giving up. By default it's 3, and that should be more than enough."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:59:17.428379Z",
     "start_time": "2022-12-02T13:59:17.392291Z"
    }
   },
   "outputs": [],
   "source": [
    "from pypath.share import settings\n",
    "from pypath.inputs import depod\n",
    "\n",
    "with settings.context(curl_timeout = 360):\n",
    "    depod = depod.depod_enzyme_substrate()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Access and inspect the ``Curl`` object\n",
    "\n",
    "Often the ``Curl`` object is created in a function from the ``pypath.inputs`` module, deep in a call stack, hence accessing it for investigation is difficult. Using the ``preserve_on`` context, the last ``Curl`` instance is kept under the ``pypath.share.curl.LASTCURL`` attribute:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:59:24.683382Z",
     "start_time": "2022-12-02T13:59:24.640968Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pypath.share.curl.Curl at 0x6947386dc8b0>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.share import curl\n",
    "from pypath.inputs import depod\n",
    "\n",
    "with curl.preserve_on():\n",
    "    depod = depod.depod_enzyme_substrate()\n",
    "\n",
    "depod_curl = curl.LASTCURL\n",
    "depod_curl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:59:28.805129Z",
     "start_time": "2022-12-02T13:59:28.798642Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('http://depod.bioss.uni-freiburg.de/download/DEPOD_201405_human_phosphatase-substrate.mitab',\n",
       " [],\n",
       " <_io.TextIOWrapper name='/home/denes/.pypath/cache/6a711369ecf9dcff8c5ed88996685b54-DEPOD_201405_human_phosphatase-substrate.mitab' mode='r' encoding='iso-8859-1'>,\n",
       " 0)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "depod_curl.url, depod_curl.req_headers, depod_curl.fileobj, depod_curl.status"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Is it failing only for you?\n",
    "\n",
    "Okay, this is the one you should check first: we run almost all downloads in ``pypath`` daily, and you can always check [in the report](https://status.omnipathdb.org/) whether a particular function ran successfully last night on our server. If it also fails in our daily build, it can still be a transient error that disappears within a few days, or it can be a permanent error. In the latter case, we first try to fix the issue in pypath (maybe the behaviour or the address of the third party server has changed). If we have no way to fix it, we start [hosting the data on our own server](https://rescued.omnipathdb.org/) and make pypath download it from there."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Read the log\n",
    "\n",
    "Above we mentioned the pypath log a lot. Here is how to access the log; see more details in the section about logging:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:59:34.574666Z",
     "start_time": "2022-12-02T13:59:34.563630Z"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2022-12-02 14:57:09] Welcome!\n",
      "[2022-12-02 14:57:09] Logger started, logging into `/home/denes/pypath/notebooks/pypath_log/pypath-s3e92.log`.\n",
      "[2022-12-02 14:57:09] Session `s3e92` started.\n",
      "[2022-12-02 14:57:09] [pypath] \n",
      "\t- session ID: `s3e92`\n",
      "\t- working directory: `/home/denes/pypath/notebooks`\n",
      "\t- logfile: `/home/denes/pypath/notebooks/pypath_log/pypath-s3e92.log`\n",
      "\t- pypath version: 0.14.30\n",
      "[2022-12-02 14:57:09] [curl] Creating Curl object to retrieve data from `https://www.ensembl.org/info/about/species.html`\n",
      "[2022-12-02 14:57:09] [curl] Cache file path: `/home/denes/.pypath/cache/535b06d53a59e75bb693369bc5fdc556-species.html`\n",
      "[2022-12-02 14:57:09] [curl] Cache file found, no need for download.\n",
      "[2022-12-02 14:57:09] [curl] Opening plain text file `/home/denes/.pypath/cache/535b06d53a59e75bb693369bc5fdc556-species.html`.\n",
      "[2022-12-02 14:57:09] [curl] Contents of `/home/denes/.pypath/cache/535b06d53a59e75bb693369bc5fdc556-species.html` has been read and the file has been closed.\n",
      "[2022-1"
     ]
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 112963 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import pypath\n",
    "pypath.log()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### TLS (SSL, HTTPS) errors\n",
    "\n",
    "Failed to verify certificate, invalid, expired, self-signed, missing certificates. These might be the most common reasons why people open issues for our software. TLS is a method for encrypted, typically HTTP, communication. The server has a certificate and uses it to sign and encrypt the data before sending it to the client. The client trusts the server certificate because it is signed by another certificate. And that is signed by another one, and so on, until we reach a so-called root certificate that is known and trusted by the client. The number of root certificates used globally is so small that every single computer stores them locally and updates them from time to time from trusted sources, such as the provider of the operating system, web browser or programming language. Having an up-to-date certificate store and correctly configured TLS clients on your computer is your (or your system admin's) duty; here we can only give a generic procedure to address these issues. In 97% of the cases the issue is in your computer, but sometimes the server might be responsible. If you experience a TLS issue:\n",
    "\n",
    "- Check the status of the server: initiate a scan at a free TLS checking service, such as [SSL Labs](https://www.ssllabs.com/ssltest/): look for any issue with the certificate chain, such as missing or expired certificates, old or too new ciphers not supported by your client, etc.\n",
    "- Identify the server that your client failed to establish a TLS connection to (in case of `pypath`, look into the log)\n",
    "- Identify your software that contains the TLS client: in case of `pypath`, it uses `pycurl`, a Python module built on `libcurl`\n",
    "- Identify the provider of the client software: it can be PyPI, Anaconda, your operating system, etc.\n",
    "- Find out which certificate store that software uses: most of them use the store from your operating system, but for example Java or Mozilla Firefox come with their own certificates\n",
    "- Check if the certificate store is up-to-date, update if necessary\n",
    "- Alternatively, identify the missing root certificate and add it manually to the store; you can also add a non-root certificate if the server has a serious issue and the chain cannot be followed up to a valid root certificate\n",
    "\n",
    "Please open TLS related issues for our software only if you\n",
    "\n",
    "- Experience a server side issue with omnipathdb.org\n",
    "- Have a strong reason to think the cause is in the code written by us, or that the issue can be easily fixed within our code"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Resources"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T14:27:45.625762Z",
     "start_time": "2022-12-03T14:27:45.604872Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pypath.resources.controller.ResourceController at 0x6cc25e25dcf0>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import resources\n",
    "rc = resources.get_controller()\n",
    "rc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Licenses\n",
    "\n",
    "The license of SIGNOR is CC BY-SA, it allows commercial (for-profit) use:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T14:27:47.473191Z",
     "start_time": "2022-12-03T14:27:47.467813Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(<License CC BY-SA 4.0>, True)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rc.license('SIGNOR'), rc.license('SIGNOR').commercial"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Example: build a network for commercial use\n",
    "\n",
    "For our users, the most important aspect of licenses is whether they allow for-profit use in companies. In the near future we intend to provide a more convenient interface for license options; until then, see the example below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T18:45:22.673540Z",
     "start_time": "2023-03-10T18:45:22.666877Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(24,\n",
       " 19,\n",
       " {<NetworkResource: Baccin2019 (post_translational, activity_flow)>,\n",
       "  <NetworkResource: Cellinker (post_translational, activity_flow)>,\n",
       "  <NetworkResource: HPMR (post_translational, activity_flow)>,\n",
       "  <NetworkResource: PDZBase (post_translational, activity_flow)>,\n",
       "  <NetworkResource: TRIP (post_translational, activity_flow)>})"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.core import network\n",
    "from pypath import resources\n",
    "\n",
    "co = resources.get_controller()\n",
    "pw_academic = co.collect_network('pathway')\n",
    "pw_commercial = co.collect_network('pathway', license_purpose = 'commercial')\n",
    "\n",
    "len(pw_academic), len(pw_commercial), set(pw_academic.values()) - set(pw_commercial.values())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Above we see that five resources have been disabled by applying the for-profit licensing restriction. The licenses of those five resources:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T18:48:02.724885Z",
     "start_time": "2023-03-10T18:48:02.719589Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<License CC BY-NC-SA 3.0>,\n",
       " <License No license>,\n",
       " <License CC BY-NC 4.0>,\n",
       " <License CC BY-NC 4.0>,\n",
       " <License CC BY-NC 4.0>]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[r.license for r in set(pw_academic.values()) - set(pw_commercial.values())]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The licenses of the resources that allow for-profit use:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T18:50:35.372347Z",
     "start_time": "2023-03-10T18:50:35.368225Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<License CC BY 4.0>,\n",
       " <License CC BY-SA 3.0>,\n",
       " <License CC BY-SA 3.0>,\n",
       " <License CC BY 4.0>,\n",
       " <License CC BY-SA 3.0>,\n",
       " <License CC BY-SA 3.0>,\n",
       " <License CC BY 4.0>,\n",
       " <License NAR Open Access>,\n",
       " <License CC BY-SA 4.0>,\n",
       " <License CC BY 4.0>,\n",
       " <License GPLv3>,\n",
       " <License GPLv3>,\n",
       " <License GPLv3>,\n",
       " <License MIT>,\n",
       " <License GPLv3>,\n",
       " <License MIT>,\n",
       " <License MIT>,\n",
       " <License CC BY 4.0>,\n",
       " <License GPLv3>]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[r.license for r in pw_commercial.values()]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Taking a closer look at a non-profit license:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T18:54:45.204957Z",
     "start_time": "2023-03-10T18:54:45.200079Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(<License purpose: academic>, False)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "license = pw_academic['trip'].license\n",
    "license.purpose, license.purpose.enables('for-profit')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The collected resources can be used directly to build databases, in this case a network database:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T18:57:02.979343Z",
     "start_time": "2023-03-10T18:56:00.192561Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(<Network: 6833 nodes, 25607 interactions>,\n",
       " <Network: 6429 nodes, 23288 interactions>)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "net_academic = network.Network(pw_academic)\n",
    "net_commercial = network.Network(pw_commercial)\n",
    "net_academic, net_commercial"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As we see, the for-profit usable network is smaller by about 400 nodes and 2,300 edges, and it might miss even more of the fine-grained details, but likely it is suitable for analysis. No legal expert here, but some thoughts about licenses: even if you work for a company, you might download and explore data under any license; the restrictions apply if you start to actually use the resource. Even if some resources restrict commercial use, you can always contact the copyright owners and ask them for permission, or ask your company to pay them a licensing fee, so you can legally use their product."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Resource information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T14:27:49.651220Z",
     "start_time": "2022-12-03T14:27:49.639678Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'yearUsedRelease': 2015,\n",
       " 'releases': [2009, 2011, 2015],\n",
       " 'urls': {'articles': ['http://bioinformatics.oxfordjournals.org/content/25/5/690.long',\n",
       "   'http://nar.oxfordjournals.org/content/43/D1/D321.long',\n",
       "   'http://nar.oxfordjournals.org/content/39/suppl_1/D235.long'],\n",
       "  'webpages': ['http://matrixdb.univ-lyon1.fr/'],\n",
       "  'omictools': ['http://omictools.com/matrixdb-tool']},\n",
       " 'pubmeds': [19147664, 20852260, 25378329],\n",
       " 'taxons': ['mammalia'],\n",
       " 'annot': ['experiment'],\n",
       " 'recommend': ['small, literature curated interaction resource; many interactions for',\n",
       "  'receptors and extracellular proteins'],\n",
       " 'descriptions': ['Protein data were imported from the UniProtKB/Swiss-Prot database (Bairoch et',\n",
       "  'al., 2005) and identified by UniProtKB/SwissProt accession numbers. In order to',\n",
       "  'list all the partners of a protein, interactions are associated by default to the',\n",
       "  'accession number of the human protein. The actual source species used in experiments is',\n",
       "  'indicated in the page repor"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 4479 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "rc['MatrixDB']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Resource definitions for a certain database or dataset\n",
    "\n",
    "<div class=\"alert alert-block alert-success\"><b>Note:</b> This does not work yet for all databases and datasets, but likely in the near future this will be the preferred method to access resource definitions.</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T20:08:29.219348Z",
     "start_time": "2022-12-02T20:08:29.213904Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<EnzymeSubstrateResource: phosphoELM>,\n",
       " <EnzymeSubstrateResource: dbPTM>,\n",
       " <EnzymeSubstrateResource: SIGNOR>,\n",
       " <EnzymeSubstrateResource: HPRD>,\n",
       " <EnzymeSubstrateResource: Li2012>,\n",
       " <EnzymeSubstrateResource: DEPOD>,\n",
       " <EnzymeSubstrateResource: PhosphoSite>,\n",
       " <EnzymeSubstrateResource: PhosphoNetworks>,\n",
       " <EnzymeSubstrateResource: MIMP>,\n",
       " <EnzymeSubstrateResource: ProtMapper>,\n",
       " <EnzymeSubstrateResource: KEA>]"
      ]
     },
     "execution_count": 197,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rc.collect_enzyme_substrate()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The resource definitions carry all information necessary to load the resource, for example:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 202,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T20:09:51.277700Z",
     "start_time": "2022-12-02T20:09:51.271494Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('phosphoelm.phosphoelm_enzyme_substrate', 'uniprot')"
      ]
     },
     "execution_count": 202,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "phosphoelm = rc.collect_enzyme_substrate()[0]\n",
    "phosphoelm.input_method, phosphoelm.id_type_enzyme"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building networks <a class=\"anchor\" id=\"building-networks\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For this you will need the `Network` class from the `pypath.core.network` module, which takes care of building and querying the network. You also need the `pypath.resources.network` module, where you find a number of predefined input settings organized in larger categories (e.g. activity flow, enzyme-substrate, transcriptional regulation, etc). These input settings will tell `pypath` how to download and process the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:59:49.395955Z",
     "start_time": "2022-12-02T13:59:49.392650Z"
    }
   },
   "outputs": [],
   "source": [
    "from pypath.core import network\n",
    "from pypath.resources import network as netres"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For example the `netres.pathway` is a collection of databases which fit into the activity flow concept, i.e. one protein either stimulates or inhibits the other. It is a dictionary with names as keys and the input settings as values:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T13:59:52.071557Z",
     "start_time": "2022-12-02T13:59:52.066832Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'trip': <NetworkResource: TRIP (post_translational, activity_flow)>,\n",
       " 'spike': <NetworkResource: SPIKE (post_translational, activity_flow)>,\n",
       " 'signalink3': <NetworkResource: SignaLink3 (post_translational, activity_flow)>,\n",
       " 'guide2pharma': <NetworkResource: Guide2Pharma (post_translational, activity_flow)>,\n",
       " 'ca1': <NetworkResource: CA1 (post_translational, activity_flow)>,\n",
       " 'arn': <NetworkResource: ARN (post_translational, activity_flow)>,\n",
       " 'nrf2': <NetworkResource: NRF2ome (post_translational, activity_flow)>,\n",
       " 'macrophage': <NetworkResource: Macrophage (post_translational, activity_flow)>,\n",
       " 'death': <NetworkResource: DeathDomain (post_translational, activity_flow)>,\n",
       " 'pdz': <NetworkResource: PDZBase (post_translational, activity_flow)>,\n",
       " 'signor': <NetworkResource: SIGNOR (post_translational, activity_flow)>,\n",
       " 'adhesome': <NetworkResource: Adhesome (post_translational, activity_flow)>,\n",
       " 'icellnet': <NetworkResource: ICELLNET (post_translational, activity_flow)>,\n",
       " 'celltalkdb': <Net"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 1864 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "netres.pathway"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can pass such a dictionary to the `load` method of the `network.Network` object. Then it will download the data from the original sources, translate the identifiers and merge the networks. Pypath stores all downloaded data in a cache, by default `~/.pypath/cache` in your user's home directory. For this reason, when you load a resource for the first time it might take long, but next time it will be faster as the data will be fetched from the cache. First create a `pypath.core.network.Network` object, then build the network:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:00:36.477296Z",
     "start_time": "2022-12-02T14:00:03.572789Z"
    }
   },
   "outputs": [],
   "source": [
    "n = network.Network()\n",
    "n.load(netres.pathway)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:02:23.812998Z",
     "start_time": "2022-12-02T14:02:23.808181Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Network: 6833 nodes, 25607 interactions>"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can add more resource sets in a similar way:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:04:29.020210Z",
     "start_time": "2022-12-02T14:03:58.982627Z"
    }
   },
   "outputs": [],
   "source": [
    "n.load(netres.enzyme_substrate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:05:38.101342Z",
     "start_time": "2022-12-02T14:05:38.096034Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Network: 7979 nodes, 35550 interactions>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To load one single resource simply pass the `NetworkResource` directly:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:05:42.298282Z",
     "start_time": "2022-12-02T14:05:42.128508Z"
    }
   },
   "outputs": [],
   "source": [
    "n.load(netres.interaction['matrixdb'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:05:44.218838Z",
     "start_time": "2022-12-02T14:05:44.215190Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Network: 8002 nodes, 35748 interactions>"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Which network datasets are pre-defined in pypath? <a class=\"anchor\" id=\"network-resources\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can find all the pre-defined datasets in the ``pypath.resources.network`` module. This module is currently a wrapper around an older module, ``pypath.resources.data_formats``; the actual definitions are written in the latter. As we already mentioned above, the `pathway` dataset contains the literature curated activity flow resources. This was the original focus of pypath and OmniPath; however, since then we have added a great variety of other kinds of resource definitions. Here we give an overview of these."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* `pypath.resources.network.pathway`: activity flow networks with literature references\n",
    "* `pypath.resources.network.activity_flow`: synonym for `pathway`\n",
    "* `pypath.resources.network.pathway_noref`: activity flow networks without literature references\n",
    "* `pypath.resources.network.pathway_all`: all activity flow data\n",
    "* `pypath.resources.network.ptm`: enzyme-substrate interaction networks with literature references\n",
    "* `pypath.resources.network.enzyme_substrate`: synonym for `ptm`\n",
    "* `pypath.resources.network.ptm_noref`: enzyme-substrate networks without literature references\n",
    "* `pypath.resources.network.ptm_all`: all enzyme-substrate data\n",
    "* `pypath.resources.network.interaction`: undirected interactions from both literature curated and high-throughput collections (e.g. IntAct, BioGRID)\n",
    "* `pypath.resources.network.interaction_misc`: undirected, high-scale interaction networks without the constraint of having any literature reference (e.g. the unbiased human interactome screen from the Vidal lab)\n",
    "* `pypath.resources.network.transcription_onebyone`: transcriptional regulation databases (TF-target interactions) with all databases downloaded directly and processed by `pypath`\n",
    "* `pypath.resources.network.transcription`: transcriptional regulation only from the DoRothEA data\n",
    "* `pypath.resources.network.mirna_target`: miRNA-mRNA interactions from literature curated resources\n",
    "* `pypath.resources.network.tf_mirna`: transcriptional regulation of miRNA from literature curated resources\n",
    "* `pypath.resources.network.lncrna_protein`: lncRNA-protein interactions from literature curated datasets\n",
    "* `pypath.resources.network.ligand_receptor`: ligand-receptor interactions from both literature curated and other kinds of resources\n",
    "* `pypath.resources.network.pathwaycommons`: the PathwayCommons database\n",
    "* `pypath.resources.network.reaction`: process description databases; not guaranteed to work at this moment\n",
    "* `pypath.resources.network.reaction_misc`: alternative definitions to load process description databases; not guaranteed to work at this moment\n",
    "* `pypath.resources.network.small_molecule_protein`: signaling interactions between small molecules and proteins"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To see the list of the resources in a dataset, you can check the dict keys or the `name` attribute of each element:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:05:57.168867Z",
     "start_time": "2022-12-02T14:05:57.160847Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['trip', 'spike', 'signalink3', 'guide2pharma', 'ca1', 'arn', 'nrf2', 'macrophage', 'death', 'pdz', 'signor', 'adhesome', 'icellnet', 'celltalkdb', 'cellchatdb', 'connectomedb', 'talklr', 'cellinker', 'scconnect', 'hpmr', 'cellphonedb', 'ramilowski2015', 'lrdb', 'baccin2019'])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "netres.pathway.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:06:00.992851Z",
     "start_time": "2022-12-02T14:06:00.982125Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['TRIP',\n",
       " 'SPIKE',\n",
       " 'SignaLink3',\n",
       " 'Guide2Pharma',\n",
       " 'CA1',\n",
       " 'ARN',\n",
       " 'NRF2ome',\n",
       " 'Macrophage',\n",
       " 'DeathDomain',\n",
       " 'PDZBase',\n",
       " 'SIGNOR',\n",
       " 'Adhesome',\n",
       " 'ICELLNET',\n",
       " 'CellTalkDB',\n",
       " 'CellChatDB',\n",
       " 'connectomeDB2020',\n",
       " 'talklr',\n",
       " 'Cellinker',\n",
       " 'scConnect',\n",
       " 'HPMR',\n",
       " 'CellPhoneDB',\n",
       " 'Ramilowski2015',\n",
       " 'LRdb',\n",
       " 'Baccin2019']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[resource.name for resource in netres.pathway.values()]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The resource definitions above carry all the information about how to load the resource: which function to call, how to process the identifiers, references, directions, and all other attributes from the input. E.g. which column from SPIKE corresponds to the source node? Which identifier type is used? It is the second column, and it has gene symbols in it:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:06:07.285628Z",
     "start_time": "2022-12-02T14:06:07.277820Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1, 'genesymbol')"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "netres.pathway['spike'].networkinput.id_col_a, netres.pathway['spike'].networkinput.id_type_a"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### The `Network` object"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once you have built a network you can use it for various purposes and write your own scripts for further processing or analysis. Below we create a `Network` object and populate it with the `pathway` dataset.\n",
    "\n",
    "<div class=\"alert alert-block alert-warning\"><b>Warning:</b> it is recommended to access databases <a href=\"#nw-dbmanager\">by the manager</a>. Running the code below takes really long and does not save or reload the database, it builds a fresh copy each time.</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:15:48.138806Z",
     "start_time": "2022-12-02T14:15:12.068028Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Network: 6833 nodes, 25607 interactions>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.core import network\n",
    "from pypath.resources import network as netres\n",
    "\n",
    "n = network.Network()\n",
    "n.load(netres.pathway)\n",
    "n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Almost all data is stored in `Network.interactions`, a dict that maps pairs of nodes to their interactions:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:17:02.706809Z",
     "start_time": "2022-12-02T14:17:02.614361Z"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{(<Entity: TRPC1>,\n",
       "  <Entity: KCNMA1>): <Interaction: TRPC1 ============= KCNMA1 [Evidences: TRIP (2 references)]>,\n",
       " (<Entity: TRPC1>,\n",
       "  <Entity: PPP3CA>): <Interaction: TRPC1 ============= PPP3CA [Evidences: TRIP (1 references)]>,\n",
       " (<Entity: CALM2>,\n",
       "  <Entity: TRPC1>): <Interaction: CALM2 =======(-)==> TRPC1 [Evidences: TRIP (3 references)]>,\n",
       " (<Entity: CALM3>,\n",
       "  <Entity: TRPC1>): <Interaction: CALM3 =======(-)==> TRPC1 [Evidences: TRIP (3 references)]>,\n",
       " (<Entity: CALM1>,\n",
       "  <Entity: TRPC1>): <Interaction: CALM1 =======(-)==> TRPC1 [Evidences: TRIP (3 references)]>,\n",
       " (<Entity: CASP1>,\n",
       "  <Entity: TRPC1>): <Interaction: CASP1 ============= TRPC1 [Evidences: TRIP (1 references)]>,\n",
       " (<Entity: TRPC1>,\n",
       "  <Entity: CASP4>): <Interaction: TRPC1 ============= CASP4 [Evidences: TRIP (1 references)]>,\n",
       " (<Entity: TRPC1>,\n",
       "  <Entity: CACNA1C>): <Interaction: TRPC1 ============= CACNA1C [Evidences: TRIP (1 references)]>,\n",
       " (<Entity: TRPC1>,\n",
       "  <Entity: CAV1>): <Interaction: TRPC1 <=(+)======== CAV1 [Ev"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 118492 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "n.interactions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The dict under `Network.nodes` is kept in sync with the interactions, and facilitates node access. Keys are primary identifiers (for proteins UniProt IDs by default), values are `Entity` objects:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:06:21.933545Z",
     "start_time": "2022-12-02T14:06:21.903749Z"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'P48995': <Entity: TRPC1>,\n",
       " 'Q12791': <Entity: KCNMA1>,\n",
       " 'Q08209': <Entity: PPP3CA>,\n",
       " 'P0DP24': <Entity: CALM2>,\n",
       " 'P0DP25': <Entity: CALM3>,\n",
       " 'P0DP23': <Entity: CALM1>,\n",
       " 'P29466': <Entity: CASP1>,\n",
       " 'P49662': <Entity: CASP4>,\n",
       " 'Q13936': <Entity: CACNA1C>,\n",
       " 'Q03135': <Entity: CAV1>,\n",
       " 'P56539': <Entity: CAV3>,\n",
       " 'Q14247': <Entity: CTTN>,\n",
       " 'P14416': <Entity: DRD2>,\n",
       " 'P11532': <Entity: DMD>,\n",
       " 'P11362': <Entity: FGFR1>,\n",
       " 'Q02790': <Entity: FKBP4>,\n",
       " 'Q86YM7': <Entity: HOMER1>,\n",
       " 'Q9NSC5': <Entity: HOMER3>,\n",
       " 'Q99750': <Entity: MDFI>,\n",
       " 'Q14571': <Entity: ITPR2>,\n",
       " 'Q14573': <Entity: ITPR3>,\n",
       " 'P29966': <Entity: MARCKS>,\n",
       " 'Q13255': <Entity: GRM1>,\n",
       " 'P20591': <Entity: MX1>,\n",
       " 'P62166': <Entity: NCS1>,\n",
       " 'Q96D31': <Entity: ORAI1>,\n",
       " 'Q96SN7': <Entity: ORAI2>,\n",
       " 'Q9BRQ5': <Entity: ORAI3>,\n",
       " 'P11171': <Entity: EPB41>,\n",
       " 'P61586': <Entity: RHOA>,\n",
       " 'Q9Y225': <Entity: RNF24>,\n",
       " 'P21817': <Entity: RYR1>,\n",
       " 'P16615': <Entity: ATP2A2>,\n",
       " 'Q93084': <Entity: ATP2A3>,\n",
       " 'P60880': <Entity: SNAP25>,\n",
       " 'Q13586': <Entity: STI"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 30573 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "n.nodes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An interaction between a pair of entities can be accessed:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:06:27.158077Z",
     "start_time": "2022-12-02T14:06:27.148686Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Interaction: EGFR <=(+)======== EGF [Evidences: Baccin2019, CellTalkDB, Fantom5, Guide2Pharma, HPMR, HPRD, ICELLNET, LRdb, Ramilowski2015, SIGNOR, SPIKE, SignaLink3, cellsignal.com, connectomeDB2020 (17 references)]>"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n.interaction('EGF', 'EGFR')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Similarly, individual nodes can be looked up:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:06:29.156341Z",
     "start_time": "2022-12-02T14:06:29.150576Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Entity: EGFR>"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n.entity('EGFR')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Labels (gene symbols for proteins by default), identifiers (such as UniProt IDs) and `Entity` objects can be used to refer to nodes. Each node carries some basic information:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:06:32.405576Z",
     "start_time": "2022-12-02T14:06:32.400917Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('P00533', 'EGFR', 'protein', 'uniprot', 9606)"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "egfr = n.entity('EGFR')\n",
    "egfr.identifier, egfr.label, egfr.entity_type, egfr.id_type, egfr.taxon"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Interactions feature a number of methods to access various information, such as their types, direction, effect, resources, references, etc. The very same methods are also available for the whole network. Below we only show a few examples of these methods."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:06:34.678142Z",
     "start_time": "2022-12-02T14:06:34.673243Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Interaction: EGFR <=(+)======== EGF [Evidences: Baccin2019, CellTalkDB, Fantom5, Guide2Pharma, HPMR, HPRD, ICELLNET, LRdb, Ramilowski2015, SIGNOR, SPIKE, SignaLink3, cellsignal.com, connectomeDB2020 (17 references)]>"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ia = n.interaction('EGF', 'EGFR')\n",
    "ia"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:06:47.686986Z",
     "start_time": "2022-12-02T14:06:47.681539Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Baccin2019',\n",
       " 'CellTalkDB',\n",
       " 'HPMR',\n",
       " 'ICELLNET',\n",
       " 'LRdb',\n",
       " 'SIGNOR',\n",
       " 'SPIKE',\n",
       " 'SignaLink3',\n",
       " 'connectomeDB2020'}"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ia.get_resource_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:06:50.693823Z",
     "start_time": "2022-12-02T14:06:50.690237Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{<Reference: 10085134>,\n",
       " <Reference: 10209155>,\n",
       " <Reference: 10788520>,\n",
       " <Reference: 12093292>,\n",
       " <Reference: 12297050>,\n",
       " <Reference: 12620237>,\n",
       " <Reference: 12648462>,\n",
       " <Reference: 15620700>,\n",
       " <Reference: 16274239>,\n",
       " <Reference: 17145710>,\n",
       " <Reference: 19531499>,\n",
       " <Reference: 20458382>,\n",
       " <Reference: 21071413>,\n",
       " <Reference: 23331499>,\n",
       " <Reference: 3494473>,\n",
       " <Reference: 6289330>,\n",
       " <Reference: 8639530>}"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ia.get_references()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is a valid direction for this interaction:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:06:53.319799Z",
     "start_time": "2022-12-02T14:06:53.311258Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ia.get_direction(('EGF', 'EGFR'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The opposite direction is not supported by any of the resources:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:06:55.487576Z",
     "start_time": "2022-12-02T14:06:55.480603Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ia.get_direction(('EGFR', 'EGF'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "However, some resources provide no direction information; these are classified as *\"undirected\"*:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ia.get_direction('undirected')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can check which resources are those exactly:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:07:23.720763Z",
     "start_time": "2022-12-02T14:07:23.713450Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'HPMR', 'SPIKE'}"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ia.get_direction('undirected', sources = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Effect signs (stimulation, inhibition) are available in a similar way. The first of the two Boolean values means stimulation (activation), the second one inhibition."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:07:25.740206Z",
     "start_time": "2022-12-02T14:07:25.734527Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[True, False]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ia.get_sign(('EGF', 'EGFR'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Which resources support the effect signs:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:07:28.761822Z",
     "start_time": "2022-12-02T14:07:28.754876Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'SIGNOR', 'SPIKE', 'SignaLink3'}, set()]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ia.get_sign(('EGF', 'EGFR'), sources = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The names of many methods start with `get_...`, such as:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:07:30.422085Z",
     "start_time": "2022-12-02T14:07:30.417861Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'post_translational'}"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ia.get_interaction_types()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Others are named `..._by_...`; these combine two `get_...` methods:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:07:32.499528Z",
     "start_time": "2022-12-02T14:07:32.494759Z"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'ICELLNET': {<Reference: 8639530>},\n",
       " 'SIGNOR': {<Reference: 12297050>, <Reference: 12648462>},\n",
       " 'SignaLink3': {<Reference: 10085134>,\n",
       "  <Reference: 10209155>,\n",
       "  <Reference: 19531499>,\n",
       "  <Reference: 21071413>,\n",
       "  <Reference: 23331499>},\n",
       " 'Baccin2019': {<Reference: 10788520>,\n",
       "  <Reference: 12093292>,\n",
       "  <Reference: 12297050>,\n",
       "  <Reference: 12620237>,\n",
       "  <Reference: 15620700>,\n",
       "  <Reference: 16274239>,\n",
       "  <Reference: 6289330>},\n",
       " 'LRdb': {<Reference: 10788520>,\n",
       "  <Reference: 12093292>,\n",
       "  <Reference: 12297050>,\n",
       "  <Reference: 12620237>,\n",
       "  <Reference: 15620700>,\n",
       "  <Reference: 16274239>,\n",
       "  <Reference: 6289330>},\n",
       " 'SPIKE': {<Reference: 12297050>,\n",
       "  <Reference: 17145710>,\n",
       "  <Reference: 20458382>,\n",
       "  <Reference: 3494473>},\n",
       " 'CellTalkDB': {<Reference: 12093292>},\n",
       " 'connectomeDB2020': {<Reference: 10788520>,\n",
       "  <Reference: 12093292>,\n",
       "  <Reference: 12297050>,\n",
       "  <Reference: 12620237>,\n",
       "  <Reference: 15620700>,\n",
       "  <Reference: 16274239>,\n",
       "  <Reference: 6289330>},\n",
       " 'HPMR': {<Reference: 6289330>}}"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ia.references_by_resource()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "All these methods accept the same filtering parameters. E.g. if you are interested only in certain resources, it's possible to restrict the query to those. For example, the two resources below do not provide a positive sign for this interaction:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:07:39.803804Z",
     "start_time": "2022-12-02T14:07:39.799680Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "()"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ia.get_interactions_positive(resources = {'ICELLNET', 'HPMR'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "While some other resources do:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:07:42.299260Z",
     "start_time": "2022-12-02T14:07:42.295822Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((<Entity: EGF>, <Entity: EGFR>),)"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ia.get_interactions_positive(resources = {'SignaLink3'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Or see the references that do or do not provide an effect sign:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:07:44.630204Z",
     "start_time": "2022-12-02T14:07:44.624656Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "({<Reference: 10085134>,\n",
       "  <Reference: 10209155>,\n",
       "  <Reference: 12297050>,\n",
       "  <Reference: 12648462>,\n",
       "  <Reference: 19531499>,\n",
       "  <Reference: 20458382>,\n",
       "  <Reference: 21071413>,\n",
       "  <Reference: 23331499>},\n",
       " {<Reference: 10085134>,\n",
       "  <Reference: 10209155>,\n",
       "  <Reference: 10788520>,\n",
       "  <Reference: 12093292>,\n",
       "  <Reference: 12297050>,\n",
       "  <Reference: 12620237>,\n",
       "  <Reference: 12648462>,\n",
       "  <Reference: 15620700>,\n",
       "  <Reference: 16274239>,\n",
       "  <Reference: 17145710>,\n",
       "  <Reference: 19531499>,\n",
       "  <Reference: 20458382>,\n",
       "  <Reference: 21071413>,\n",
       "  <Reference: 23331499>,\n",
       "  <Reference: 3494473>,\n",
       "  <Reference: 6289330>,\n",
       "  <Reference: 8639530>})"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ia.get_references(effect = True), ia.get_references(effect = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Network in *pandas.DataFrame* <a class=\"anchor\" id=\"network-pandas\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Contents of a `pypath.core.network.Network` object can be exported to a `pandas.DataFrame`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T15:24:19.479911Z",
     "start_time": "2022-12-03T15:23:56.070917Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id_a</th>\n",
       "      <th>id_b</th>\n",
       "      <th>type_a</th>\n",
       "      <th>type_b</th>\n",
       "      <th>directed</th>\n",
       "      <th>effect</th>\n",
       "      <th>type</th>\n",
       "      <th>dmodel</th>\n",
       "      <th>sources</th>\n",
       "      <th>references</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>P48995</td>\n",
       "      <td>Q12791</td>\n",
       "      <td>protein</td>\n",
       "      <td>protein</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>post_translational</td>\n",
       "      <td>{activity_flow}</td>\n",
       "      <td>{TRIP}</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>P48995</td>\n",
       "      <td>Q08209</td>\n",
       "      <td>protein</td>\n",
       "      <td>protein</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>post_translational</td>\n",
       "      <td>{activity_flow}</td>\n",
       "      <td>{TRIP}</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>P0DP23</td>\n",
       "      <td>P48995</td>\n",
       "      <td>protein</td>\n",
       "      <td>protein</td>\n",
       "      <td>True</td>\n",
       "      <td>-1</td>\n",
       "      <td>post_translational</td>\n",
       "      <td>{activity_flow}</td>\n",
       "      <td>{TRIP}</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>P0DP25</td>\n",
       "      <td>P48995</td>\n",
       "      <td>protein</td>\n",
       "      <td>protein</td>\n",
       "      <td>True</td>\n",
       "      <td>-1</td>\n",
       "      <td>post_translational</td>\n",
       "      <td>{activity_flow}</td>\n",
       "      <td>{TRIP}</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>P0DP24</td>\n",
       "      <td>P48995</td>\n",
       "      <td>protein</td>\n",
       "      <td>protein</td>\n",
       "      <td>True</td>\n",
       "      <td>-1</td>\n",
       "      <td>post_translational</td>\n",
       "      <td>{activity_flow}</td>\n",
       "      <td>{TRIP}</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44033</th>\n",
       "      <td>Q14289</td>\n",
       "      <td>Q9ULZ3</td>\n",
       "      <td>protein</td>\n",
       "      <td>protein</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>post_translational</td>\n",
       "      <td>{enzyme_substrate}</td>\n",
       "      <td>{iPTMnet}</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44034</th>\n",
       "      <td>P54646</td>\n",
       "      <td>Q9Y2I7</td>\n",
       "      <td>protein</td>\n",
       "      <td>protein</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>post_translational</td>\n",
       "      <td>{enzyme_substrate}</td>\n",
       "      <td>{iPTMnet}</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44035</th>\n",
       "      <td>Q9BXM7</td>\n",
       "      <td>Q9Y2N7</td>\n",
       "      <td>protein</td>\n",
       "      <td>protein</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>post_translational</td>\n",
       "      <td>{enzyme_substrate}</td>\n",
       "      <td>{iPTMnet}</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44036</th>\n",
       "      <td>P49137</td>\n",
       "      <td>Q9Y385</td>\n",
       "      <td>protein</td>\n",
       "      <td>protein</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>post_translational</td>\n",
       "      <td>{enzyme_substrate}</td>\n",
       "      <td>{iPTMnet}</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44037</th>\n",
       "      <td>Q9UHC7</td>\n",
       "      <td>P04637</td>\n",
       "      <td>protein</td>\n",
       "      <td>protein</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>post_translational</td>\n",
       "      <td>{enzyme_substrate}</td>\n",
       "      <td>{iPTMnet}</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>44038 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         id_a    id_b   type_a   type_b  directed  effect                type  \\\n",
       "0      P48995  Q12791  protein  protein     False       0  post_translational   \n",
       "1      P48995  Q08209  protein  protein     False       0  post_translational   \n",
       "2      P0DP23  P48995  protein  protein      True      -1  post_translational   \n",
       "3      P0DP25  P48995  protein  protein      True      -1  post_translational   \n",
       "4      P0DP24  P48995  protein  protein      True      -1  post_translational   \n",
       "...       ...     ...      ...      ...       ...     ...                 ...   \n",
       "44033  Q14289  Q9ULZ3  protein  protein      True       0  post_translational   \n",
       "44034  P54646  Q9Y2I7  protein  protein      True       0  post_translational   \n",
       "44035  Q9BXM7  Q9Y2N7  protein  protein      True       0  post_translational   \n",
       "44036  P49137  Q9Y385  protein  protein      True       0  post_translational   \n",
       "44037  Q9UHC7  P04637  protein  protein      True       0  post_translational   \n",
       "\n",
       "                   dmodel    sources references  \n",
       "0         {activity_flow}     {TRIP}        NaN  \n",
       "1         {activity_flow}     {TRIP}        NaN  \n",
       "2         {activity_flow}     {TRIP}        NaN  \n",
       "3         {activity_flow}     {TRIP}        NaN  \n",
       "4         {activity_flow}     {TRIP}        NaN  \n",
       "...                   ...        ...        ...  \n",
       "44033  {enzyme_substrate}  {iPTMnet}        NaN  \n",
       "44034  {enzyme_substrate}  {iPTMnet}        NaN  \n",
       "44035  {enzyme_substrate}  {iPTMnet}        NaN  \n",
       "44036  {enzyme_substrate}  {iPTMnet}        NaN  \n",
       "44037  {enzyme_substrate}  {iPTMnet}        NaN  \n",
       "\n",
       "[44038 rows x 10 columns]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "cu = omnipath.db.get_db('curated')\n",
    "cu.make_df()\n",
    "cu.df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `pypath.omnipath.export` module provides independent and more flexible interfaces for building network data frames. These are also used for building the tables served by the web server."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T19:20:12.602863Z",
     "start_time": "2023-03-10T19:19:49.951634Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source</th>\n",
       "      <th>target</th>\n",
       "      <th>source_genesymbol</th>\n",
       "      <th>target_genesymbol</th>\n",
       "      <th>is_directed</th>\n",
       "      <th>is_stimulation</th>\n",
       "      <th>is_inhibition</th>\n",
       "      <th>consensus_direction</th>\n",
       "      <th>consensus_stimulation</th>\n",
       "      <th>consensus_inhibition</th>\n",
       "      <th>sources</th>\n",
       "      <th>references</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>P48995</td>\n",
       "      <td>Q12791</td>\n",
       "      <td>TRPC1</td>\n",
       "      <td>KCNMA1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>TRIP</td>\n",
       "      <td>TRIP:19168436;TRIP:25139746</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>P48995</td>\n",
       "      <td>Q08209</td>\n",
       "      <td>TRPC1</td>\n",
       "      <td>PPP3CA</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>TRIP</td>\n",
       "      <td>TRIP:23228564</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>P0DP23</td>\n",
       "      <td>P48995</td>\n",
       "      <td>CALM1</td>\n",
       "      <td>TRPC1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>TRIP</td>\n",
       "      <td>TRIP:11290752;TRIP:11983166;TRIP:12601176</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>P0DP25</td>\n",
       "      <td>P48995</td>\n",
       "      <td>CALM3</td>\n",
       "      <td>TRPC1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>TRIP</td>\n",
       "      <td>TRIP:11290752;TRIP:11983166;TRIP:12601176</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>P0DP24</td>\n",
       "      <td>P48995</td>\n",
       "      <td>CALM2</td>\n",
       "      <td>TRPC1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>TRIP</td>\n",
       "      <td>TRIP:11290752;TRIP:11983166;TRIP:12601176</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36729</th>\n",
       "      <td>Q14289</td>\n",
       "      <td>Q9ULZ3</td>\n",
       "      <td>PTK2B</td>\n",
       "      <td>PYCARD</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>iPTMnet</td>\n",
       "      <td>iPTMnet:27796369</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36730</th>\n",
       "      <td>P54646</td>\n",
       "      <td>Q9Y2I7</td>\n",
       "      <td>PRKAA2</td>\n",
       "      <td>PIKFYVE</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>iPTMnet</td>\n",
       "      <td>iPTMnet:24070423</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36731</th>\n",
       "      <td>Q9BXM7</td>\n",
       "      <td>Q9Y2N7</td>\n",
       "      <td>PINK1</td>\n",
       "      <td>HIF3A</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>iPTMnet</td>\n",
       "      <td>iPTMnet:27551449</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36732</th>\n",
       "      <td>P49137</td>\n",
       "      <td>Q9Y385</td>\n",
       "      <td>MAPKAPK2</td>\n",
       "      <td>UBE2J1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>iPTMnet</td>\n",
       "      <td>iPTMnet:24020373</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36733</th>\n",
       "      <td>Q9UHC7</td>\n",
       "      <td>P04637</td>\n",
       "      <td>MKRN1</td>\n",
       "      <td>TP53</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>iPTMnet</td>\n",
       "      <td>iPTMnet:19536131</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>36734 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       source  target source_genesymbol target_genesymbol  is_directed  \\\n",
       "0      P48995  Q12791             TRPC1            KCNMA1            0   \n",
       "1      P48995  Q08209             TRPC1            PPP3CA            0   \n",
       "2      P0DP23  P48995             CALM1             TRPC1            1   \n",
       "3      P0DP25  P48995             CALM3             TRPC1            1   \n",
       "4      P0DP24  P48995             CALM2             TRPC1            1   \n",
       "...       ...     ...               ...               ...          ...   \n",
       "36729  Q14289  Q9ULZ3             PTK2B            PYCARD            1   \n",
       "36730  P54646  Q9Y2I7            PRKAA2           PIKFYVE            1   \n",
       "36731  Q9BXM7  Q9Y2N7             PINK1             HIF3A            1   \n",
       "36732  P49137  Q9Y385          MAPKAPK2            UBE2J1            1   \n",
       "36733  Q9UHC7  P04637             MKRN1              TP53            1   \n",
       "\n",
       "       is_stimulation  is_inhibition  consensus_direction  \\\n",
       "0                   0              0                    0   \n",
       "1                   0              0                    0   \n",
       "2                   0              1                    1   \n",
       "3                   0              1                    1   \n",
       "4                   0              1                    1   \n",
       "...               ...            ...                  ...   \n",
       "36729               0              0                    0   \n",
       "36730               0              0                    0   \n",
       "36731               0              0                    0   \n",
       "36732               0              0                    0   \n",
       "36733               0              0                    0   \n",
       "\n",
       "       consensus_stimulation  consensus_inhibition  sources  \\\n",
       "0                          0                     0     TRIP   \n",
       "1                          0                     0     TRIP   \n",
       "2                          0                     1     TRIP   \n",
       "3                          0                     1     TRIP   \n",
       "4                          0                     1     TRIP   \n",
       "...                      ...                   ...      ...   \n",
       "36729                      0                     0  iPTMnet   \n",
       "36730                      0                     0  iPTMnet   \n",
       "36731                      0                     0  iPTMnet   \n",
       "36732                      0                     0  iPTMnet   \n",
       "36733                      0                     0  iPTMnet   \n",
       "\n",
       "                                      references  \n",
       "0                    TRIP:19168436;TRIP:25139746  \n",
       "1                                  TRIP:23228564  \n",
       "2      TRIP:11290752;TRIP:11983166;TRIP:12601176  \n",
       "3      TRIP:11290752;TRIP:11983166;TRIP:12601176  \n",
       "4      TRIP:11290752;TRIP:11983166;TRIP:12601176  \n",
       "...                                          ...  \n",
       "36729                           iPTMnet:27796369  \n",
       "36730                           iPTMnet:24070423  \n",
       "36731                           iPTMnet:27551449  \n",
       "36732                           iPTMnet:24020373  \n",
       "36733                           iPTMnet:19536131  \n",
       "\n",
       "[36734 rows x 12 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "from pypath.omnipath import export\n",
    "\n",
    "cu = omnipath.db.get_db('curated')\n",
    "e = export.Export(cu)\n",
    "e.make_df(unique_pairs = False)\n",
    "e.df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The data frame built for the web service includes even more details. Using the `extra_node_attrs` and `extra_edge_attrs` arguments of the `Export` object, you can fully customise these data frames."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T19:22:51.003855Z",
     "start_time": "2023-03-10T19:22:29.009928Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source</th>\n",
       "      <th>target</th>\n",
       "      <th>source_genesymbol</th>\n",
       "      <th>target_genesymbol</th>\n",
       "      <th>is_directed</th>\n",
       "      <th>is_stimulation</th>\n",
       "      <th>is_inhibition</th>\n",
       "      <th>consensus_direction</th>\n",
       "      <th>consensus_stimulation</th>\n",
       "      <th>consensus_inhibition</th>\n",
       "      <th>...</th>\n",
       "      <th>dorothea_tfbs</th>\n",
       "      <th>dorothea_coexp</th>\n",
       "      <th>dorothea_level</th>\n",
       "      <th>type</th>\n",
       "      <th>curation_effort</th>\n",
       "      <th>extra_attrs</th>\n",
       "      <th>ncbi_tax_id_source</th>\n",
       "      <th>entity_type_source</th>\n",
       "      <th>ncbi_tax_id_target</th>\n",
       "      <th>entity_type_target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>P48995</td>\n",
       "      <td>Q12791</td>\n",
       "      <td>TRPC1</td>\n",
       "      <td>KCNMA1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>post_translational</td>\n",
       "      <td>2</td>\n",
       "      <td>{\"TRIP_method\":[\"Co-immunoprecipitation\",\"Co-i...</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>P48995</td>\n",
       "      <td>Q08209</td>\n",
       "      <td>TRPC1</td>\n",
       "      <td>PPP3CA</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>post_translational</td>\n",
       "      <td>1</td>\n",
       "      <td>{\"TRIP_method\":[\"Co-immunoprecipitation\"]}</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>P0DP23</td>\n",
       "      <td>P48995</td>\n",
       "      <td>CALM1</td>\n",
       "      <td>TRPC1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>post_translational</td>\n",
       "      <td>3</td>\n",
       "      <td>{\"TRIP_method\":[\"Fluorescence probe labeling\",...</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>P0DP25</td>\n",
       "      <td>P48995</td>\n",
       "      <td>CALM3</td>\n",
       "      <td>TRPC1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>post_translational</td>\n",
       "      <td>3</td>\n",
       "      <td>{\"TRIP_method\":[\"Fluorescence probe labeling\",...</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>P0DP24</td>\n",
       "      <td>P48995</td>\n",
       "      <td>CALM2</td>\n",
       "      <td>TRPC1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>post_translational</td>\n",
       "      <td>3</td>\n",
       "      <td>{\"TRIP_method\":[\"Fluorescence probe labeling\",...</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36729</th>\n",
       "      <td>Q14289</td>\n",
       "      <td>Q9ULZ3</td>\n",
       "      <td>PTK2B</td>\n",
       "      <td>PYCARD</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>post_translational</td>\n",
       "      <td>1</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36730</th>\n",
       "      <td>P54646</td>\n",
       "      <td>Q9Y2I7</td>\n",
       "      <td>PRKAA2</td>\n",
       "      <td>PIKFYVE</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>post_translational</td>\n",
       "      <td>1</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36731</th>\n",
       "      <td>Q9BXM7</td>\n",
       "      <td>Q9Y2N7</td>\n",
       "      <td>PINK1</td>\n",
       "      <td>HIF3A</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>post_translational</td>\n",
       "      <td>1</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36732</th>\n",
       "      <td>P49137</td>\n",
       "      <td>Q9Y385</td>\n",
       "      <td>MAPKAPK2</td>\n",
       "      <td>UBE2J1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>post_translational</td>\n",
       "      <td>1</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36733</th>\n",
       "      <td>Q9UHC7</td>\n",
       "      <td>P04637</td>\n",
       "      <td>MKRN1</td>\n",
       "      <td>TP53</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>post_translational</td>\n",
       "      <td>1</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>36734 rows × 34 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       source  target source_genesymbol target_genesymbol  is_directed  \\\n",
       "0      P48995  Q12791             TRPC1            KCNMA1            0   \n",
       "1      P48995  Q08209             TRPC1            PPP3CA            0   \n",
       "2      P0DP23  P48995             CALM1             TRPC1            1   \n",
       "3      P0DP25  P48995             CALM3             TRPC1            1   \n",
       "4      P0DP24  P48995             CALM2             TRPC1            1   \n",
       "...       ...     ...               ...               ...          ...   \n",
       "36729  Q14289  Q9ULZ3             PTK2B            PYCARD            1   \n",
       "36730  P54646  Q9Y2I7            PRKAA2           PIKFYVE            1   \n",
       "36731  Q9BXM7  Q9Y2N7             PINK1             HIF3A            1   \n",
       "36732  P49137  Q9Y385          MAPKAPK2            UBE2J1            1   \n",
       "36733  Q9UHC7  P04637             MKRN1              TP53            1   \n",
       "\n",
       "       is_stimulation  is_inhibition  consensus_direction  \\\n",
       "0                   0              0                    0   \n",
       "1                   0              0                    0   \n",
       "2                   0              1                    1   \n",
       "3                   0              1                    1   \n",
       "4                   0              1                    1   \n",
       "...               ...            ...                  ...   \n",
       "36729               0              0                    0   \n",
       "36730               0              0                    0   \n",
       "36731               0              0                    0   \n",
       "36732               0              0                    0   \n",
       "36733               0              0                    0   \n",
       "\n",
       "       consensus_stimulation  consensus_inhibition  ... dorothea_tfbs  \\\n",
       "0                          0                     0  ...          None   \n",
       "1                          0                     0  ...          None   \n",
       "2                          0                     1  ...          None   \n",
       "3                          0                     1  ...          None   \n",
       "4                          0                     1  ...          None   \n",
       "...                      ...                   ...  ...           ...   \n",
       "36729                      0                     0  ...          None   \n",
       "36730                      0                     0  ...          None   \n",
       "36731                      0                     0  ...          None   \n",
       "36732                      0                     0  ...          None   \n",
       "36733                      0                     0  ...          None   \n",
       "\n",
       "      dorothea_coexp  dorothea_level                type  curation_effort  \\\n",
       "0               None                  post_translational                2   \n",
       "1               None                  post_translational                1   \n",
       "2               None                  post_translational                3   \n",
       "3               None                  post_translational                3   \n",
       "4               None                  post_translational                3   \n",
       "...              ...             ...                 ...              ...   \n",
       "36729           None                  post_translational                1   \n",
       "36730           None                  post_translational                1   \n",
       "36731           None                  post_translational                1   \n",
       "36732           None                  post_translational                1   \n",
       "36733           None                  post_translational                1   \n",
       "\n",
       "                                             extra_attrs  ncbi_tax_id_source  \\\n",
       "0      {\"TRIP_method\":[\"Co-immunoprecipitation\",\"Co-i...                9606   \n",
       "1             {\"TRIP_method\":[\"Co-immunoprecipitation\"]}                9606   \n",
       "2      {\"TRIP_method\":[\"Fluorescence probe labeling\",...                9606   \n",
       "3      {\"TRIP_method\":[\"Fluorescence probe labeling\",...                9606   \n",
       "4      {\"TRIP_method\":[\"Fluorescence probe labeling\",...                9606   \n",
       "...                                                  ...                 ...   \n",
       "36729                                                 {}                9606   \n",
       "36730                                                 {}                9606   \n",
       "36731                                                 {}                9606   \n",
       "36732                                                 {}                9606   \n",
       "36733                                                 {}                9606   \n",
       "\n",
       "       entity_type_source  ncbi_tax_id_target  entity_type_target  \n",
       "0                 protein                9606             protein  \n",
       "1                 protein                9606             protein  \n",
       "2                 protein                9606             protein  \n",
       "3                 protein                9606             protein  \n",
       "4                 protein                9606             protein  \n",
       "...                   ...                 ...                 ...  \n",
       "36729             protein                9606             protein  \n",
       "36730             protein                9606             protein  \n",
       "36731             protein                9606             protein  \n",
       "36732             protein                9606             protein  \n",
       "36733             protein                9606             protein  \n",
       "\n",
       "[36734 rows x 34 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "e.webservice_interactions_df()\n",
    "e.df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Self interactions (loop edges) in the network\n",
    "\n",
    "Depending on the downstream application, loops might be beneficial or undesired. By default, loops are disabled; among the built-in network databases, however, they are enabled for the OmniPath and the GRN networks. The `allow_loops` parameter can be set at the module level or at the instance level. If set at the module level, it will be valid for all subsequently created instances:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T19:32:52.478657Z",
     "start_time": "2023-03-10T19:32:52.475108Z"
    }
   },
   "outputs": [],
   "source": [
    "from pypath.share import settings\n",
    "settings.setup(network_allow_loops = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If set at the instance level, it will be valid for the instance:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T19:33:44.130709Z",
     "start_time": "2023-03-10T19:33:44.127457Z"
    }
   },
   "outputs": [],
   "source": [
    "from pypath.core import network\n",
    "n = network.Network(allow_loops = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you want to keep loops only for certain resources, first load the resources where loops should be removed, then remove the loops, and finally load the resources where you wish to keep the loops:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T19:56:41.126021Z",
     "start_time": "2023-03-10T19:54:16.673656Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "149"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.core import network\n",
    "from pypath import resources\n",
    "\n",
    "co = resources.get_controller()\n",
    "pw = co.collect_network('pathway')\n",
    "gr = co.collect_network('dorothea', interaction_types = 'transcriptional')\n",
    "\n",
    "n = network.Network(pw, allow_loops = False)\n",
    "n.load(gr, allow_loops = True)\n",
    "n.count_loops()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T19:59:10.313438Z",
     "start_time": "2023-03-10T19:58:53.813876Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'post_translational': 33571, 'transcriptional': 281262}"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n.count_interactions_by_interaction_type()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Molecular complexes in the network\n",
    "\n",
    "Currently `pypath` supports protein complexes; however, other kinds of components, such as small molecules and nucleic acids, will soon be supported too. Complexes are represented by `pypath.internals.intera.Complex` objects, and can be network nodes. These objects optionally carry information about the defining resources, references, stoichiometry and custom attributes. Apart from the components and resources, none of these is mandatory. For more information, see the *Protein complexes* section in this notebook. Here we only show how complexes are included in networks. The `Network` object either represents each complex as a node (default behaviour), or expands the complex by creating a node for each of its components and applying all the interactions of the complex to each of its components. This latter method has adverse effects on network topology, and can be enabled by setting `network_expand_complexes` to `True`. Only a few resources list interactions of protein complexes, for example, SIGNOR, CollecTRI, Guide to Pharmacology, CellphoneDB, etc. Let's load such a resource:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-27T20:35:23.834073Z",
     "start_time": "2023-03-27T20:34:45.714084Z"
    }
   },
   "outputs": [],
   "source": [
    "from pypath.core import network\n",
    "from pypath.resources import network as netres\n",
    "\n",
    "n = network.Network(netres.collectri)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can retrieve various information about the complexes in the network, e.g. count them:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-27T20:37:11.703026Z",
     "start_time": "2023-03-27T20:37:10.254886Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "33"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n.count_complexes()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Or list them:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-27T20:37:34.340180Z",
     "start_time": "2023-03-27T20:37:32.840559Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{<Entity: FOS_JUN>,\n",
       " <Entity: FOS_JUNB>,\n",
       " <Entity: FOS_JUND>,\n",
       " <Entity: JUN>,\n",
       " <Entity: FOSL1_JUN>,\n",
       " <Entity: FOSL2_JUN>,\n",
       " <Entity: JUN_JUNB>,\n",
       " <Entity: JUN_JUND>,\n",
       " <Entity: FOSB_JUN>,\n",
       " <Entity: FOSL1_JUNB>,\n",
       " <Entity: FOSL1_JUND>,\n",
       " <Entity: FOSL2_JUNB>,\n",
       " <Entity: FOSL2_JUND>,\n",
       " <Entity: JUNB>,\n",
       " <Entity: JUNB_JUND>,\n",
       " <Entity: FOSB_JUNB>,\n",
       " <Entity: JUND>,\n",
       " <Entity: FOSB_JUND>,\n",
       " <Entity: NFKB1>,\n",
       " <Entity: NFKB1_NFKB2>,\n",
       " <Entity: NFKB1_RELB>,\n",
       " <Entity: NFKB1_RELA>,\n",
       " <Entity: NFKB1_REL>,\n",
       " <Entity: NFKB2>,\n",
       " <Entity: NFKB2_RELB>,\n",
       " <Entity: NFKB2_RELA>,\n",
       " <Entity: NFKB2_REL>,\n",
       " <Entity: RELB>,\n",
       " <Entity: RELA_RELB>,\n",
       " <Entity: REL_RELB>,\n",
       " <Entity: RELA>,\n",
       " <Entity: REL_RELA>,\n",
       " <Entity: REL>}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n.get_complexes()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In the network, these are `Entity` objects, and their `identifier` attribute is the `Complex` object:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-27T20:39:53.590607Z",
     "start_time": "2023-03-27T20:39:52.186642Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Entity: REL_RELA>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cplex_entity = list(n.get_complexes())[0]\n",
    "cplex_entity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-27T20:40:32.226239Z",
     "start_time": "2023-03-27T20:40:32.222157Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Complex: COMPLEX:Q04206_Q04864"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cplex = cplex_entity.identifier\n",
    "cplex"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When creating a data frame, the complex objects are placed in the identifier cells, which for single proteins contain UniProt IDs. The labels are the gene symbols of the components, separated by underscores by default."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-27T20:44:06.877819Z",
     "start_time": "2023-03-27T20:43:57.229179Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source</th>\n",
       "      <th>target</th>\n",
       "      <th>source_genesymbol</th>\n",
       "      <th>target_genesymbol</th>\n",
       "      <th>is_directed</th>\n",
       "      <th>is_stimulation</th>\n",
       "      <th>is_inhibition</th>\n",
       "      <th>consensus_direction</th>\n",
       "      <th>consensus_stimulation</th>\n",
       "      <th>consensus_inhibition</th>\n",
       "      <th>sources</th>\n",
       "      <th>references</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>(P17535, P15407)</td>\n",
       "      <td>P04040</td>\n",
       "      <td>FOSL1_JUND</td>\n",
       "      <td>CAT</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>CollecTRI;ExTRI_CollecTRI</td>\n",
       "      <td>CollecTRI:10022519;CollecTRI:10329043;CollecTR...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>(P05412, P15408)</td>\n",
       "      <td>P04040</td>\n",
       "      <td>FOSL2_JUN</td>\n",
       "      <td>CAT</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>CollecTRI;ExTRI_CollecTRI</td>\n",
       "      <td>CollecTRI:10022519;CollecTRI:10329043;CollecTR...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>(P05412, P15407)</td>\n",
       "      <td>P04040</td>\n",
       "      <td>FOSL1_JUN</td>\n",
       "      <td>CAT</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>CollecTRI;ExTRI_CollecTRI</td>\n",
       "      <td>CollecTRI:10022519;CollecTRI:10329043;CollecTR...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>(P05412, P17275)</td>\n",
       "      <td>P04040</td>\n",
       "      <td>JUN_JUNB</td>\n",
       "      <td>CAT</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>CollecTRI;ExTRI_CollecTRI</td>\n",
       "      <td>CollecTRI:10022519;CollecTRI:10329043;CollecTR...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>(P17275, P17535)</td>\n",
       "      <td>P04040</td>\n",
       "      <td>JUNB_JUND</td>\n",
       "      <td>CAT</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>CollecTRI;ExTRI_CollecTRI</td>\n",
       "      <td>CollecTRI:10022519;CollecTRI:10329043;CollecTR...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54980</th>\n",
       "      <td>(P17535, P01100)</td>\n",
       "      <td>P01270</td>\n",
       "      <td>FOS_JUND</td>\n",
       "      <td>PTH</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>CollecTRI;ExTRI_CollecTRI</td>\n",
       "      <td>CollecTRI:9989817</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54981</th>\n",
       "      <td>(P17275, P15408)</td>\n",
       "      <td>P01270</td>\n",
       "      <td>FOSL2_JUNB</td>\n",
       "      <td>PTH</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>CollecTRI;ExTRI_CollecTRI</td>\n",
       "      <td>CollecTRI:9989817</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54982</th>\n",
       "      <td>(P05412, P53539)</td>\n",
       "      <td>P01270</td>\n",
       "      <td>FOSB_JUN</td>\n",
       "      <td>PTH</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>CollecTRI;ExTRI_CollecTRI</td>\n",
       "      <td>CollecTRI:9989817</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54983</th>\n",
       "      <td>(P17275, P15407)</td>\n",
       "      <td>P01270</td>\n",
       "      <td>FOSL1_JUNB</td>\n",
       "      <td>PTH</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>CollecTRI;ExTRI_CollecTRI</td>\n",
       "      <td>CollecTRI:9989817</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54984</th>\n",
       "      <td>(P17275)</td>\n",
       "      <td>P01270</td>\n",
       "      <td>JUNB</td>\n",
       "      <td>PTH</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>CollecTRI;ExTRI_CollecTRI</td>\n",
       "      <td>CollecTRI:9989817</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>23235 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 source  target source_genesymbol target_genesymbol  \\\n",
       "1      (P17535, P15407)  P04040        FOSL1_JUND               CAT   \n",
       "2      (P05412, P15408)  P04040         FOSL2_JUN               CAT   \n",
       "3      (P05412, P15407)  P04040         FOSL1_JUN               CAT   \n",
       "4      (P05412, P17275)  P04040          JUN_JUNB               CAT   \n",
       "5      (P17275, P17535)  P04040         JUNB_JUND               CAT   \n",
       "...                 ...     ...               ...               ...   \n",
       "54980  (P17535, P01100)  P01270          FOS_JUND               PTH   \n",
       "54981  (P17275, P15408)  P01270        FOSL2_JUNB               PTH   \n",
       "54982  (P05412, P53539)  P01270          FOSB_JUN               PTH   \n",
       "54983  (P17275, P15407)  P01270        FOSL1_JUNB               PTH   \n",
       "54984          (P17275)  P01270              JUNB               PTH   \n",
       "\n",
       "       is_directed  is_stimulation  is_inhibition  consensus_direction  \\\n",
       "1                1               1              0                    1   \n",
       "2                1               1              0                    1   \n",
       "3                1               1              0                    1   \n",
       "4                1               1              0                    1   \n",
       "5                1               1              0                    1   \n",
       "...            ...             ...            ...                  ...   \n",
       "54980            1               1              0                    1   \n",
       "54981            1               1              0                    1   \n",
       "54982            1               1              0                    1   \n",
       "54983            1               1              0                    1   \n",
       "54984            1               1              0                    1   \n",
       "\n",
       "       consensus_stimulation  consensus_inhibition                    sources  \\\n",
       "1                          1                     0  CollecTRI;ExTRI_CollecTRI   \n",
       "2                          1                     0  CollecTRI;ExTRI_CollecTRI   \n",
       "3                          1                     0  CollecTRI;ExTRI_CollecTRI   \n",
       "4                          1                     0  CollecTRI;ExTRI_CollecTRI   \n",
       "5                          1                     0  CollecTRI;ExTRI_CollecTRI   \n",
       "...                      ...                   ...                        ...   \n",
       "54980                      1                     0  CollecTRI;ExTRI_CollecTRI   \n",
       "54981                      1                     0  CollecTRI;ExTRI_CollecTRI   \n",
       "54982                      1                     0  CollecTRI;ExTRI_CollecTRI   \n",
       "54983                      1                     0  CollecTRI;ExTRI_CollecTRI   \n",
       "54984                      1                     0  CollecTRI;ExTRI_CollecTRI   \n",
       "\n",
       "                                              references  \n",
       "1      CollecTRI:10022519;CollecTRI:10329043;CollecTR...  \n",
       "2      CollecTRI:10022519;CollecTRI:10329043;CollecTR...  \n",
       "3      CollecTRI:10022519;CollecTRI:10329043;CollecTR...  \n",
       "4      CollecTRI:10022519;CollecTRI:10329043;CollecTR...  \n",
       "5      CollecTRI:10022519;CollecTRI:10329043;CollecTR...  \n",
       "...                                                  ...  \n",
       "54980                                  CollecTRI:9989817  \n",
       "54981                                  CollecTRI:9989817  \n",
       "54982                                  CollecTRI:9989817  \n",
       "54983                                  CollecTRI:9989817  \n",
       "54984                                  CollecTRI:9989817  \n",
       "\n",
       "[23235 rows x 12 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.omnipath import export\n",
    "from pypath.internals import intera\n",
    "\n",
    "e = export.Export(n)\n",
    "e.make_df(unique_pairs = False)\n",
    "e.df[[isinstance(s, intera.Complex) for s in e.df.source]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For some reason, `pandas` show the `Complex` objects as tuples. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-27T20:45:07.732756Z",
     "start_time": "2023-03-27T20:45:07.715568Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Complex: COMPLEX:P15407_P17535"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "e.df[[isinstance(s, intera.Complex) for s in e.df.source]].source.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-27T20:48:51.770361Z",
     "start_time": "2023-03-27T20:48:10.690646Z"
    }
   },
   "outputs": [],
   "source": [
    "e.webservice_interactions_df()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-27T20:50:14.917567Z",
     "start_time": "2023-03-27T20:50:14.878341Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source</th>\n",
       "      <th>target</th>\n",
       "      <th>source_genesymbol</th>\n",
       "      <th>target_genesymbol</th>\n",
       "      <th>is_directed</th>\n",
       "      <th>is_stimulation</th>\n",
       "      <th>is_inhibition</th>\n",
       "      <th>consensus_direction</th>\n",
       "      <th>consensus_stimulation</th>\n",
       "      <th>consensus_inhibition</th>\n",
       "      <th>...</th>\n",
       "      <th>dorothea_tfbs</th>\n",
       "      <th>dorothea_coexp</th>\n",
       "      <th>dorothea_level</th>\n",
       "      <th>type</th>\n",
       "      <th>curation_effort</th>\n",
       "      <th>extra_attrs</th>\n",
       "      <th>ncbi_tax_id_source</th>\n",
       "      <th>entity_type_source</th>\n",
       "      <th>ncbi_tax_id_target</th>\n",
       "      <th>entity_type_target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>P01106</td>\n",
       "      <td>O14746</td>\n",
       "      <td>MYC</td>\n",
       "      <td>TERT</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>transcriptional</td>\n",
       "      <td>75</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>(P17535, P15407)</td>\n",
       "      <td>P04040</td>\n",
       "      <td>FOSL1_JUND</td>\n",
       "      <td>CAT</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>transcriptional</td>\n",
       "      <td>14</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>complex</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>(P05412, P15408)</td>\n",
       "      <td>P04040</td>\n",
       "      <td>FOSL2_JUN</td>\n",
       "      <td>CAT</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>transcriptional</td>\n",
       "      <td>14</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>complex</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>(P05412, P15407)</td>\n",
       "      <td>P04040</td>\n",
       "      <td>FOSL1_JUN</td>\n",
       "      <td>CAT</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>transcriptional</td>\n",
       "      <td>14</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>complex</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>(P05412, P17275)</td>\n",
       "      <td>P04040</td>\n",
       "      <td>JUN_JUNB</td>\n",
       "      <td>CAT</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>transcriptional</td>\n",
       "      <td>14</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>complex</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67945</th>\n",
       "      <td>Q01196</td>\n",
       "      <td>Q13094</td>\n",
       "      <td>RUNX1</td>\n",
       "      <td>LCP2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>transcriptional</td>\n",
       "      <td>1</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67946</th>\n",
       "      <td>Q01196</td>\n",
       "      <td>Q6MZQ0</td>\n",
       "      <td>RUNX1</td>\n",
       "      <td>PRR5L</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>transcriptional</td>\n",
       "      <td>1</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67947</th>\n",
       "      <td>Q15672</td>\n",
       "      <td>P08151</td>\n",
       "      <td>TWIST1</td>\n",
       "      <td>GLI1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>transcriptional</td>\n",
       "      <td>1</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67948</th>\n",
       "      <td>P22415</td>\n",
       "      <td>Q5SRE5</td>\n",
       "      <td>USF1</td>\n",
       "      <td>NUP188</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>transcriptional</td>\n",
       "      <td>1</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67949</th>\n",
       "      <td>Q9UQR1</td>\n",
       "      <td>Q5VYX0</td>\n",
       "      <td>ZNF148</td>\n",
       "      <td>RNLS</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td></td>\n",
       "      <td>transcriptional</td>\n",
       "      <td>1</td>\n",
       "      <td>{}</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "      <td>9606</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>67950 rows × 34 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 source  target source_genesymbol target_genesymbol  \\\n",
       "0                P01106  O14746               MYC              TERT   \n",
       "1      (P17535, P15407)  P04040        FOSL1_JUND               CAT   \n",
       "2      (P05412, P15408)  P04040         FOSL2_JUN               CAT   \n",
       "3      (P05412, P15407)  P04040         FOSL1_JUN               CAT   \n",
       "4      (P05412, P17275)  P04040          JUN_JUNB               CAT   \n",
       "...                 ...     ...               ...               ...   \n",
       "67945            Q01196  Q13094             RUNX1              LCP2   \n",
       "67946            Q01196  Q6MZQ0             RUNX1             PRR5L   \n",
       "67947            Q15672  P08151            TWIST1              GLI1   \n",
       "67948            P22415  Q5SRE5              USF1            NUP188   \n",
       "67949            Q9UQR1  Q5VYX0            ZNF148              RNLS   \n",
       "\n",
       "       is_directed  is_stimulation  is_inhibition  consensus_direction  \\\n",
       "0                1               1              0                    1   \n",
       "1                1               1              0                    1   \n",
       "2                1               1              0                    1   \n",
       "3                1               1              0                    1   \n",
       "4                1               1              0                    1   \n",
       "...            ...             ...            ...                  ...   \n",
       "67945            1               1              0                    1   \n",
       "67946            1               1              0                    1   \n",
       "67947            1               1              0                    1   \n",
       "67948            1               1              0                    1   \n",
       "67949            1               1              0                    1   \n",
       "\n",
       "       consensus_stimulation  consensus_inhibition  ... dorothea_tfbs  \\\n",
       "0                          1                     0  ...          None   \n",
       "1                          1                     0  ...          None   \n",
       "2                          1                     0  ...          None   \n",
       "3                          1                     0  ...          None   \n",
       "4                          1                     0  ...          None   \n",
       "...                      ...                   ...  ...           ...   \n",
       "67945                      1                     0  ...          None   \n",
       "67946                      1                     0  ...          None   \n",
       "67947                      1                     0  ...          None   \n",
       "67948                      1                     0  ...          None   \n",
       "67949                      1                     0  ...          None   \n",
       "\n",
       "      dorothea_coexp  dorothea_level             type  curation_effort  \\\n",
       "0               None                  transcriptional               75   \n",
       "1               None                  transcriptional               14   \n",
       "2               None                  transcriptional               14   \n",
       "3               None                  transcriptional               14   \n",
       "4               None                  transcriptional               14   \n",
       "...              ...             ...              ...              ...   \n",
       "67945           None                  transcriptional                1   \n",
       "67946           None                  transcriptional                1   \n",
       "67947           None                  transcriptional                1   \n",
       "67948           None                  transcriptional                1   \n",
       "67949           None                  transcriptional                1   \n",
       "\n",
       "       extra_attrs  ncbi_tax_id_source  entity_type_source  \\\n",
       "0               {}                9606             protein   \n",
       "1               {}                9606             complex   \n",
       "2               {}                9606             complex   \n",
       "3               {}                9606             complex   \n",
       "4               {}                9606             complex   \n",
       "...            ...                 ...                 ...   \n",
       "67945           {}                9606             protein   \n",
       "67946           {}                9606             protein   \n",
       "67947           {}                9606             protein   \n",
       "67948           {}                9606             protein   \n",
       "67949           {}                9606             protein   \n",
       "\n",
       "       ncbi_tax_id_target  entity_type_target  \n",
       "0                    9606             protein  \n",
       "1                    9606             protein  \n",
       "2                    9606             protein  \n",
       "3                    9606             protein  \n",
       "4                    9606             protein  \n",
       "...                   ...                 ...  \n",
       "67945                9606             protein  \n",
       "67946                9606             protein  \n",
       "67947                9606             protein  \n",
       "67948                9606             protein  \n",
       "67949                9606             protein  \n",
       "\n",
       "[67950 rows x 34 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "e.df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When we export to CSV, the `Complex` objects are converted to the string notation familiar from the OmniPath web service. See for example `COMPLEX:P15407_P17535` below, and its human readable label `FOSL1_JUND` in the gene symbols column:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-27T20:55:26.975963Z",
     "start_time": "2023-03-27T20:55:26.761094Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'source,target,source_genesymbol,target_genesymbol,is_directed,is_stimulation,is_inhibition,consensus_direction,consensus_stimulation,consensus_inhibition,sources,references,omnipath,kinaseextra,ligrecextra,pathwayextra,mirnatarget,dorothea,tf_target,lncrna_mrna,tf_mirna,small_molecule,dorothea_curated,dorothea_chipseq,dorothea_tfbs,dorothea_coexp,dorothea_level,type,curation_effort,extra_attrs,ncbi_tax_id_source,entity_type_source,ncbi_tax_id_target,entity_type_target\\nCOMPLEX:P15407_P17535,P04040,FOSL1_JUND,CAT,1,1,0,1,1,0,CollecTRI;ExTRI_CollecTRI,CollecTRI:10022519;CollecTRI:10329043;CollecTRI:12036993;CollecTRI:12538496;CollecTRI:17935786;CollecTRI:7489329;CollecTRI:7651432;CollecTRI:7818486;CollecTRI:8867782;CollecTRI:9030359;CollecTRI:9136992;CollecTRI:9142914;CollecTRI:9168892;CollecTRI:9687385,False,False,False,False,False,False,False,False,False,False,,,,,,transcriptional,14,{},9606,complex,9606,protein\\nCOMPLEX:P05412_P15408,P04040,FOSL2_JUN,CAT,1,1,0,1,1,0,CollecTRI;ExTRI_C"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 1004 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "e.df[[ets == 'complex' for ets in e.df.entity_type_source]].to_csv(index = False)[:1000]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Translating identifiers <a class=\"anchor\" id=\"mapping\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `pypath.utils.mapping` module is for ID translation, most of the time you can simply call the `map_name` method:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-21T12:31:45.880927Z",
     "start_time": "2023-03-21T12:31:44.497799Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'EGFR'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.utils import mapping\n",
    "mapping.map_name('P00533', 'uniprot', 'genesymbol')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "By default the `map_name` function returns a `set` because it accounts for ambiguous mapping. However most often the ID translation is unambiguous, and you want to retrieve only one ID. The `map_name0` returns a string, even in case of ambiguity, it returns a random element from the resulted set:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:17:31.426141Z",
     "start_time": "2022-12-02T14:17:31.418263Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Q9BY60'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapping.map_name0('GABARAPL3', 'genesymbol', 'uniprot')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Molecules have large variety of identifiers, but in pypath two identifier types are special:\n",
    "\n",
    "- The *primary identifier* defines the molecule category, e.g. if UniProt is the primary identifier for proteins, then a protein is anything that has a UniProt ID\n",
    "- The *label* is a human readable identifier, for proteins it's gene symbol\n",
    "    \n",
    "The primary ID and label types are configured for each molecule type (protein, miRNA, drug, etc) in the module settings.\n",
    "The `mapping` module provides shortcuts to translate between these identifiers: `label` and `id_from_label`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:17:33.521610Z",
     "start_time": "2022-12-02T14:17:33.516412Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'ULK1'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapping.label('O75385')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:17:35.558354Z",
     "start_time": "2022-12-02T14:17:35.551050Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'O75385'}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapping.id_from_label('ULK1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:17:37.948924Z",
     "start_time": "2022-12-02T14:17:37.943666Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'O75385'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapping.id_from_label0('ULK1')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Multiple IDs can be translated in one call, however, it's not possible to know certainly which output corresponds to which input."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:17:40.210259Z",
     "start_time": "2022-12-02T14:17:40.200864Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'O75385', 'P00533', 'Q15796'}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapping.map_names(['ULK1', 'EGFR', 'SMAD2'], 'genesymbol', 'uniprot')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The default organism is defined in the module settings, it is human by default. Translating for other organisms requires the `ncbi_tax_id` argument. Most of the functions in `pypath` accepts also common or latin names, but `map_name` accepts only numeric taxon IDs for efficiency. Let's translate a mouse identifier:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:17:44.115376Z",
     "start_time": "2022-12-02T14:17:44.106197Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Q62432'}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapping.map_name('Smad2', 'genesymbol', 'uniprot', ncbi_tax_id = 10090)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If no direct translation table is available between two ID types, `pypath` will try to translate by an intermediate ID type."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:17:46.272012Z",
     "start_time": "2022-12-02T14:17:46.266050Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'ULK1'}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapping.map_name('8408', 'entrez', 'genesymbol')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Behind the scenes the `chain_map` function is called:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:17:47.758541Z",
     "start_time": "2022-12-02T14:17:47.753420Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'ULK1'}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m = mapping.get_mapper()\n",
    "m.chain_map('8408', id_type = 'entrez', target_id_type = 'genesymbol', by_id_type = 'uniprot')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And the procedure corresponds to the following:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:17:49.119866Z",
     "start_time": "2022-12-02T14:17:49.113494Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'ULK1'}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapping.map_names(\n",
    "    mapping.map_name('8408', 'entrez', 'uniprot'),\n",
    "    'uniprot',\n",
    "    'genesymbol',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pre-defined ID translation tables\n",
    "\n",
    "A number of mapping tables are pre-defined, these load automatically on demand, and are removed from the memory if not used for some time (5 minutes by default). New mapping tables are saved directly into pickle files in the cache for a quick reload. Tables are either organism specific (hence loaded for each organism one-by-one), or non-organism specific, such as drug IDs (`pypath` uses integer `0` in this case in place of the numeric NCBI Taxonomy ID). The identifier translation data is retrieved from the following sources:\n",
    "\n",
    "- UniProt legacy API (main UniProt API until autumn 2022): `internals.input_formats.UniprotMapping`\n",
    "- UniProt uploadlists API (also outdated, replaced by the new UniProt API): `internals.inputs_formats.UniprotListMapping`\n",
    "- Ensembl Biomart: `internals.input_formats.BiomartMapping` and `internals.input_formats.ArrayMapping` (for microarray probes)\n",
    "- Protein Ontology Consortium: `internals.input_formats.ProMapping`\n",
    "- UniChem: `internals.input_formats.UnichemMapping`\n",
    "- Arbitrary files: `internals.input_formats.FileMapping` (this class is used to process data from miRBase, some files from the UniProt FTP site, and also user defined, custom cases)\n",
    "- RaMP: `internals.input_formats.RampMapping`\n",
    "- HMDB: `internals.input_formats.HmdbMapping`\n",
    "\n",
    "Some of the classes above are instantiated in `internals.maps`, but most of the instances are created on the fly when loading a mapping table in `utils.mapping.MapReader`. This latter class is responsible to take a table definition and load a `utils.mapping.MappingTable` instance. The whole process is managed by `utils.mapping.Mapper`, this is the object all the ID translation queries are dispatched to. It has a method to list the defined ID translation tables:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-21T12:32:06.248107Z",
     "start_time": "2023-03-21T12:32:06.239282Z"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[MappingTableDefinition(id_type_a='embl', id_type_b='uniprot', resource='uniprot', input_class='UniprotMapping', resource_id_type_a='database(embl)', resource_id_type_b='id'),\n",
       " MappingTableDefinition(id_type_a='genesymbol', id_type_b='uniprot', resource='uniprot', input_class='UniprotMapping', resource_id_type_a='genes(PREFERRED)', resource_id_type_b='id'),\n",
       " MappingTableDefinition(id_type_a='genesymbol-syn', id_type_b='uniprot', resource='uniprot', input_class='UniprotMapping', resource_id_type_a='genes(ALTERNATIVE)', resource_id_type_b='id'),\n",
       " MappingTableDefinition(id_type_a='entrez', id_type_b='uniprot', resource='uniprot', input_class='UniprotMapping', resource_id_type_a='database(geneid)', resource_id_type_b='id'),\n",
       " MappingTableDefinition(id_type_a='hgnc', id_type_b='uniprot', resource='uniprot', input_class='UniprotMapping', resource_id_type_a='database(HGNC)', resource_id_type_b='id'),\n",
       " MappingTableDefinition(id_type_a='refseqp', id_type_b='uniprot', resource='uniprot', input_cl"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 29850 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "mapping.mapping_tables()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Pypath uses synonyms to refer to ID types: these are intended to be short, clear and lowercase for ease of use. Most of the synonyms are defined in `internals.input_formats`, in the `AC_QUERY`, `AC_MAPPING`, `BIOMART_MAPPING`, `PRO_MAPPING` and `ARRAY_MAPPING` dictionaries. UniChem ID types are used exactly as provided by UniChem. To list all available ID types (below *pypath* is the synonym used here, *original* is the name in the original resource):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-21T12:32:14.199394Z",
     "start_time": "2023-03-21T12:32:14.193094Z"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{IdType(pypath='CAS', original='CAS'),\n",
       " IdType(pypath='LIPIDMAPS', original='LIPIDMAPS'),\n",
       " IdType(pypath='MedChemExpress', original='MedChemExpress'),\n",
       " IdType(pypath='actor', original='actor'),\n",
       " IdType(pypath='affy', original='affy'),\n",
       " IdType(pypath='affymetrix', original='affymetrix'),\n",
       " IdType(pypath='agilent', original='agilent'),\n",
       " IdType(pypath='alzforum', original='Alzforum_mut'),\n",
       " IdType(pypath='araport', original='Araport'),\n",
       " IdType(pypath='atlas', original='atlas'),\n",
       " IdType(pypath='bigg', original='bigg'),\n",
       " IdType(pypath='bindingdb', original='bindingdb'),\n",
       " IdType(pypath='biocyc', original='biocyc'),\n",
       " IdType(pypath='brenda', original='brenda'),\n",
       " IdType(pypath='carotenoiddb', original='carotenoiddb'),\n",
       " IdType(pypath='cas', original='CAS'),\n",
       " IdType(pypath='cas', original='cas_registry_number'),\n",
       " IdType(pypath='cas_id', original='CAS'),\n",
       " IdType(pypath='cgnc', original='CGNC'),\n",
       " IdType(pypath='chebi', original='chebi'),\n",
       " IdType(pypath='chembl', original='chembl'),\n",
       " IdType(pypath='ch"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 8561 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "mapping.id_types()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Direct access to ID translation tables\n",
    "\n",
    "The `Mapper` (or the `mapping` module) is able to return ID translation tables as dicts or data frames:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-21T12:33:55.159598Z",
     "start_time": "2023-03-21T12:33:55.155714Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<MappingTable from=uniprot, to=genesymbol, taxon=9606 (20243 IDs)>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tbl = mapping.translation_dict('uniprot', 'genesymbol')\n",
    "tbl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-21T12:34:16.891809Z",
     "start_time": "2023-03-21T12:34:16.888382Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'P00533' in tbl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-21T12:34:25.636484Z",
     "start_time": "2023-03-21T12:34:25.632263Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'EGFR'}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tbl['P00533']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-21T12:34:33.139034Z",
     "start_time": "2023-03-21T12:34:33.135146Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'EGFR' in tbl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-21T12:34:50.332172Z",
     "start_time": "2023-03-21T12:34:50.323273Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Q00604', {'NDP'}),\n",
       " ('Q9HB19', {'PLEKHA2'}),\n",
       " ('Q16718', {'NDUFA5'}),\n",
       " ('P55769', {'SNU13'}),\n",
       " ('Q92886', {'NEUROG1'}),\n",
       " ('Q6T4R5', {'NHS'}),\n",
       " ('P80188', {'LCN2'}),\n",
       " ('Q86XR2', {'FAM129C'}),\n",
       " ('Q5T2W1', {'PDZK1'}),\n",
       " ('Q9BSH3', {'NICN1'})]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(tbl.items())[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The same table as data frame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-21T12:35:18.225781Z",
     "start_time": "2023-03-21T12:35:18.204189Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uniprot</th>\n",
       "      <th>genesymbol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Q00604</td>\n",
       "      <td>NDP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Q9HB19</td>\n",
       "      <td>PLEKHA2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Q16718</td>\n",
       "      <td>NDUFA5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>P55769</td>\n",
       "      <td>SNU13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Q92886</td>\n",
       "      <td>NEUROG1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20375</th>\n",
       "      <td>Q96L92</td>\n",
       "      <td>SNX27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20376</th>\n",
       "      <td>Q9UNH6</td>\n",
       "      <td>SNX7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20377</th>\n",
       "      <td>Q5VWJ9</td>\n",
       "      <td>SNX30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20378</th>\n",
       "      <td>Q9BZZ2</td>\n",
       "      <td>SIGLEC1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20379</th>\n",
       "      <td>Q96BD0</td>\n",
       "      <td>SLCO4A1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>20380 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      uniprot genesymbol\n",
       "0      Q00604        NDP\n",
       "1      Q9HB19    PLEKHA2\n",
       "2      Q16718     NDUFA5\n",
       "3      P55769      SNU13\n",
       "4      Q92886    NEUROG1\n",
       "...       ...        ...\n",
       "20375  Q96L92      SNX27\n",
       "20376  Q9UNH6       SNX7\n",
       "20377  Q5VWJ9      SNX30\n",
       "20378  Q9BZZ2    SIGLEC1\n",
       "20379  Q96BD0    SLCO4A1\n",
       "\n",
       "[20380 rows x 2 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapping.translation_df('uniprot', 'genesymbol')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Orthology translation\n",
    "\n",
    "The `utils.orthology` module (formerly `utils.homology`) handles translation of data between organism by orthologous gene pairs. Its most important function is `translate`. The source organism is human by default, the target must be provided, below we use mouse (NCBI Taxonomy 10090):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-09-28T18:03:50.063128Z",
     "start_time": "2023-09-28T18:03:27.729650Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Q01279'}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.utils import orthology\n",
    "orthology.translate('P00533', target = 10090)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "ID translation and orthology translation are integrated, hence not only UniProt IDs can be translated:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-09-28T18:04:16.227314Z",
     "start_time": "2023-09-28T18:03:54.151630Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Egfr'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "orthology.translate('EGFR', target = 10090, id_type = 'genesymbol')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This module uses data from the Orthologous Matrix )OMA), NCBI HomoloGene and Ensembl. The latter covers more organisms, and accepts some parameters (high confidence, one-to-one vs. one-to-many mapping). The default is to use only OMA as that one is the most comprehensive, up to date and easy to use resource. These parameters can be controlled by the settings module, or passed to the functions above and below, for example:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-09-28T18:07:43.983305Z",
     "start_time": "2023-09-28T18:07:19.466364Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Q01279'}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "orthology.translate('P00533', target = 10090, oma = False, homologene = False, ensembl = True, ensembl_hc = False, ensembl_types = 'one2one')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Orthology translation tables as dictionaries\n",
    "\n",
    "The translation tables are available as dicts of sets, these are convenient for use outside of pypath:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-09-28T18:08:26.204826Z",
     "start_time": "2023-09-28T18:08:26.130931Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Egfr'}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "human_mouse_genesymbols = orthology.get_dict(target = 'mouse', id_type = 'genesymbol')\n",
    "human_mouse_genesymbols['EGFR']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The relationship types and confdence levels can be included using the `full_records` argument:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-09-28T18:10:13.499398Z",
     "start_time": "2023-09-28T18:10:13.437657Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{OmaOrtholog(id='Egfr', rel_type='1:1', score=12704.5703125)}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "human_mouse_genesymbols = orthology.get_dict(target = 'mouse', id_type = 'genesymbol', full_records = True)\n",
    "human_mouse_genesymbols['EGFR']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Orthology translation data frames\n",
    "\n",
    "Similarly, `pandas.DataFrame`s are available:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-09-28T18:11:16.838576Z",
     "start_time": "2023-09-28T18:11:16.713567Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source</th>\n",
       "      <th>target</th>\n",
       "      <th>rel_type</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>H4C3</td>\n",
       "      <td>H4c1</td>\n",
       "      <td>m:n</td>\n",
       "      <td>1262.050049</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>H4C3</td>\n",
       "      <td>H4c3</td>\n",
       "      <td>m:n</td>\n",
       "      <td>1262.050049</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>H4C3</td>\n",
       "      <td>H4c12</td>\n",
       "      <td>m:n</td>\n",
       "      <td>1262.050049</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>H4C3</td>\n",
       "      <td>H4c11</td>\n",
       "      <td>m:n</td>\n",
       "      <td>1262.050049</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>H4C3</td>\n",
       "      <td>H4c9</td>\n",
       "      <td>m:n</td>\n",
       "      <td>1262.050049</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18446</th>\n",
       "      <td>GDAP2</td>\n",
       "      <td>Gdap2</td>\n",
       "      <td>1:1</td>\n",
       "      <td>5553.779785</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18447</th>\n",
       "      <td>ITGA8</td>\n",
       "      <td>Itga8</td>\n",
       "      <td>1:1</td>\n",
       "      <td>10772.969727</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18448</th>\n",
       "      <td>SEMA3F</td>\n",
       "      <td>Sema3f</td>\n",
       "      <td>1:1</td>\n",
       "      <td>9121.080078</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18449</th>\n",
       "      <td>EEPD1</td>\n",
       "      <td>Eepd1</td>\n",
       "      <td>1:1</td>\n",
       "      <td>5874.350098</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18450</th>\n",
       "      <td>DRG2</td>\n",
       "      <td>Drg2</td>\n",
       "      <td>1:1</td>\n",
       "      <td>4423.589844</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>18451 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       source  target rel_type         score\n",
       "0        H4C3    H4c1      m:n   1262.050049\n",
       "1        H4C3    H4c3      m:n   1262.050049\n",
       "2        H4C3   H4c12      m:n   1262.050049\n",
       "3        H4C3   H4c11      m:n   1262.050049\n",
       "4        H4C3    H4c9      m:n   1262.050049\n",
       "...       ...     ...      ...           ...\n",
       "18446   GDAP2   Gdap2      1:1   5553.779785\n",
       "18447   ITGA8   Itga8      1:1  10772.969727\n",
       "18448  SEMA3F  Sema3f      1:1   9121.080078\n",
       "18449   EEPD1   Eepd1      1:1   5874.350098\n",
       "18450    DRG2    Drg2      1:1   4423.589844\n",
       "\n",
       "[18451 rows x 4 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "human_mouse_genesymbols = orthology.get_df(target = 'mouse', id_type = 'genesymbol', full_records = True)\n",
    "human_mouse_genesymbols"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Taxonomy\n",
    "\n",
    "Organisms matter everywhere, both in the input, output and processing parts of pypath. For this reason we created a utility module to deal with translation of organism identifiers. We prefer NCBI Taxonomy IDs as the primary organism identifier. These are simple numbers, 9606 is human, 10090 is mouse, etc. Many databases use common English names or latin (scientific) names. Then some databases use custom codes, such as *hsapiens* in Ensmebl (first letter of genus name + species name, without space, all lowercase); *hsa* in miRBase and KEGG (first letter of genus name, first two letters of species name). The `pypath.utils.taxonomy` module features some convenient functions for handling all these names.\n",
    "\n",
    "### Translating to NCBI Taxonomy, scientific names and common names\n",
    "\n",
    "The most often used is `ensure_ncbi_tax_id`, which returns the NCBI Taxonomy ID for any comprehensible input:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:18:22.033384Z",
     "start_time": "2022-12-02T14:18:22.028246Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(9606, 9606, 9606, 9606, 9606)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.utils import taxonomy\n",
    "taxonomy.ensure_ncbi_tax_id('human'), taxonomy.ensure_ncbi_tax_id('H sapiens'), taxonomy.ensure_ncbi_tax_id('hsapiens'), taxonomy.ensure_ncbi_tax_id(9606), taxonomy.ensure_ncbi_tax_id('Homo sapiens')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To access scientific names or common names:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:18:25.947472Z",
     "start_time": "2022-12-02T14:18:25.942020Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Bos taurus'"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "taxonomy.ensure_latin_name('cow')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:18:27.214897Z",
     "start_time": "2022-12-02T14:18:27.091877Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'European robin'"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "taxonomy.ensure_common_name('Erithacus rubecula')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Organism from UniProt ID\n",
    "\n",
    "The `uniprot_taxid` function returns the taxonomy ID for a SwissProt ID. Unfortunately it does not work for TrEMBL IDs, that would require to keep too much data in memory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:18:30.046562Z",
     "start_time": "2022-12-02T14:18:28.853983Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Saccharomyces cerevisiae'"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "taxonomy.ensure_latin_name(taxonomy.uniprot_taxid('P53104'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## UniProt\n",
    "\n",
    "UniProt is a huge, diverse resource that is essential for *pypath* as we use it as a reference set for proteomes and it provides ID translation data. Its input module `pypath.inputs.uniprot` is already more complex than an average input module. It harbors a little database manager that loads and unloads tables on demand, ensuring fast and convenient operation. Further services are available in the `pypath.utils.uniprot` module."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### The UniProt input module\n",
    "\n",
    "#### All UniProt IDs for one organism\n",
    "\n",
    "The complete set of UniProt IDs for an organism is considered to be the proteome of the organism, and it is used in many procedures across *pypath*. All SwissProt IDs, all TrEMBL IDs or both together can be retrieved:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:07:43.513053Z",
     "start_time": "2022-12-02T16:04:09.519841Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(86440, 17131, 69300)"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.inputs import uniprot as iuniprot\n",
    "(\n",
    "    len(iuniprot.all_uniprots(organism = 10090)),\n",
    "    len(iuniprot.all_swissprots(organism = 10090)),\n",
    "    len(iuniprot.all_trembls(organism = 10090)),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### UniProt ID format validation\n",
    "\n",
    "UniProt defines a format for its accessions; any string can be checked against this template to tell if it's possibly a valid ID:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:17:41.293811Z",
     "start_time": "2022-12-02T16:17:41.289834Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.inputs import uniprot as iuniprot\n",
    "iuniprot.valid_uniprot('A0A8D0H0C2')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### UniProt ID validation\n",
    "\n",
    "Other functions check whether an ID actually exists in UniProt. These functions require loading the list of all UniProt IDs for the organism, hence calling them the first time might take even a few minutes (in case a new download is necessary). Subsequent calls will be much faster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:17:44.423716Z",
     "start_time": "2022-12-02T16:17:44.419502Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.inputs import uniprot as iuniprot\n",
    "iuniprot.is_uniprot('P00533')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:14:14.625540Z",
     "start_time": "2022-12-02T16:14:14.615893Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iuniprot.is_swissprot('P00533')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If the organism doesn't match:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:15:07.867560Z",
     "start_time": "2022-12-02T16:15:07.842381Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 123,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iuniprot.is_uniprot('P00533', organism = 10090)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Single UniProt protein datasheet\n",
    "\n",
    "Raw contents of protein datasheets can be retrieved. The structure is a Python list of two-element tuples: the first element is the tag of the line, the second is the line content."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:18:06.048259Z",
     "start_time": "2022-12-02T16:18:05.975273Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('ID', 'EGFR_HUMAN              Reviewed;        1210 AA.'),\n",
       " ('AC',\n",
       "  'P00533; O00688; O00732; P06268; Q14225; Q68GS5; Q92795; Q9BZS2; Q9GZX1;'),\n",
       " ('AC', 'Q9H2C9; Q9H3C9; Q9UMD7; Q9UMD8; Q9UMG5;'),\n",
       " ('DT', '21-JUL-1986, integrated into UniProtKB/Swiss-Prot.'),\n",
       " ('DT', '01-NOV-1997, sequence version 2.'),\n",
       " ('DT', '12-OCT-2022, entry version 283.'),\n",
       " ('DE', 'RecName: Full=Epidermal growth factor receptor {ECO:0000305};'),\n",
       " ('DE', 'EC=2.7.10.1;'),\n",
       " ('DE', 'AltName: Full=Proto-oncogene c-ErbB-1;'),\n",
       " ('DE', 'AltName: Full=Receptor tyrosine-protein kinase erbB-1;'),\n",
       " ('DE', 'Flags: Precursor;'),\n",
       " ('GN', 'Name=EGFR {ECO:0000312|HGNC:HGNC:3236}; Synonyms=ERBB, ERBB1, HER1;'),\n",
       " ('OS', 'Homo sapiens (Human).'),\n",
       " ('OC',\n",
       "  'Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia;'),\n",
       " ('OC',\n",
       "  'Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae;'),\n",
       " ('OC', 'Homo.'),\n",
       " ('OX', 'NCBI_TaxID=9606;'),\n",
       " ('RN', '[1]'),\n",
       " ('RP',\n",
       "  'NUCLEOTIDE SEQUENCE [MRNA] (ISOFORM"
      ]
     },
     "execution_count": 126,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 58080 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from pypath.inputs import uniprot as iuniprot\n",
    "iuniprot.protein_datasheet('P00533')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### History of UniProt records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:21:15.449962Z",
     "start_time": "2022-12-02T16:21:15.440384Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[UniprotRecordHistory(entry_version='283', sequence_version='2', entry_name='EGFR_HUMAN', database='Swiss-Prot', number='2022_04', date='2022-10-12', replaces='', replaced_by=''),\n",
       " UniprotRecordHistory(entry_version='282', sequence_version='2', entry_name='EGFR_HUMAN', database='Swiss-Prot', number='2022_03', date='2022-08-03', replaces='', replaced_by=''),\n",
       " UniprotRecordHistory(entry_version='281', sequence_version='2', entry_name='EGFR_HUMAN', database='Swiss-Prot', number='2022_02', date='2022-05-25', replaces='', replaced_by=''),\n",
       " UniprotRecordHistory(entry_version='280', sequence_version='2', entry_name='EGFR_HUMAN', database='Swiss-Prot', number='2022_01', date='2022-02-23', replaces='', replaced_by=''),\n",
       " UniprotRecordHistory(entry_version='279', sequence_version='2', entry_name='EGFR_HUMAN', database='Swiss-Prot', number='2021_04', date='2021-09-29', replaces='', replaced_by=''),\n",
       " UniprotRecordHistory(entry_version='278', sequence_version='2', entry_name='EGFR_HUMAN', database='"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 50933 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from pypath.inputs import uniprot as iuniprot\n",
    "egfr_history = list(iuniprot.uniprot_history('P00533'))\n",
    "egfr_history"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:21:57.760689Z",
     "start_time": "2022-12-02T16:21:57.755888Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "UniprotRecordHistory(entry_version='283', sequence_version='2', entry_name='EGFR_HUMAN', database='Swiss-Prot', number='2022_04', date='2022-10-12', replaces='', replaced_by='')"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iuniprot.uniprot_recent_version('P00533')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:22:33.284487Z",
     "start_time": "2022-12-02T16:22:32.688929Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('ID', 'EGFR_HUMAN              Reviewed;        1210 AA.'),\n",
       " ('AC',\n",
       "  'P00533; O00688; O00732; P06268; Q14225; Q68GS5; Q92795; Q9BZS2; Q9GZX1;'),\n",
       " ('AC', 'Q9H2C9; Q9H3C9; Q9UMD7; Q9UMD8; Q9UMG5;'),\n",
       " ('DT', '21-JUL-1986, integrated into UniProtKB/Swiss-Prot.'),\n",
       " ('DT', '01-NOV-1997, sequence version 2.'),\n",
       " ('DT', '12-OCT-2022, entry version 283.'),\n",
       " ('DE', 'RecName: Full=Epidermal growth factor receptor {ECO:0000305};'),\n",
       " ('DE', 'EC=2.7.10.1;'),\n",
       " ('DE', 'AltName: Full=Proto-oncogene c-ErbB-1;'),\n",
       " ('DE', 'AltName: Full=Receptor tyrosine-protein kinase erbB-1;'),\n",
       " ('DE', 'Flags: Precursor;'),\n",
       " ('GN', 'Name=EGFR {ECO:0000312|HGNC:HGNC:3236}; Synonyms=ERBB, ERBB1, HER1;'),\n",
       " ('OS', 'Homo sapiens (Human).'),\n",
       " ('OC',\n",
       "  'Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia;'),\n",
       " ('OC',\n",
       "  'Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae;'),\n",
       " ('OC', 'Homo.'),\n",
       " ('OX', 'NCBI_TaxID=9606;'),\n",
       " ('RN', '[1]'),\n",
       " ('RP',\n",
       "  'NUCLEOTIDE SEQUENCE [MRNA] (ISOFORM"
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 58080 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "iuniprot.uniprot_history_recent_datasheet('P00533')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The functions above are able to retrieve the latest datasheet of deleted UniProt records. However, they are slow as several queries are performed to process a single protein."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### UniProt REST API\n",
    "\n",
    "<a id=\"uniprot-data\"></a>\n",
    "UniProt deployed its new API in the autumn of 2022, and since then *pypath* has fully transitioned to the new API. It is accessed by the `inputs.uniprot.uniprot_data` and `inputs.uniprot.uniprot_query` functions, though for some purposes higher level functions are more convenient for the users. A list of fields can be passed to these functions. By default, only SwissProt is used. The output is a dict of dicts with fields as top level keys and UniProt IDs as second level keys. The results often contain notes, additional info in parentheses, prefixes and postfixes for identifiers, that are not needed in every situation. Using `uniprot_preprocess` instead of `uniprot_data` cleans up some of this clutter."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-16T03:24:10.019124Z",
     "start_time": "2023-11-16T03:23:41.546941Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'family': {'A0A087X1C5': 'Cytochrome P450 family',\n",
       "  'A0A0B4J2F2': 'Protein kinase superfamily, CAMK Ser/Thr protein kinase family, AMPK subfamily',\n",
       "  'A0A0K2S4Q6': 'CD300 family',\n",
       "  'A0A1B0GTW7': 'Peptidase M8 family',\n",
       "  'A0AV02': 'SLC12A transporter family',\n",
       "  'A0AV96': 'RRM RBM47 family',\n",
       "  'A0AVF1': 'IFT56 family',\n",
       "  'A0AVI4': 'TMEM129 family',\n",
       "  'A0AVK6': 'E2F/DP family',\n",
       "  'A0AVT1': 'Ubiquitin-activating E1 family',\n",
       "  'A0FGR8': 'Extended synaptotagmin family',\n",
       "  'A0FGR9': 'Extended synaptotagmin family',\n",
       "  'A0JLT2': 'Mediator complex subunit 19 family',\n",
       "  'A0JP26': 'POTE family',\n",
       "  'A0MZ66': 'Shootin family',\n",
       "  'A0PJK1': 'Sodium:solute symporter (SSF) (TC 2.A.21) family',\n",
       "  'A0PJY2': 'Krueppel C2H2-type zinc-finger protein family',\n",
       "  'A0PK00': 'TMEM120 family',\n",
       "  'A0PK11': 'Clarin family',\n",
       "  'A1A4Y4': 'TRAFAC class dynamin-like GTPase superfamily, IRG family',\n",
       "  'A1A519': 'FAM170 family',\n",
       "  'A1A5B4': 'Anoctamin family',\n",
       "  'A1A5C7': 'Major facilitator (TC 2.A.1) superfamily, Orga"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 510530 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from pypath.inputs import uniprot as iuniprot\n",
    "iuniprot.uniprot_data(fields = ('family', 'keywords', 'transmembrane'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `inputs.uniprot.query_builder` function builds queries for the API."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-16T03:30:18.713740Z",
     "start_time": "2023-11-16T03:30:18.708656Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'kinase AND organism_id:9606'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.inputs import uniprot\n",
    "uniprot.query_builder('kinase', organism_id = 9606)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-16T03:30:49.214855Z",
     "start_time": "2023-11-16T03:30:49.209908Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'(organism_id:9606 OR organism_id:10090 OR organism_id:10116)'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uniprot.query_builder(organism = [9606, 10090, 10116])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-16T03:31:22.912864Z",
     "start_time": "2023-11-16T03:31:22.903597Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'(organism_id:9606 AND reviewed:true)'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uniprot.query_builder({'organism_id': 9606, 'reviewed': True})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-16T03:31:41.015255Z",
     "start_time": "2023-11-16T03:31:41.008577Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'(length:[500 TO *] OR mass:[50000 TO *])'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uniprot.query_builder({'length': (500,), 'mass': (50000,), 'op': 'OR'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-16T03:32:21.260232Z",
     "start_time": "2023-11-16T03:32:21.251091Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'(lit_author:Huang AND lit_author:Kovac)'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uniprot.query_builder(lit_author = ['Huang', 'Kovac', '_AND'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-16T03:32:41.175601Z",
     "start_time": "2023-11-16T03:32:41.170799Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'((organism_id:9606 OR organism_id:10090) AND reviewed:true)'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uniprot.query_builder({'organism_id': [9606, 10090], 'reviewed': True})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-16T03:33:04.164851Z",
     "start_time": "2023-11-16T03:33:04.159090Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'(length:[100 TO *] AND organism_id:9606)'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uniprot.query_builder({'length': (100, None), 'organism_id': 9606})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The query parameters can be passed the same way to `uniprot_data` and `uniprot_query`. For example, to query records in one proteome:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-16T03:36:16.487881Z",
     "start_time": "2023-11-16T03:36:16.451042Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['D1YM56',\n",
       " 'D1YMJ2',\n",
       " 'D1YN32',\n",
       " 'D1YNB3',\n",
       " 'D1YPZ1',\n",
       " 'D1YR07',\n",
       " 'D1YR15',\n",
       " 'D1YR93',\n",
       " 'D1YRB4',\n",
       " 'D1YRB7']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.inputs import uniprot\n",
    "uniprot.uniprot_query(proteome = 'UP000004102')[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "All these functionalities are performed by the `pypath.inputs.uniprot.UniprotQuery` class."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Processed UniProt annotations\n",
    "\n",
    "For a few important fields we have dedicated processing functions with the aim of making their format cleaner and more usable. Sometimes even these do an imperfect job, and certain fields are badly truncated or contain residual fragments of the stripped labels.\n",
    "\n",
    "<div class=\"alert alert-block alert-success\"><b>Note:</b> All the data presented below is part of the OmniPath annotations database, the recommended way to access it is <a href=\"#an-dbmanager\">by the database manager</a>.</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:40:33.425325Z",
     "start_time": "2022-12-02T16:40:32.692822Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'P00521': {'Abelson murine leukemia virus'},\n",
       " 'P03333': {'Abelson murine leukemia virus'},\n",
       " 'H8ZM73': {'Abies balsamea', 'Balsam fir', 'Pinus balsamea'},\n",
       " 'H8ZM71': {'Abies balsamea', 'Balsam fir', 'Pinus balsamea'},\n",
       " 'Q9MV51': {'Abies firma', 'Momi fir'},\n",
       " 'O81086': {'Abies grandis', 'Grand fir', 'Pinus grandis'},\n",
       " 'O24474': {'Abies grandis', 'Grand fir', 'Pinus grandis'},\n",
       " 'O24475': {'Abies grandis', 'Grand fir', 'Pinus grandis'},\n",
       " 'O64404': {'Abies grandis', 'Grand fir', 'Pinus grandis'},\n",
       " 'O64405': {'Abies grandis', 'Grand fir', 'Pinus grandis'},\n",
       " 'Q948Z0': {'Abies grandis', 'Grand fir', 'Pinus grandis'},\n",
       " 'Q9M7D1': {'Abies grandis', 'Grand fir', 'Pinus grandis'},\n",
       " 'Q9M7D0': {'Abies grandis', 'Grand fir', 'Pinus grandis'},\n",
       " 'O22340': {'Abies grandis', 'Grand fir', 'Pinus grandis'},\n",
       " 'Q9M7C9': {'Abies grandis', 'Grand fir', 'Pinus grandis'},\n",
       " 'Q5K3V1': {'Abies homolepis', 'Nikko fir'},\n",
       " 'P21715': {'Abrothrix jelskii', 'Akodon jelskii', \"Jelski's altiplano mouse\"},\n",
       " 'P11140': {'Abru"
      ]
     },
     "execution_count": 136,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 56985 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from pypath.inputs import uniprot as iuniprot\n",
    "iuniprot.uniprot_taxonomy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:42:33.824020Z",
     "start_time": "2022-12-02T16:42:33.704114Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{648330: Taxon(ncbi_id=648330, latin='Aedes albopictus densovirus (isolate Boublik/1994)', english='AalDNV', latin_synonym=None),\n",
       " 10804: Taxon(ncbi_id=10804, latin='Adeno-associated virus 2', english='AAV-2', latin_synonym=None),\n",
       " 648242: Taxon(ncbi_id=648242, latin='Adeno-associated virus 2 (isolate Srivastava/1982)', english='AAV-2', latin_synonym=None),\n",
       " 118452: Taxon(ncbi_id=118452, latin='Abacion magnum', english='Millipede', latin_synonym=None),\n",
       " 72259: Taxon(ncbi_id=72259, latin='Abaeis nicippe', english='Sleepy orange butterfly', latin_synonym='Eurema nicippe'),\n",
       " 102642: Taxon(ncbi_id=102642, latin='Abax parallelepipedus', english='Ground beetle', latin_synonym=None),\n",
       " 392897: Taxon(ncbi_id=392897, latin='Abalistes stellaris', english='Starry triggerfish', latin_synonym='Balistes stellaris'),\n",
       " 75332: Taxon(ncbi_id=75332, latin='Abbottina rivularis', english='Chinese false gudgeon', latin_synonym='Gobio rivularis'),\n",
       " 515833: Taxon(ncbi_id=515833, latin='Abdopus aculeatus', engl"
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 118050 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "iuniprot.uniprot_ncbi_taxids_2()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:42:50.572753Z",
     "start_time": "2022-12-02T16:42:50.198376Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Q96EC8': {UniprotLocation(location='Golgi apparatus membrane', features=('Multi-pass membrane protein',))},\n",
       " 'Q6ZMS4': {UniprotLocation(location='Nucleus', features=None)},\n",
       " 'Q8N8L2': {UniprotLocation(location='Nucleus', features=None)},\n",
       " 'Q15916': {UniprotLocation(location='Nucleus', features=None)},\n",
       " 'Q3MIS6': {UniprotLocation(location='Nucleus', features=None)},\n",
       " 'Q6P280': {UniprotLocation(location='Nucleus', features=None)},\n",
       " 'Q969W1': {UniprotLocation(location='Endoplasmic reticulum membrane', features=('Multi-pass membrane protein',))},\n",
       " 'O14978': {UniprotLocation(location='Nucleus', features=None)},\n",
       " 'Q66K41': {UniprotLocation(location='Nucleus', features=None)},\n",
       " 'Q15937': {UniprotLocation(location='Nucleus', features=None)},\n",
       " 'Q9P2J8': {UniprotLocation(location='Nucleus', features=None)},\n",
       " 'Q8ND82': {UniprotLocation(location='Nucleus', features=None)},\n",
       " 'Q9NP64': {UniprotLocation(location='Nucleolus', features=None),\n",
       "  UniprotLocation(location='Nucleus', features=None)},\n",
       " 'P"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 143466 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "iuniprot.uniprot_locations()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:43:06.919185Z",
     "start_time": "2022-12-02T16:43:06.594985Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'P63120': {UniprotKeyword(keyword='Aspartyl protease'),\n",
       "  UniprotKeyword(keyword='Autocatalytic cleavage'),\n",
       "  UniprotKeyword(keyword='ERV'),\n",
       "  UniprotKeyword(keyword='Hydrolase'),\n",
       "  UniprotKeyword(keyword='Protease'),\n",
       "  UniprotKeyword(keyword='Reference proteome'),\n",
       "  UniprotKeyword(keyword='Ribosomal frameshifting'),\n",
       "  UniprotKeyword(keyword='Transposable element')},\n",
       " 'Q96EC8': {UniprotKeyword(keyword='Acetylation'),\n",
       "  UniprotKeyword(keyword='Alternative splicing'),\n",
       "  UniprotKeyword(keyword='Golgi apparatus'),\n",
       "  UniprotKeyword(keyword='Membrane'),\n",
       "  UniprotKeyword(keyword='Phosphoprotein'),\n",
       "  UniprotKeyword(keyword='Reference proteome'),\n",
       "  UniprotKeyword(keyword='Transmembrane'),\n",
       "  UniprotKeyword(keyword='Transmembrane helix')},\n",
       " 'Q6ZMS4': {UniprotKeyword(keyword='Metal-binding'),\n",
       "  UniprotKeyword(keyword='Nucleus'),\n",
       "  UniprotKeyword(keyword='Phosphoprotein'),\n",
       "  UniprotKeyword(keyword='Reference proteome'),\n",
       "  UniprotKeyword(keyword='Repeat'),\n",
       "  UniprotKeyword(keyword='Zinc'),\n",
       "  Unipro"
      ]
     },
     "execution_count": 141,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 445111 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "iuniprot.uniprot_keywords()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:43:22.297672Z",
     "start_time": "2022-12-02T16:43:22.216049Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'P63120': {UniprotFamily(family='Peptidase A2', subfamily='HERV class-II K(HML-2)')},\n",
       " 'Q96EC8': {UniprotFamily(family='YIP1', subfamily=None)},\n",
       " 'Q6ZMS4': {UniprotFamily(family='Krueppel C2H2-type zinc-finger protein', subfamily=None)},\n",
       " 'Q8N8L2': {UniprotFamily(family='Krueppel C2H2-type zinc-finger protein', subfamily=None)},\n",
       " 'Q3MIS6': {UniprotFamily(family='Krueppel C2H2-type zinc-finger protein', subfamily=None)},\n",
       " 'Q86UK7': {UniprotFamily(family='ZNF598', subfamily=None)},\n",
       " 'Q6P280': {UniprotFamily(family='Krueppel C2H2-type zinc-finger protein', subfamily=None)},\n",
       " 'Q969W1': {UniprotFamily(family='DHHC palmitoyltransferase', subfamily=None)},\n",
       " 'O14978': {UniprotFamily(family='Krueppel C2H2-type zinc-finger protein', subfamily=None)},\n",
       " 'Q15937': {UniprotFamily(family='Krueppel C2H2-type zinc-finger protein', subfamily=None)},\n",
       " 'Q9P2J8': {UniprotFamily(family='Krueppel C2H2-type zinc-finger protein', subfamily=None)},\n",
       " 'Q8IUH4': {UniprotFamily(family='DHHC palmitoyltransferase', "
      ]
     },
     "execution_count": 142,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 77892 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "iuniprot.uniprot_families()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:43:55.689899Z",
     "start_time": "2022-12-02T16:43:54.569998Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Q15916': {UniprotTissue(tissue='Brain', level='high'),\n",
       "  UniprotTissue(tissue='Wide', level='high')},\n",
       " 'Q969W1': {UniprotTissue(tissue='Wide', level='undefined')},\n",
       " 'O14978': {UniprotTissue(tissue='Brain', level='undefined'),\n",
       "  UniprotTissue(tissue='Colon', level='undefined'),\n",
       "  UniprotTissue(tissue='Heart', level='undefined'),\n",
       "  UniprotTissue(tissue='Kidney', level='undefined'),\n",
       "  UniprotTissue(tissue='Leukocyte', level='undefined'),\n",
       "  UniprotTissue(tissue='Liver', level='undefined'),\n",
       "  UniprotTissue(tissue='Lung', level='undefined'),\n",
       "  UniprotTissue(tissue='Ovary', level='undefined'),\n",
       "  UniprotTissue(tissue='Pancreas', level='undefined'),\n",
       "  UniprotTissue(tissue='Placenta', level='undefined'),\n",
       "  UniprotTissue(tissue='Prostate', level='undefined'),\n",
       "  UniprotTissue(tissue='Skeletal muscle', level='undefined'),\n",
       "  UniprotTissue(tissue='Small intestine', level='undefined'),\n",
       "  UniprotTissue(tissue='Spleen', level='undefined'),\n",
       "  UniprotTissue(tissue='Testis', level='undefined'),\n",
       "  Uniprot"
      ]
     },
     "execution_count": 143,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 318790 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "iuniprot.uniprot_tissues()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T16:44:13.937512Z",
     "start_time": "2022-12-02T16:44:13.640392Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Q96EC8': {UniprotTopology(topology='Cytoplasmic', start=2, end=84),\n",
       "  UniprotTopology(topology='Cytoplasmic', start=137, end=146),\n",
       "  UniprotTopology(topology='Cytoplasmic', start=206, end=212),\n",
       "  UniprotTopology(topology='Lumenal', start=106, end=115),\n",
       "  UniprotTopology(topology='Lumenal', start=168, end=184),\n",
       "  UniprotTopology(topology='Lumenal', start=234, end=236),\n",
       "  UniprotTopology(topology='Transmembrane', start=85, end=105),\n",
       "  UniprotTopology(topology='Transmembrane', start=116, end=136),\n",
       "  UniprotTopology(topology='Transmembrane', start=147, end=167),\n",
       "  UniprotTopology(topology='Transmembrane', start=185, end=205),\n",
       "  UniprotTopology(topology='Transmembrane', start=213, end=233)},\n",
       " 'Q969W1': {UniprotTopology(topology='Cytoplasmic', start=1, end=77),\n",
       "  UniprotTopology(topology='Cytoplasmic', start=138, end=198),\n",
       "  UniprotTopology(topology='Cytoplasmic', start=288, end=377),\n",
       "  UniprotTopology(topology='Lumenal', start=99, end=116),\n",
       "  UniprotTopology(topology='Lumenal', start=220,"
      ]
     },
     "execution_count": 144,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 544230 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "iuniprot.uniprot_topology()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### The UniProt utils module\n",
    "\n",
    "\n",
    "#### Datasheets\n",
    "\n",
    "The `pypath.utils.uniprot` module is an API around UniProt protein datasheets. It is not suitable for bulk retrieval: that would work but take a really long time. Calling its bulk methods with more than a few dozen or a few hundred proteins might take minutes, as it downloads protein datasheets one-by-one. To retrieve the full datasheets of one or more proteins use `query`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T17:57:18.978720Z",
     "start_time": "2022-12-02T17:57:18.147010Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<UniProt datasheet P00533 (EGFR)>,\n",
       " <UniProt datasheet O75385 (ULK1)>,\n",
       " <UniProt datasheet Q14457 (BECN1)>]"
      ]
     },
     "execution_count": 153,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.utils import uniprot\n",
    "uniprot.query('P00533', 'O75385', 'Q14457')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T17:57:58.735412Z",
     "start_time": "2022-12-02T17:57:58.727283Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<UniProt datasheet O75385 (ULK1)>"
      ]
     },
     "execution_count": 154,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ulk1 = uniprot.query('O75385')\n",
    "ulk1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Many attributes are available from the datasheet objects, just a few examples:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T17:59:18.446729Z",
     "start_time": "2022-12-02T17:59:18.438757Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(112631,\n",
       " 1050,\n",
       " 'Cytoplasm, cytosol. Preautophagosomal structure. Note=Under starvation conditions, is localized to puncate structures primarily representing the isolation membrane that sequesters a portion of the cytoplasm resulting in the formation of an autophagosome.',\n",
       " 'MEPGRGGTETVGKFEFSRKDLIGHGAFAVVFKGRHREKHDLEVAVKCINKKNLAKSQTLLGKEIKILKELKHENIVALYDFQEMANSVYLVMEYCNGGDLADYLHAMRTLSEDTIRLFLQQIAGAMRLLHSKGIIHRDLKPQNILLSNPAGRRANPNSIRVKIADFGFARYLQSNMMAATLCGSPMYMAPEVIMSQHYDGKADLWSIGTIVYQCLTGKAPFQASSPQDLRLFYEKNKTLVPTIPRETSAPLRQLLLALLQRNHKDRMDFDEFFHHPFLDASPSVRKSPPVPVPSYPSSGSGSSSSSSSTSHLASPPSLGEMQQLQKTLASPADTAGFLHSSRDSGGSKDSSCDTDDFVMVPAQFPGDLVAEAPSAKPPPDSLMCSGSSLVASAGLESHGRTPSPSPPCSSSPSPSGRAGPFSSSRCGASVPIPVPTQVQNYQRIERNLQSPTQFQTPRSSAIRRSGSTSPLGFARASPSPPAHAEHGGVLARKMSLGGGRPYTPSPQVGTIPERPGWSGTPSPQGAEMRGGRSPRPGSSAPEHSPRTSGLGCRLHSAPNLSDLHVVRPKLPKPPTDPLGAVFSPPQASPPQPSHGLQSCRNLRGSPKLPDFLQRNPLPPILGSPTKAVPSFDFPKTPSSQNLLALLARQGVVMTPPRNRTLPDLSEVGPFHGQPLGPGLRPGEDPKGPFGRSFSTSRLTDLLLKAAFGTQAPDPGSTESLQEK"
      ]
     },
     "execution_count": 156,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 1329 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ulk1.weight, ulk1.length, ulk1.subcellular_location, ulk1.sequence"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `collect` function collects certain features for a set of proteins.\n",
    "\n",
    "<div class=\"alert alert-block alert-warning\"><b>Warning:</b> This is a really inefficient way of retrieving data from UniProt. If you work with more than a handful of proteins, go for <em><a href=\"uniprot-data\">pypath.inputs.uniprot_data</a></em> instead.</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T18:02:29.957514Z",
     "start_time": "2022-12-02T18:02:29.937820Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "OrderedDict([('ac', ['P00533', 'O75385', 'Q14457']),\n",
       "             ('weight', [134277, 112631, 51896]),\n",
       "             ('length', [1210, 1050, 450])])"
      ]
     },
     "execution_count": 158,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uniprot.collect(['P00533', 'O75385', 'Q14457'], 'weight', 'length')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Tables\n",
    "\n",
    "UniProt data can be printed to the console in a tabular format:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T18:07:18.462634Z",
     "start_time": "2022-12-02T18:07:18.443096Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "╒═══════╤════════╤══════════╤══════════╕\n",
      "│   No. │ ac     │   weight │   length │\n",
      "╞═══════╪════════╪══════════╪══════════╡\n",
      "│     1 │ P00533 │   134277 │     1210 │\n",
      "├───────┼────────┼──────────┼──────────┤\n",
      "│     2 │ O75385 │   112631 │     1050 │\n",
      "├───────┼────────┼──────────┼──────────┤\n",
      "│     3 │ Q14457 │    51896 │      450 │\n",
      "╘═══════╧════════╧══════════╧══════════╛\n"
     ]
    }
   ],
   "source": [
    "uniprot.print_features(['P00533', 'O75385', 'Q14457'], 'weight', 'length')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There is a shortcut to print essential characterization of proteins as such a table. The `info` function is really useful if you get to a set of proteins at some point of your analysis and you want to quickly check what kind they are. To iterate through multiple groups of proteins, use `utils.uniprot.browse`. The columns and format of these tables can be customized by `kwargs`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T18:09:45.056825Z",
     "start_time": "2022-12-02T18:09:45.039305Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=====> [3 proteins] <=====\n",
      "╒═══════╤════════╤══════════════╤══════════╤══════════╤═════════════╤══════════════╤════════════╤══════════════╕\n",
      "│   No. │ ac     │ genesymbol   │   length │   weight │ full_name   │ function_o   │ keywords   │ subcellula   │\n",
      "│       │        │              │          │          │             │ r_genecard   │            │ r_location   │\n",
      "│       │        │              │          │          │             │ s            │            │              │\n",
      "╞═══════╪════════╪══════════════╪══════════╪══════════╪═════════════╪══════════════╪════════════╪══════════════╡\n",
      "│     1 │ P00533 │ EGFR         │     1210 │   134277 │ Epidermal   │ Receptor     │ 3D-        │ Cell         │\n",
      "│       │        │              │          │          │ growth      │ tyrosine     │ structure, │ membrane;    │\n",
      "│       │        │              │          │          │ factor      │ kinase       │ Alternativ │ Single-      │\n",
      "│       │        │              │          │          │ receptor    │"
     ]
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 20254 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "uniprot.info(['P00533', 'O75385', 'Q14457'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sanitizing UniProt IDs\n",
    "\n",
    "It is important to know that the ID translation module always do a number of checks when translating to UniProt IDs. Unless the `uniprot_cleanup` parameter is disabled. It translates secondary IDs to primary, attempts to map TrEMBL IDs to SwissProts by gene symbols, removes IDs of other organisms or invalid format. To exploit this behaviour it's enough to map from UniProt to UniProt:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T18:20:02.181656Z",
     "start_time": "2022-12-02T18:20:02.177475Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'O75385'}"
      ]
     },
     "execution_count": 162,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.utils import mapping\n",
    "mapping.map_name('Q9UQ28', 'uniprot', 'uniprot')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Enzyme-substrate interactions <a class=\"anchor\" id=\"enz-sub\"></a>\n",
    "\n",
    "The database is an instance of `pypath.core.enz_sub.EnzymeSubstrateAggregator` class. The database is built with the default or current configuration by the `core.enz_sub.get_db` method.\n",
    "\n",
    "<div class=\"alert alert-block alert-warning\"><b>Warning:</b> it is recommended to access databases <a href=\"#es-dbmanager\">by the manager</a>. Running the code below takes really long and does not save or reload the database, it builds a fresh copy each time.</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:26:37.442882Z",
     "start_time": "2022-12-02T14:18:35.635700Z"
    }
   },
   "outputs": [],
   "source": [
    "from pypath.core import enz_sub\n",
    "es = enz_sub.get_db()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Instead, let's acquire the database from the manager:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T15:37:33.078304Z",
     "start_time": "2022-12-03T15:37:25.808193Z"
    }
   },
   "outputs": [],
   "source": [
    "from pypath import omnipath\n",
    "es = omnipath.db.get_db('enz_sub')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The database itself is stored as a dictionary (`EnzymeSubstrateAggregator.enz_sub`) with pairs of proteins as keys and a list of special objects representing enzyme-substrate interactions as values. These can be accessed by pairs of labels, identifiers or `Entity` objects, e.g. mTOR phosphorylates AKT1:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:40:55.408662Z",
     "start_time": "2022-12-02T14:40:55.144764Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<MTOR => Residue AKT1-1:S473:phosphorylation [Evidences: HPRD, KEA, MIMP, PhosphoSite, ProtMapper, REACH, SIGNOR, Sparser, dbPTM, phosphoELM (15 references)]>,\n",
       " <MTOR => Residue AKT1-1:T450:phosphorylation [Evidences: HPRD, MIMP, PhosphoSite, ProtMapper, phosphoELM (0 references)]>,\n",
       " <MTOR => Residue AKT1-1:T308:phosphorylation [Evidences: ProtMapper, Sparser (1 references)]>]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "es[('MTOR', 'AKT1')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Enzyme-substrate objects\n",
    "\n",
    "Let's take a closer look at one of the enzyme-PTM relationships, represented by `pypath.internals.intera.DomainMotif` objects. Below some of the attributes are shown:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:40:57.616453Z",
     "start_time": "2022-12-02T14:40:57.611744Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(<Entity: AKT1>,\n",
       " 'P31749',\n",
       " 1,\n",
       " <Residue AKT1-1:S473>,\n",
       " 'S',\n",
       " 473,\n",
       " 'phosphorylation',\n",
       " <Entity: MTOR>)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "e_ptm = es[('MTOR', 'AKT1')][0]\n",
    "e_ptm.ptm.protein, e_ptm.ptm.protein.identifier, e_ptm.ptm.isoform, e_ptm.ptm.residue, e_ptm.ptm.residue.name, e_ptm.ptm.residue.number, e_ptm.ptm.typ, e_ptm.domain.protein"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The resources and references are available in `Evidences` objects:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:41:00.590338Z",
     "start_time": "2022-12-02T14:41:00.583559Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Evidences: HPRD, KEA, MIMP, PhosphoSite, ProtMapper, REACH, SIGNOR, Sparser, dbPTM, phosphoELM (15 references)>"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "e_ptm.evidences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:41:03.165879Z",
     "start_time": "2022-12-02T14:41:03.157644Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'KEA', 'MIMP', 'PhosphoSite', 'ProtMapper', 'SIGNOR', 'dbPTM'}"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "e_ptm.evidences.get_resource_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T14:41:04.493144Z",
     "start_time": "2022-12-02T14:41:04.485845Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{<Reference: 14761976>,\n",
       " <Reference: 15047712>,\n",
       " <Reference: 15364915>,\n",
       " <Reference: 15718470>,\n",
       " <Reference: 15899889>,\n",
       " <Reference: 16221682>,\n",
       " <Reference: 17013611>,\n",
       " <Reference: 19844585>,\n",
       " <Reference: 20333297>,\n",
       " <Reference: 20489726>,\n",
       " <Reference: 21157483>,\n",
       " <Reference: 21592956>,\n",
       " <Reference: 23006971>,\n",
       " <Reference: 8978681>,\n",
       " <Reference: 9736715>}"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "e_ptm.evidences.get_references()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Enzyme-substrate data frame\n",
    "\n",
    "The dabase object is able to export its contents into a `pandas.DataFrame`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T15:37:39.891782Z",
     "start_time": "2022-12-03T15:37:38.858761Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>enzyme</th>\n",
       "      <th>enzyme_genesymbol</th>\n",
       "      <th>substrate</th>\n",
       "      <th>substrate_genesymbol</th>\n",
       "      <th>isoforms</th>\n",
       "      <th>residue_type</th>\n",
       "      <th>residue_offset</th>\n",
       "      <th>modification</th>\n",
       "      <th>sources</th>\n",
       "      <th>references</th>\n",
       "      <th>curation_effort</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>P31749</td>\n",
       "      <td>AKT1</td>\n",
       "      <td>P63104</td>\n",
       "      <td>YWHAZ</td>\n",
       "      <td>1</td>\n",
       "      <td>S</td>\n",
       "      <td>58</td>\n",
       "      <td>phosphorylation</td>\n",
       "      <td>HPRD;HPRD_MIMP;KEA;MIMP;PhosphoSite;PhosphoSit...</td>\n",
       "      <td>HPRD:11956222;KEA:11956222;KEA:12861023;KEA:16...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>P31749</td>\n",
       "      <td>AKT1</td>\n",
       "      <td>P63104</td>\n",
       "      <td>YWHAZ</td>\n",
       "      <td>1</td>\n",
       "      <td>S</td>\n",
       "      <td>184</td>\n",
       "      <td>phosphorylation</td>\n",
       "      <td>HPRD;HPRD_MIMP;KEA;MIMP;PhosphoSite_MIMP;phosp...</td>\n",
       "      <td>HPRD:11956222;KEA:11956222;KEA:15071501</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>P45983</td>\n",
       "      <td>MAPK8</td>\n",
       "      <td>P63104</td>\n",
       "      <td>YWHAZ</td>\n",
       "      <td>1</td>\n",
       "      <td>S</td>\n",
       "      <td>184</td>\n",
       "      <td>phosphorylation</td>\n",
       "      <td>HPRD;HPRD_MIMP;KEA;MIMP;PhosphoNetworks;Phosph...</td>\n",
       "      <td>HPRD:15696159;KEA:11956222;KEA:15071501;KEA:15...</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>P06493</td>\n",
       "      <td>CDK1</td>\n",
       "      <td>P11171</td>\n",
       "      <td>EPB41</td>\n",
       "      <td>1</td>\n",
       "      <td>S</td>\n",
       "      <td>712</td>\n",
       "      <td>phosphorylation</td>\n",
       "      <td>HPRD_MIMP;MIMP;PhosphoSite_MIMP;ProtMapper;REA...</td>\n",
       "      <td>ProtMapper:15525677;dbPTM:15525677;dbPTM:18220...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>P06493</td>\n",
       "      <td>CDK1</td>\n",
       "      <td>P11171</td>\n",
       "      <td>EPB41</td>\n",
       "      <td>1;2;5;7</td>\n",
       "      <td>T</td>\n",
       "      <td>60</td>\n",
       "      <td>phosphorylation</td>\n",
       "      <td>MIMP;PhosphoSite;PhosphoSite_MIMP;ProtMapper;R...</td>\n",
       "      <td>ProtMapper:15525677;dbPTM:15525677;dbPTM:2171679</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41421</th>\n",
       "      <td>P29597</td>\n",
       "      <td>TYK2</td>\n",
       "      <td>P51692</td>\n",
       "      <td>STAT5B</td>\n",
       "      <td>1</td>\n",
       "      <td>Y</td>\n",
       "      <td>699</td>\n",
       "      <td>phosphorylation</td>\n",
       "      <td>KEA</td>\n",
       "      <td>KEA:10830280;KEA:11751923;KEA:12411494</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41422</th>\n",
       "      <td>Q06418</td>\n",
       "      <td>TYRO3</td>\n",
       "      <td>P19174</td>\n",
       "      <td>PLCG1</td>\n",
       "      <td>1;2</td>\n",
       "      <td>Y</td>\n",
       "      <td>771</td>\n",
       "      <td>phosphorylation</td>\n",
       "      <td>KEA</td>\n",
       "      <td>KEA:12601080;KEA:15144186;KEA:15592455;KEA:160...</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41423</th>\n",
       "      <td>Q9H4A3</td>\n",
       "      <td>WNK1</td>\n",
       "      <td>Q8TAX0</td>\n",
       "      <td>OSR1</td>\n",
       "      <td>1</td>\n",
       "      <td>T</td>\n",
       "      <td>185</td>\n",
       "      <td>phosphorylation</td>\n",
       "      <td>KEA</td>\n",
       "      <td>KEA:18270262</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41424</th>\n",
       "      <td>Q9H4A3</td>\n",
       "      <td>WNK1</td>\n",
       "      <td>Q96J92</td>\n",
       "      <td>WNK4</td>\n",
       "      <td>1;3</td>\n",
       "      <td>S</td>\n",
       "      <td>335</td>\n",
       "      <td>phosphorylation</td>\n",
       "      <td>KEA</td>\n",
       "      <td>KEA:15883153</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41425</th>\n",
       "      <td>Q9NYL2</td>\n",
       "      <td>MAP3K20</td>\n",
       "      <td>Q92903</td>\n",
       "      <td>CDS1</td>\n",
       "      <td>1</td>\n",
       "      <td>T</td>\n",
       "      <td>68</td>\n",
       "      <td>phosphorylation</td>\n",
       "      <td>KEA</td>\n",
       "      <td>KEA:10973490</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>41426 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       enzyme enzyme_genesymbol substrate substrate_genesymbol isoforms  \\\n",
       "0      P31749              AKT1    P63104                YWHAZ        1   \n",
       "1      P31749              AKT1    P63104                YWHAZ        1   \n",
       "2      P45983             MAPK8    P63104                YWHAZ        1   \n",
       "3      P06493              CDK1    P11171                EPB41        1   \n",
       "4      P06493              CDK1    P11171                EPB41  1;2;5;7   \n",
       "...       ...               ...       ...                  ...      ...   \n",
       "41421  P29597              TYK2    P51692               STAT5B        1   \n",
       "41422  Q06418             TYRO3    P19174                PLCG1      1;2   \n",
       "41423  Q9H4A3              WNK1    Q8TAX0                 OSR1        1   \n",
       "41424  Q9H4A3              WNK1    Q96J92                 WNK4      1;3   \n",
       "41425  Q9NYL2           MAP3K20    Q92903                 CDS1        1   \n",
       "\n",
       "      residue_type  residue_offset     modification  \\\n",
       "0                S              58  phosphorylation   \n",
       "1                S             184  phosphorylation   \n",
       "2                S             184  phosphorylation   \n",
       "3                S             712  phosphorylation   \n",
       "4                T              60  phosphorylation   \n",
       "...            ...             ...              ...   \n",
       "41421            Y             699  phosphorylation   \n",
       "41422            Y             771  phosphorylation   \n",
       "41423            T             185  phosphorylation   \n",
       "41424            S             335  phosphorylation   \n",
       "41425            T              68  phosphorylation   \n",
       "\n",
       "                                                 sources  \\\n",
       "0      HPRD;HPRD_MIMP;KEA;MIMP;PhosphoSite;PhosphoSit...   \n",
       "1      HPRD;HPRD_MIMP;KEA;MIMP;PhosphoSite_MIMP;phosp...   \n",
       "2      HPRD;HPRD_MIMP;KEA;MIMP;PhosphoNetworks;Phosph...   \n",
       "3      HPRD_MIMP;MIMP;PhosphoSite_MIMP;ProtMapper;REA...   \n",
       "4      MIMP;PhosphoSite;PhosphoSite_MIMP;ProtMapper;R...   \n",
       "...                                                  ...   \n",
       "41421                                                KEA   \n",
       "41422                                                KEA   \n",
       "41423                                                KEA   \n",
       "41424                                                KEA   \n",
       "41425                                                KEA   \n",
       "\n",
       "                                              references  curation_effort  \n",
       "0      HPRD:11956222;KEA:11956222;KEA:12861023;KEA:16...               11  \n",
       "1                HPRD:11956222;KEA:11956222;KEA:15071501                3  \n",
       "2      HPRD:15696159;KEA:11956222;KEA:15071501;KEA:15...                9  \n",
       "3      ProtMapper:15525677;dbPTM:15525677;dbPTM:18220...                5  \n",
       "4       ProtMapper:15525677;dbPTM:15525677;dbPTM:2171679                3  \n",
       "...                                                  ...              ...  \n",
       "41421             KEA:10830280;KEA:11751923;KEA:12411494                3  \n",
       "41422  KEA:12601080;KEA:15144186;KEA:15592455;KEA:160...                8  \n",
       "41423                                       KEA:18270262                1  \n",
       "41424                                       KEA:15883153                1  \n",
       "41425                                       KEA:10973490                1  \n",
       "\n",
       "[41426 rows x 11 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "es.make_df()\n",
    "es.df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Protein sequences\n",
    "\n",
    "The APIs for sequences are very basic, because we've never really needed them; but the fundamentals are probably there to make a nice, powerful API. Still, I don't believe *pypath* will ever be strong in sequences, it's just not our main topic."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T19:40:09.732079Z",
     "start_time": "2022-12-02T19:40:09.295287Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10, 19, 'TFIIRCLQWT')"
      ]
     },
     "execution_count": 186,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.utils import homology\n",
    "seqc = homology.SequenceContainer(preload_seq = [9606])\n",
    "akt1 = seqc.get_seq('P31749')\n",
    "akt1.get_region(start = 10, end = 19, isoform = 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T19:44:52.083650Z",
     "start_time": "2022-12-02T19:44:51.992066Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'P63120': <pypath.utils.seq.Seq at 0x689900d45cc0>,\n",
       " 'Q96EC8': <pypath.utils.seq.Seq at 0x689908ea8f70>,\n",
       " 'Q6ZMS4': <pypath.utils.seq.Seq at 0x689908eaa4a0>,\n",
       " 'Q8N8L2': <pypath.utils.seq.Seq at 0x6899223538b0>,\n",
       " 'Q15916': <pypath.utils.seq.Seq at 0x689922353c70>,\n",
       " 'O60384': <pypath.utils.seq.Seq at 0x689922350730>,\n",
       " 'Q3MIS6': <pypath.utils.seq.Seq at 0x689922353310>,\n",
       " 'Q86UK7': <pypath.utils.seq.Seq at 0x689922353760>,\n",
       " 'Q6P280': <pypath.utils.seq.Seq at 0x689922353190>,\n",
       " 'Q969W1': <pypath.utils.seq.Seq at 0x689922350d90>,\n",
       " 'O14978': <pypath.utils.seq.Seq at 0x689922353220>,\n",
       " 'P61129': <pypath.utils.seq.Seq at 0x689922353370>,\n",
       " 'Q66K41': <pypath.utils.seq.Seq at 0x6899223534f0>,\n",
       " 'Q15937': <pypath.utils.seq.Seq at 0x689922350c70>,\n",
       " 'Q9P2J8': <pypath.utils.seq.Seq at 0x689922351450>,\n",
       " 'Q8ND82': <pypath.utils.seq.Seq at 0x689922353910>,\n",
       " 'Q9NP64': <pypath.utils.seq.Seq at 0x6899223502b0>,\n",
       " 'P98182': <pypath.utils.seq.Seq at 0x689922350280>,\n",
       " 'Q8IUH4': <pypath.utils.seq.Seq at 0x68992235"
      ]
     },
     "execution_count": 187,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 53045 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from pypath.utils import seq\n",
    "human_proteome = seq.swissprot_seq()\n",
    "human_proteome"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T19:48:41.267941Z",
     "start_time": "2022-12-02T19:48:41.261611Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[SeqLookup(isoform=1, offset=625)]"
      ]
     },
     "execution_count": 191,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(human_proteome['P00533'].findall('YGCT'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Annotations <a class=\"anchor\" id=\"annotations\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This database provides various annotations about the function, structure, localization and many other properties of the proteins and genes. The database is an instance of `pypath.core.annot.AnnotationTable` class. The database is built with the default or current configuration by the `core.annot.get_db` method.\n",
    "\n",
    "<div class=\"alert alert-block alert-warning\"><b>Warning:</b> it is recommended to access databases <a href=\"#an-dbmanager\">by the manager</a>. Running the code below takes really long and does not save or reload the database, it builds a fresh copy each time.</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:07:08.083875Z",
     "start_time": "2022-12-02T15:07:07.502326Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Annotation database: 3788067 records about 51636 entities from 78 resources>"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.core import annot\n",
    "an = annot.get_db()\n",
    "an"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load a single annotation resource\n",
    "\n",
    "The annotations database is huge, on disk it takes up 1-2 GB of space, it consists of 60-70 resources. But all these resources are not integrated with each other, each can be loaded individually, by their dedicated classes in the `core.annot` module. This practice can be recommended and will be supported better in the future. Let's load one resource:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T15:38:57.042470Z",
     "start_time": "2022-12-03T15:38:08.786534Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<CPAD annotations: 2308 records about 1358 entities>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.core import annot\n",
    "cpad = annot.Cpad()\n",
    "cpad"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The resulted object is derived from the `AnnotationBase` class, its data is stored under the `annot` attribute, in a dict where identifiers are keys and sets of annotation records are the values. The keys of the records are shown by the `get_names` method:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:06:45.366916Z",
     "start_time": "2022-12-02T15:06:45.357658Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('regulator_type',\n",
       " 'effect_on_pathway',\n",
       " 'pathway',\n",
       " 'effect_on_cancer',\n",
       " 'effect_on_cancer_outcome',\n",
       " 'cancer',\n",
       " 'pathway_category')"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cpad.get_names()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For each name we can list the possible values:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:06:47.363402Z",
     "start_time": "2022-12-02T15:06:47.355610Z"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Acute lymphoblastic leukemia (ALL) (precursor T lymphoblastic leukemia)',\n",
       " 'Acute myeloid leukemia (AML)',\n",
       " 'Basal cell carcinoma',\n",
       " 'Bladder cancer',\n",
       " 'Breast cancer',\n",
       " 'Cervical cancer',\n",
       " 'Cholangiocarcinoma',\n",
       " 'Choriocarcinoma',\n",
       " 'Chronic lymphocytic leukemia (CLL)',\n",
       " 'Chronic myeloid leukemia (CML)',\n",
       " 'Colorectal cancer',\n",
       " 'Endometrial cancer',\n",
       " 'Esophageal cancer',\n",
       " \"Ewing's sarcoma\",\n",
       " 'Gallbladder cancer',\n",
       " 'Gastric cancer',\n",
       " 'Glioma',\n",
       " 'Hepatocellular carcinoma',\n",
       " 'Hodgkin lymphoma',\n",
       " 'Infantile hemangioma',\n",
       " 'Laryngeal cancer',\n",
       " 'Malignant melanoma',\n",
       " 'Malignant pleural mesothelioma',\n",
       " 'Mantle cell lymphoma',\n",
       " 'Multiple myeloma',\n",
       " 'Nasopharyngeal cancer',\n",
       " 'Neuroblastoma',\n",
       " 'Non-small cell lung cancer',\n",
       " 'Oral cancer',\n",
       " 'Osteosarcoma',\n",
       " 'Ovarian cancer',\n",
       " 'Pancreatic cancer',\n",
       " 'Pituitary adenomas',\n",
       " 'Prostate cancer',\n",
       " 'Renal cell carcinoma',\n",
       " 'Small cell lung cancer',\n",
       " 'Squamous cell carcinoma',\n",
       " 'Synovial sarcoma',\n",
       " 'Testicular cancer',\n",
       " 'Thyroid cancer'}"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cpad.get_values('cancer')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Based on their annotations the `select` method filters the annotated molecules. For example, 78 complexes, miRNAs and proteins are annotated as inhibiting colorectal cancer:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:06:50.326964Z",
     "start_time": "2022-12-02T15:06:50.318535Z"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'A6NDV4',\n",
       " Complex: COMPLEX:O14745,\n",
       " Complex: COMPLEX:O14862,\n",
       " Complex: COMPLEX:O15169_P25054,\n",
       " Complex: COMPLEX:O94813,\n",
       " Complex: COMPLEX:O94953,\n",
       " Complex: COMPLEX:P00533,\n",
       " Complex: COMPLEX:P06733,\n",
       " Complex Glucose transporter complex 1: COMPLEX:P11166,\n",
       " Complex: COMPLEX:P25054,\n",
       " Complex: COMPLEX:P40261,\n",
       " Complex: COMPLEX:P49327,\n",
       " Complex: COMPLEX:P54687,\n",
       " Complex PTEN phosphatase complex: COMPLEX:P60484,\n",
       " Complex: COMPLEX:Q01973,\n",
       " Complex: COMPLEX:Q12888,\n",
       " Complex: COMPLEX:Q13620,\n",
       " Complex: COMPLEX:Q96CX2,\n",
       " Complex: COMPLEX:Q99558,\n",
       " 'MIMAT0000069',\n",
       " 'MIMAT0000089',\n",
       " 'MIMAT0000093',\n",
       " 'MIMAT0000262',\n",
       " 'MIMAT0000274',\n",
       " 'MIMAT0000422',\n",
       " 'MIMAT0000427',\n",
       " 'MIMAT0000437',\n",
       " 'MIMAT0000449',\n",
       " 'MIMAT0000455',\n",
       " 'MIMAT0000460',\n",
       " 'MIMAT0000461',\n",
       " 'MIMAT0000617',\n",
       " 'MIMAT0003266',\n",
       " 'MIMAT0003320',\n",
       " 'O14745',\n",
       " 'O14862',\n",
       " 'O15169',\n",
       " 'O75473',\n",
       " 'O75888',\n",
       " 'O76041',\n",
       " 'O94813',\n",
       " 'O94953',\n",
       " 'P00533',\n",
       " 'P06733',\n",
       " 'P06756',\n",
       " 'P11166',\n",
       " 'P13631',\n",
       " 'P22676',\n",
       " 'P25054',\n",
       " 'P25791',\n",
       " 'P40261',\n",
       " 'P49327',\n",
       " 'P546"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 1279 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "cpad.select(cancer = 'Colorectal cancer', effect_on_cancer = 'Inhibiting')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the full annotations database by the database manager\n",
    "\n",
    "Alternatively, the full annotations database can be accessed in the usual way:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Annotation database: 5490653 records about 50872 entities from 68 resources>"
      ]
     },
     "execution_count": 215,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "an = omnipath.db.get_db('annotations')\n",
    "an"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `AnnotationTable` object contains the resource-specific annotation objects under the `annots` attribute:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:07:39.944846Z",
     "start_time": "2022-12-02T15:07:39.732216Z"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CellTypist': <CellTypist annotations: 927 records about 473 entities>,\n",
       " 'Integrins': <Integrins annotations: 62 records about 62 entities>,\n",
       " 'CellCellInteractions': <CellCellInteractions annotations: 5544 records about 4960 entities>,\n",
       " 'PanglaoDB': <PanglaoDB annotations: 8479 records about 4813 entities>,\n",
       " 'Lambert2018': <Lambert2018 annotations: 3281 records about 3277 entities>,\n",
       " 'CancerSEA': <CancerSEA annotations: 2515 records about 1992 entities>,\n",
       " 'Phobius': <Phobius annotations: 35382 records about 35382 entities>,\n",
       " 'GO_Intercell': <GO_Intercell annotations: 48799 records about 18377 entities>,\n",
       " 'MatrixDB': <MatrixDB annotations: 18127 records about 15903 entities>,\n",
       " 'Surfaceome': <Surfaceome annotations: 3558 records about 3558 entities>,\n",
       " 'Matrisome': <Matrisome annotations: 1514 records about 1514 entities>,\n",
       " 'HPA_secretome': <HPA_secretome annotations: 3568 records about 3568 entities>,\n",
       " 'HPMR': <HPMR annotations: 1748 records about 1695 entities>,\n",
       " 'CPAD': <CPAD annotati"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 5842 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "an.annots"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For each of these you can query the names of the fields, their possible values and the set of proteins annotated with any combination of the values, just like for CPAD above. As another example, let's take a look at the Matrisome database:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:07:45.418110Z",
     "start_time": "2022-12-02T15:07:45.411857Z"
    }
   },
   "outputs": [],
   "source": [
    "matrisome = an.annots['Matrisome']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:07:49.460328Z",
     "start_time": "2022-12-02T15:07:49.450474Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('mainclass', 'subclass', 'subsubclass')"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "matrisome.get_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:07:53.409272Z",
     "start_time": "2022-12-02T15:07:53.403049Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Collagens',\n",
       " 'ECM Glycoproteins',\n",
       " 'ECM Regulators',\n",
       " 'ECM-affiliated Proteins',\n",
       " 'Proteoglycans',\n",
       " 'Secreted Factors',\n",
       " 'n/a'}"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "matrisome.get_values('subclass')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:07:56.776774Z",
     "start_time": "2022-12-02T15:07:56.762585Z"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'A6NMZ7',\n",
       " 'A8TX70',\n",
       " 'B4DZ39',\n",
       " Complex Collagen type I homotrimer: COMPLEX:P02452,\n",
       " Complex HT_DM_Cluster278: COMPLEX:P02452_P02462_P08572_P29400_P53420_Q01955_Q02388_Q14031_Q17RW2_Q8NFW1,\n",
       " Complex Collagen type I trimer: COMPLEX:P02452_P08123,\n",
       " Complex Collagen type II trimer: COMPLEX:P02458,\n",
       " Complex Collagen type XI trimer variant 1: COMPLEX:P02458_P12107_P13942,\n",
       " Complex: COMPLEX:P02458_P20908_P25067,\n",
       " Complex: COMPLEX:P02458_P20908_P25067_P29400,\n",
       " Complex: COMPLEX:P02458_P25067_P29400,\n",
       " Complex Collagen type III trimer: COMPLEX:P02461,\n",
       " Complex: COMPLEX:P02462,\n",
       " Complex Collagen type IV trimer variant 1: COMPLEX:P02462_P08572,\n",
       " Complex Collagen type XI trimer variant 2: COMPLEX:P05997_P12107,\n",
       " Complex Collagen type XI trimer variant 3: COMPLEX:P05997_P12107_P20908,\n",
       " Complex Collagen type V trimer variant 1: COMPLEX:P05997_P20908,\n",
       " Complex Collagen type V trimer variant 2: COMPLEX:P05997_P20908_P25940,\n",
       " Complex: COMPLEX:P08572,\n",
       " Complex: COMPLEX:P12109_P12110,\n",
       " Complex Collagen "
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 3072 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "matrisome.get_subset(subclass = 'Collagens')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load only selected annotations\n",
    "\n",
    "Another option is to load only certain annotation resources into an `AnnotationTable` object. We refer to the resources by class names. For example, if you only want to load the pathway membership annotations from SIGNOR, SignaLink, NetPath and KEGG, you can provide the names of the appropriate classes:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:09:48.261323Z",
     "start_time": "2022-12-02T15:09:36.190127Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Annotation database: 28745 records about 6762 entities from 4 resources>"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pathways = annot.AnnotationTable(\n",
    "    protein_sources = (\n",
    "        'SignalinkPathways',\n",
    "        'KeggPathways',\n",
    "        'NetpathPathways',\n",
    "        'SignorPathways',\n",
    "    ),\n",
    "    complex_sources = (),\n",
    ")\n",
    "pathways"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `AnnotationTable` object provides methods to query all resources together, or build a boolean array out of them. To see all annotations of one protein:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:10:17.758769Z",
     "start_time": "2022-12-02T15:10:17.754186Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[SignalinkPathway(pathway='Receptor tyrosine kinase'),\n",
       " SignalinkPathway(pathway='JAK/STAT'),\n",
       " KeggPathway(pathway='Proteoglycans in cancer'),\n",
       " KeggPathway(pathway='Regulation of actin cytoskeleton'),\n",
       " KeggPathway(pathway='Oxytocin signaling pathway'),\n",
       " KeggPathway(pathway='Phospholipase D signaling pathway'),\n",
       " KeggPathway(pathway='Pathways in cancer'),\n",
       " KeggPathway(pathway='Hepatocellular carcinoma'),\n",
       " KeggPathway(pathway='Colorectal cancer'),\n",
       " KeggPathway(pathway='Melanoma'),\n",
       " KeggPathway(pathway='EGFR tyrosine kinase inhibitor resistance'),\n",
       " KeggPathway(pathway='Human papillomavirus infection'),\n",
       " KeggPathway(pathway='Pancreatic cancer'),\n",
       " KeggPathway(pathway='Non-small cell lung cancer'),\n",
       " KeggPathway(pathway='Central carbon metabolism in cancer'),\n",
       " KeggPathway(pathway='Endocytosis'),\n",
       " KeggPathway(pathway='Endometrial cancer'),\n",
       " KeggPathway(pathway='Choline metabolism in cancer'),\n",
       " KeggPathway(pathway='Bladder cancer'),\n",
       " KeggPathway(pathway='Parathyroid hormone synthesis, secretion "
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 2540 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "pathways.all_annotations('P00533')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data frames of annotations\n",
    "\n",
    "Data from annotation objects can be exported to a `pandas.DataFrame`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T15:40:14.297643Z",
     "start_time": "2022-12-03T15:40:14.201860Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uniprot</th>\n",
       "      <th>genesymbol</th>\n",
       "      <th>entity_type</th>\n",
       "      <th>source</th>\n",
       "      <th>label</th>\n",
       "      <th>value</th>\n",
       "      <th>record_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Q16181</td>\n",
       "      <td>SEPT7</td>\n",
       "      <td>protein</td>\n",
       "      <td>CPAD</td>\n",
       "      <td>regulator_type</td>\n",
       "      <td>protein</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Q16181</td>\n",
       "      <td>SEPT7</td>\n",
       "      <td>protein</td>\n",
       "      <td>CPAD</td>\n",
       "      <td>effect_on_pathway</td>\n",
       "      <td>Upregulation</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Q16181</td>\n",
       "      <td>SEPT7</td>\n",
       "      <td>protein</td>\n",
       "      <td>CPAD</td>\n",
       "      <td>pathway</td>\n",
       "      <td>Actin cytoskeleton pathway</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Q16181</td>\n",
       "      <td>SEPT7</td>\n",
       "      <td>protein</td>\n",
       "      <td>CPAD</td>\n",
       "      <td>effect_on_cancer</td>\n",
       "      <td>Inhibiting</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Q16181</td>\n",
       "      <td>SEPT7</td>\n",
       "      <td>protein</td>\n",
       "      <td>CPAD</td>\n",
       "      <td>effect_on_cancer_outcome</td>\n",
       "      <td>inhibit glioma cell migration</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14396</th>\n",
       "      <td>COMPLEX:P30990</td>\n",
       "      <td>COMPLEX:NTS</td>\n",
       "      <td>complex</td>\n",
       "      <td>CPAD</td>\n",
       "      <td>cancer</td>\n",
       "      <td>Hepatocellular carcinoma</td>\n",
       "      <td>2306</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14397</th>\n",
       "      <td>COMPLEX:P30990</td>\n",
       "      <td>COMPLEX:NTS</td>\n",
       "      <td>complex</td>\n",
       "      <td>CPAD</td>\n",
       "      <td>effect_on_pathway</td>\n",
       "      <td>Upregulation</td>\n",
       "      <td>2307</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14398</th>\n",
       "      <td>COMPLEX:P30990</td>\n",
       "      <td>COMPLEX:NTS</td>\n",
       "      <td>complex</td>\n",
       "      <td>CPAD</td>\n",
       "      <td>pathway</td>\n",
       "      <td>ERK signaling pathway</td>\n",
       "      <td>2307</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14399</th>\n",
       "      <td>COMPLEX:P30990</td>\n",
       "      <td>COMPLEX:NTS</td>\n",
       "      <td>complex</td>\n",
       "      <td>CPAD</td>\n",
       "      <td>effect_on_cancer</td>\n",
       "      <td>Activating</td>\n",
       "      <td>2307</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14400</th>\n",
       "      <td>COMPLEX:P30990</td>\n",
       "      <td>COMPLEX:NTS</td>\n",
       "      <td>complex</td>\n",
       "      <td>CPAD</td>\n",
       "      <td>cancer</td>\n",
       "      <td>Gastric cancer</td>\n",
       "      <td>2307</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>14401 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              uniprot   genesymbol entity_type source  \\\n",
       "0              Q16181        SEPT7     protein   CPAD   \n",
       "1              Q16181        SEPT7     protein   CPAD   \n",
       "2              Q16181        SEPT7     protein   CPAD   \n",
       "3              Q16181        SEPT7     protein   CPAD   \n",
       "4              Q16181        SEPT7     protein   CPAD   \n",
       "...               ...          ...         ...    ...   \n",
       "14396  COMPLEX:P30990  COMPLEX:NTS     complex   CPAD   \n",
       "14397  COMPLEX:P30990  COMPLEX:NTS     complex   CPAD   \n",
       "14398  COMPLEX:P30990  COMPLEX:NTS     complex   CPAD   \n",
       "14399  COMPLEX:P30990  COMPLEX:NTS     complex   CPAD   \n",
       "14400  COMPLEX:P30990  COMPLEX:NTS     complex   CPAD   \n",
       "\n",
       "                          label                          value  record_id  \n",
       "0                regulator_type                        protein          0  \n",
       "1             effect_on_pathway                   Upregulation          0  \n",
       "2                       pathway     Actin cytoskeleton pathway          0  \n",
       "3              effect_on_cancer                     Inhibiting          0  \n",
       "4      effect_on_cancer_outcome  inhibit glioma cell migration          0  \n",
       "...                         ...                            ...        ...  \n",
       "14396                    cancer       Hepatocellular carcinoma       2306  \n",
       "14397         effect_on_pathway                   Upregulation       2307  \n",
       "14398                   pathway          ERK signaling pathway       2307  \n",
       "14399          effect_on_cancer                     Activating       2307  \n",
       "14400                    cancer                 Gastric cancer       2307  \n",
       "\n",
       "[14401 rows x 7 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cpad.make_df()\n",
    "cpad.df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The data frame has a long format. It can be converted to the more conventional wide format using standard `pandas` procedures (in tidyverse you would simply call `tidyr::pivot_wider`, whereas in `pandas` you have to do an unintuitive sequence of 6 calls):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T15:40:19.182603Z",
     "start_time": "2022-12-03T15:40:19.153975Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>label</th>\n",
       "      <th>uniprot</th>\n",
       "      <th>genesymbol</th>\n",
       "      <th>entity_type</th>\n",
       "      <th>cancer</th>\n",
       "      <th>effect_on_cancer</th>\n",
       "      <th>effect_on_cancer_outcome</th>\n",
       "      <th>effect_on_pathway</th>\n",
       "      <th>pathway</th>\n",
       "      <th>pathway_category</th>\n",
       "      <th>regulator_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Q16181</td>\n",
       "      <td>SEPT7</td>\n",
       "      <td>protein</td>\n",
       "      <td>Glioma</td>\n",
       "      <td>Inhibiting</td>\n",
       "      <td>inhibit glioma cell migration</td>\n",
       "      <td>Upregulation</td>\n",
       "      <td>Actin cytoskeleton pathway</td>\n",
       "      <td>Regulation of actin cytoskeleton</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>MIMAT0000431</td>\n",
       "      <td>hsa-miR-140</td>\n",
       "      <td>mirna</td>\n",
       "      <td>Squamous cell carcinoma</td>\n",
       "      <td>Inhibiting</td>\n",
       "      <td>suppress tumor cell migration and invasion</td>\n",
       "      <td>Upregulation</td>\n",
       "      <td>ADAM10 mediated Notch1 signaling pathway</td>\n",
       "      <td>Notch signaling pathway</td>\n",
       "      <td>mirna</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>MIMAT0005886</td>\n",
       "      <td>hsa-miR-1297</td>\n",
       "      <td>mirna</td>\n",
       "      <td>Prostate cancer</td>\n",
       "      <td>Inhibiting</td>\n",
       "      <td>inhibit proliferation and invasion</td>\n",
       "      <td>Upregulation</td>\n",
       "      <td>AEG1/Wnt signaling pathway</td>\n",
       "      <td>Wnt signaling pathway</td>\n",
       "      <td>mirna</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Q9UP65</td>\n",
       "      <td>PLA2G4C</td>\n",
       "      <td>protein</td>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>Inhibiting</td>\n",
       "      <td>inhibit EGF-induced chemotaxis</td>\n",
       "      <td>Downregulation</td>\n",
       "      <td>Akt signaling pathway</td>\n",
       "      <td>PI3K-Akt signaling pathway</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Q92600</td>\n",
       "      <td>CNOT9</td>\n",
       "      <td>protein</td>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>Inhibiting</td>\n",
       "      <td>suppress cell proliferation</td>\n",
       "      <td>Downregulation</td>\n",
       "      <td>Akt signaling pathway</td>\n",
       "      <td>PI3K-Akt signaling pathway</td>\n",
       "      <td>protein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2303</th>\n",
       "      <td>COMPLEX:P16422</td>\n",
       "      <td>COMPLEX:EPCAM</td>\n",
       "      <td>complex</td>\n",
       "      <td>Prostate cancer</td>\n",
       "      <td>Inhibiting</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Downregulation</td>\n",
       "      <td>PI3K-Akt-mTOR signaling pathway</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2304</th>\n",
       "      <td>COMPLEX:Q9Y6Y0</td>\n",
       "      <td>COMPLEX:IVNS1ABP</td>\n",
       "      <td>complex</td>\n",
       "      <td>Prostate cancer</td>\n",
       "      <td>Inhibiting</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Upregulation</td>\n",
       "      <td>Akt signaling pathway</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2305</th>\n",
       "      <td>COMPLEX:Q96CX2</td>\n",
       "      <td>COMPLEX:KCTD12</td>\n",
       "      <td>complex</td>\n",
       "      <td>Colorectal cancer</td>\n",
       "      <td>Inhibiting</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Upregulation</td>\n",
       "      <td>ERK signaling pathway</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2306</th>\n",
       "      <td>COMPLEX:P30990</td>\n",
       "      <td>COMPLEX:NTS</td>\n",
       "      <td>complex</td>\n",
       "      <td>Hepatocellular carcinoma</td>\n",
       "      <td>Activating</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Upregulation</td>\n",
       "      <td>Wnt/beta-catenin signaling pathway</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2307</th>\n",
       "      <td>COMPLEX:P30990</td>\n",
       "      <td>COMPLEX:NTS</td>\n",
       "      <td>complex</td>\n",
       "      <td>Gastric cancer</td>\n",
       "      <td>Activating</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Upregulation</td>\n",
       "      <td>ERK signaling pathway</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2308 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "label         uniprot        genesymbol entity_type                    cancer  \\\n",
       "0              Q16181             SEPT7     protein                    Glioma   \n",
       "1        MIMAT0000431       hsa-miR-140       mirna   Squamous cell carcinoma   \n",
       "2        MIMAT0005886      hsa-miR-1297       mirna           Prostate cancer   \n",
       "3              Q9UP65           PLA2G4C     protein             Breast cancer   \n",
       "4              Q92600             CNOT9     protein             Breast cancer   \n",
       "...               ...               ...         ...                       ...   \n",
       "2303   COMPLEX:P16422     COMPLEX:EPCAM     complex           Prostate cancer   \n",
       "2304   COMPLEX:Q9Y6Y0  COMPLEX:IVNS1ABP     complex           Prostate cancer   \n",
       "2305   COMPLEX:Q96CX2    COMPLEX:KCTD12     complex         Colorectal cancer   \n",
       "2306   COMPLEX:P30990       COMPLEX:NTS     complex  Hepatocellular carcinoma   \n",
       "2307   COMPLEX:P30990       COMPLEX:NTS     complex            Gastric cancer   \n",
       "\n",
       "label effect_on_cancer                    effect_on_cancer_outcome  \\\n",
       "0           Inhibiting               inhibit glioma cell migration   \n",
       "1           Inhibiting  suppress tumor cell migration and invasion   \n",
       "2           Inhibiting          inhibit proliferation and invasion   \n",
       "3           Inhibiting              inhibit EGF-induced chemotaxis   \n",
       "4           Inhibiting                 suppress cell proliferation   \n",
       "...                ...                                         ...   \n",
       "2303        Inhibiting                                         NaN   \n",
       "2304        Inhibiting                                         NaN   \n",
       "2305        Inhibiting                                         NaN   \n",
       "2306        Activating                                         NaN   \n",
       "2307        Activating                                         NaN   \n",
       "\n",
       "label effect_on_pathway                                   pathway  \\\n",
       "0          Upregulation                Actin cytoskeleton pathway   \n",
       "1          Upregulation  ADAM10 mediated Notch1 signaling pathway   \n",
       "2          Upregulation                AEG1/Wnt signaling pathway   \n",
       "3        Downregulation                     Akt signaling pathway   \n",
       "4        Downregulation                     Akt signaling pathway   \n",
       "...                 ...                                       ...   \n",
       "2303     Downregulation           PI3K-Akt-mTOR signaling pathway   \n",
       "2304       Upregulation                     Akt signaling pathway   \n",
       "2305       Upregulation                     ERK signaling pathway   \n",
       "2306       Upregulation        Wnt/beta-catenin signaling pathway   \n",
       "2307       Upregulation                     ERK signaling pathway   \n",
       "\n",
       "label                  pathway_category regulator_type  \n",
       "0      Regulation of actin cytoskeleton        protein  \n",
       "1               Notch signaling pathway          mirna  \n",
       "2                 Wnt signaling pathway          mirna  \n",
       "3            PI3K-Akt signaling pathway        protein  \n",
       "4            PI3K-Akt signaling pathway        protein  \n",
       "...                                 ...            ...  \n",
       "2303                                NaN            NaN  \n",
       "2304                                NaN            NaN  \n",
       "2305                                NaN            NaN  \n",
       "2306                                NaN            NaN  \n",
       "2307                                NaN            NaN  \n",
       "\n",
       "[2308 rows x 10 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index_cols = ['record_id', 'uniprot', 'genesymbol', 'label', 'entity_type']\n",
    "\n",
    "(\n",
    "    cpad.df.drop('source', axis=1).\n",
    "    set_index(index_cols).\n",
    "    unstack('label').\n",
    "    droplevel(axis=1, level=0).\n",
    "    reset_index().\n",
    "    drop('record_id', axis=1)\n",
    ")   "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inter-cellular signaling roles <a class=\"anchor\" id=\"intercell\"></a>\n",
    "\n",
    "`pypath` does not combine the annotations in the `annot` module, exactly what goes in goes out. For example, WNT pathway from Signor and SignaLink won't be merged automatically. However with the `pypath.core.annot.CustomAnnotation` class anyone can do it. For inter-cellular communication categories the `pypath.core.intercell` module combines the data from all the relevant resources and creates categories based on a combination of evidences. The database is an instance of the `IntercellAnnotation` object, and the build is executed by the `pypath.core.intercell.get_db` function.\n",
    "\n",
    "<div class=\"alert alert-block alert-warning\"><b>Warning:</b> it is recommended to access databases <a href=\"#ic-dbmanager\">by the manager</a>. Running the code below takes really long and does not save or reload the database, it builds a fresh copy each time.</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:13:03.593954Z",
     "start_time": "2022-12-02T15:13:03.336236Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Intercell annotations: 310033 records about 43617 entities>"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.core import intercell\n",
    "ic = intercell.get_db() # this takes quite some time\n",
    "                       # unless you load annotations from a pickle cache\n",
    "ic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T15:43:27.834408Z",
     "start_time": "2022-12-03T15:40:32.368198Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Intercell annotations: 301527 records about 48570 entities>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath import omnipath\n",
    "ic = omnipath.db.get_db('intercell')\n",
    "ic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This object stores its data under the `classes` attribute. Classes are defined in `pypath.core.intercell_annot.annot_combined_classes`. In addition, we manually revised and excluded some proteins from the more generic classes, these are listed in `pypath.core.intercell_annot.excludes`. Each class has the following properties:\n",
    "\n",
    "- `name`: all lowercase, human understandable name, without repeating the parent class (e.g. *WNT receptor*s will be simply *wnt*, and the parent class will be *receptor*)\n",
    "- `parent`: for a *specific* class the parent is the *generic* category it belongs to; for generic classes the `name` and `parent` are the same\n",
    "- `resource`: the resource the data comes from, or *OmniPath* for composite classes (combined from multiple resources)\n",
    "- `scope`: *specific* or *generic*; e.g. *TGF ligand* is specific, *ligand* is generic\n",
    "- `aspect`: *locational* (e.g. *plasma membrane*) or *functional* (e.g. *transporter*)\n",
    "\n",
    "Read more about the design of the intercell database [in our paper](https://www.embopress.org/doi/full/10.15252/msb.20209923)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:16:54.631309Z",
     "start_time": "2022-12-02T15:16:54.600885Z"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{AnnotDefKey(name='transmembrane', parent='transmembrane', resource='UniProt_location'): <AnnotationGroup `transmembrane` from UniProt_location, 5150 elements>,\n",
       " AnnotDefKey(name='transmembrane', parent='transmembrane', resource='UniProt_topology'): <AnnotationGroup `transmembrane` from UniProt_topology, 5760 elements>,\n",
       " AnnotDefKey(name='transmembrane', parent='transmembrane', resource='UniProt_keyword'): <AnnotationGroup `transmembrane` from UniProt_keyword, 7041 elements>,\n",
       " AnnotDefKey(name='transmembrane', parent='transmembrane_predicted', resource='Phobius'): <AnnotationGroup `transmembrane` from Phobius, 6444 elements>,\n",
       " AnnotDefKey(name='transmembrane_phobius', parent='transmembrane_predicted', resource='Almen2009'): <AnnotationGroup `transmembrane_phobius` from Almen2009, 2072 elements>,\n",
       " AnnotDefKey(name='transmembrane_sosui', parent='transmembrane_predicted', resource='Almen2009'): <AnnotationGroup `transmembrane_sosui` from Almen2009, 1663 elements>,\n",
       " AnnotDefKey(name='trans"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 143945 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ic.classes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An easy way to access the classes is the `select` method. The `AnnotationGroup` objects behave as plain Python `set`s, and besides that, they feature many further attributes and methods."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:17:00.542493Z",
     "start_time": "2022-12-02T15:17:00.536417Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AnnotationGroup `gaba` from HGNC, 40 elements>"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gaba_receptors = ic.select('gaba', parent = 'receptor')\n",
    "gaba_receptors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:17:02.764333Z",
     "start_time": "2022-12-02T15:17:02.754080Z"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'A8MPY1',\n",
       " Complex GABA-A receptor (GABRA1, GABRB2, GABRD): COMPLEX:O14764_P14867_P47870,\n",
       " Complex GABA-A receptor, alpha-4/beta-3/delta: COMPLEX:O14764_P28472_P48169,\n",
       " Complex GABA-A receptor, alpha-6/beta-3/delta: COMPLEX:O14764_P28472_Q16445,\n",
       " Complex GABA-A receptor, alpha-4/beta-2/delta: COMPLEX:O14764_P47870_P48169,\n",
       " Complex GABA-A receptor, alpha-6/beta-2/delta: COMPLEX:O14764_P47870_Q16445,\n",
       " Complex GABBR1-GABBR2 complex: COMPLEX:O75899_Q9UBS5,\n",
       " Complex: COMPLEX:P14867,\n",
       " Complex GABA-A receptor, alpha-1/beta-3/gamma-2: COMPLEX:P14867_P18507_P28472,\n",
       " Complex GABA-A receptor (GABRA1, GABRB2, GABRG2): COMPLEX:P14867_P18507_P47870,\n",
       " Complex GABA-A receptor, alpha-5/beta-3/gamma-2: COMPLEX:P18507_P28472_P31644,\n",
       " Complex GABA-A receptor, alpha-3/beta-3/gamma-2: COMPLEX:P18507_P28472_P34903,\n",
       " Complex GABA-A receptor, alpha-2/beta-3/gamma-2: COMPLEX:P18507_P28472_P47869,\n",
       " Complex GABA-A receptor, alpha-6/beta-3/gamma-2: COMPLEX:P18507_P28472_Q16445,\n",
       " Complex: COMPLEX:P18507_Q8N1C3,\n",
       " C"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 1368 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "gaba_receptors.members"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build an intercellular communication network\n",
    "\n",
    "The intercell database can be connected to a `Network` object to create an intercellular communication network:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:17:08.247230Z",
     "start_time": "2022-12-02T15:17:08.243923Z"
    }
   },
   "outputs": [],
   "source": [
    "cu = omnipath.db.get_db('curated')\n",
    "ic.register_network(cu)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Quantitative overview of intercell annotations\n",
    "\n",
    "A data frame with basic statistics is available:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T15:45:17.978114Z",
     "start_time": "2022-12-03T15:45:17.681491Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>parent</th>\n",
       "      <th>database</th>\n",
       "      <th>scope</th>\n",
       "      <th>aspect</th>\n",
       "      <th>source</th>\n",
       "      <th>consensus_score</th>\n",
       "      <th>transmitter</th>\n",
       "      <th>receiver</th>\n",
       "      <th>secreted</th>\n",
       "      <th>plasma_membrane_transmembrane</th>\n",
       "      <th>plasma_membrane_peripheral</th>\n",
       "      <th>n_uniprot</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>transmembrane</td>\n",
       "      <td>transmembrane</td>\n",
       "      <td>UniProt_location</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>5150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>transmembrane</td>\n",
       "      <td>transmembrane</td>\n",
       "      <td>UniProt_topology</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>5760</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>transmembrane</td>\n",
       "      <td>transmembrane</td>\n",
       "      <td>UniProt_keyword</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>7041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>transmembrane</td>\n",
       "      <td>transmembrane_predicted</td>\n",
       "      <td>Phobius</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>6444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>transmembrane_phobius</td>\n",
       "      <td>transmembrane_predicted</td>\n",
       "      <td>Almen2009</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>2072</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1120</th>\n",
       "      <td>parin_adhesion_regulator</td>\n",
       "      <td>intracellular_intercellular_related</td>\n",
       "      <td>HGNC</td>\n",
       "      <td>specific</td>\n",
       "      <td>functional</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1121</th>\n",
       "      <td>plakophilin_adhesion_regulator</td>\n",
       "      <td>intracellular_intercellular_related</td>\n",
       "      <td>HGNC</td>\n",
       "      <td>specific</td>\n",
       "      <td>functional</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1122</th>\n",
       "      <td>actin_regulation_adhesome</td>\n",
       "      <td>intracellular_intercellular_related</td>\n",
       "      <td>Adhesome</td>\n",
       "      <td>specific</td>\n",
       "      <td>functional</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1123</th>\n",
       "      <td>adhesion_cytoskeleton_adaptor</td>\n",
       "      <td>intracellular_intercellular_related</td>\n",
       "      <td>Adhesome</td>\n",
       "      <td>specific</td>\n",
       "      <td>functional</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>118</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1124</th>\n",
       "      <td>intracellular_intercellular_related</td>\n",
       "      <td>intracellular_intercellular_related</td>\n",
       "      <td>OmniPath</td>\n",
       "      <td>generic</td>\n",
       "      <td>functional</td>\n",
       "      <td>composite</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>291</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1125 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 category  \\\n",
       "0                           transmembrane   \n",
       "1                           transmembrane   \n",
       "2                           transmembrane   \n",
       "3                           transmembrane   \n",
       "4                   transmembrane_phobius   \n",
       "...                                   ...   \n",
       "1120             parin_adhesion_regulator   \n",
       "1121       plakophilin_adhesion_regulator   \n",
       "1122            actin_regulation_adhesome   \n",
       "1123        adhesion_cytoskeleton_adaptor   \n",
       "1124  intracellular_intercellular_related   \n",
       "\n",
       "                                   parent          database     scope  \\\n",
       "0                           transmembrane  UniProt_location   generic   \n",
       "1                           transmembrane  UniProt_topology   generic   \n",
       "2                           transmembrane   UniProt_keyword   generic   \n",
       "3                 transmembrane_predicted           Phobius   generic   \n",
       "4                 transmembrane_predicted         Almen2009   generic   \n",
       "...                                   ...               ...       ...   \n",
       "1120  intracellular_intercellular_related              HGNC  specific   \n",
       "1121  intracellular_intercellular_related              HGNC  specific   \n",
       "1122  intracellular_intercellular_related          Adhesome  specific   \n",
       "1123  intracellular_intercellular_related          Adhesome  specific   \n",
       "1124  intracellular_intercellular_related          OmniPath   generic   \n",
       "\n",
       "          aspect             source  consensus_score  transmitter  receiver  \\\n",
       "0     locational  resource_specific                6        False     False   \n",
       "1     locational  resource_specific                6        False     False   \n",
       "2     locational  resource_specific                1        False     False   \n",
       "3     locational  resource_specific                1        False     False   \n",
       "4     locational  resource_specific                0        False     False   \n",
       "...          ...                ...              ...          ...       ...   \n",
       "1120  functional  resource_specific                0         True     False   \n",
       "1121  functional  resource_specific                0         True     False   \n",
       "1122  functional  resource_specific                0         True     False   \n",
       "1123  functional  resource_specific                0         True     False   \n",
       "1124  functional          composite                0         True     False   \n",
       "\n",
       "      secreted  plasma_membrane_transmembrane  plasma_membrane_peripheral  \\\n",
       "0        False                           True                       False   \n",
       "1        False                           True                       False   \n",
       "2        False                          False                       False   \n",
       "3        False                          False                       False   \n",
       "4        False                           True                       False   \n",
       "...        ...                            ...                         ...   \n",
       "1120     False                          False                       False   \n",
       "1121     False                          False                       False   \n",
       "1122     False                          False                       False   \n",
       "1123     False                          False                       False   \n",
       "1124     False                          False                       False   \n",
       "\n",
       "      n_uniprot  \n",
       "0          5150  \n",
       "1          5760  \n",
       "2          7041  \n",
       "3          6444  \n",
       "4          2072  \n",
       "...         ...  \n",
       "1120          5  \n",
       "1121          3  \n",
       "1122         22  \n",
       "1123        118  \n",
       "1124        291  \n",
       "\n",
       "[1125 rows x 13 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ic.counts_df()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Intercell database as data frame\n",
    "\n",
    "Just like the other databases, the object can be exported into a `pandas.DataFrame`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T15:45:46.897012Z",
     "start_time": "2022-12-03T15:45:24.178357Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>parent</th>\n",
       "      <th>database</th>\n",
       "      <th>scope</th>\n",
       "      <th>aspect</th>\n",
       "      <th>source</th>\n",
       "      <th>uniprot</th>\n",
       "      <th>genesymbol</th>\n",
       "      <th>entity_type</th>\n",
       "      <th>consensus_score</th>\n",
       "      <th>transmitter</th>\n",
       "      <th>receiver</th>\n",
       "      <th>secreted</th>\n",
       "      <th>plasma_membrane_transmembrane</th>\n",
       "      <th>plasma_membrane_peripheral</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>transmembrane</td>\n",
       "      <td>transmembrane</td>\n",
       "      <td>UniProt_location</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>Q96JP9</td>\n",
       "      <td>CDHR1</td>\n",
       "      <td>protein</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>transmembrane</td>\n",
       "      <td>transmembrane</td>\n",
       "      <td>UniProt_location</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>Q9P126</td>\n",
       "      <td>CLEC1B</td>\n",
       "      <td>protein</td>\n",
       "      <td>8</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>transmembrane</td>\n",
       "      <td>transmembrane</td>\n",
       "      <td>UniProt_location</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>Q13585</td>\n",
       "      <td>GPR50</td>\n",
       "      <td>protein</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>transmembrane</td>\n",
       "      <td>transmembrane</td>\n",
       "      <td>UniProt_location</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>Q8N9I0</td>\n",
       "      <td>SYT2</td>\n",
       "      <td>protein</td>\n",
       "      <td>7</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>transmembrane</td>\n",
       "      <td>transmembrane</td>\n",
       "      <td>UniProt_location</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>O43614</td>\n",
       "      <td>HCRTR2</td>\n",
       "      <td>protein</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>transmembrane</td>\n",
       "      <td>transmembrane</td>\n",
       "      <td>UniProt_location</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>A6NJY1</td>\n",
       "      <td>SLC9B1P1</td>\n",
       "      <td>protein</td>\n",
       "      <td>4</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>transmembrane</td>\n",
       "      <td>transmembrane</td>\n",
       "      <td>UniProt_location</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>Q5RI15</td>\n",
       "      <td>COX20</td>\n",
       "      <td>protein</td>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>transmembrane</td>\n",
       "      <td>transmembrane</td>\n",
       "      <td>UniProt_location</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>Q13948</td>\n",
       "      <td>CUX1</td>\n",
       "      <td>protein</td>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>transmembrane</td>\n",
       "      <td>transmembrane</td>\n",
       "      <td>UniProt_location</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>Q8NGK4</td>\n",
       "      <td>OR52K1</td>\n",
       "      <td>protein</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>transmembrane</td>\n",
       "      <td>transmembrane</td>\n",
       "      <td>UniProt_location</td>\n",
       "      <td>generic</td>\n",
       "      <td>locational</td>\n",
       "      <td>resource_specific</td>\n",
       "      <td>Q8IYS2</td>\n",
       "      <td>KIAA2013</td>\n",
       "      <td>protein</td>\n",
       "      <td>7</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        category         parent          database    scope      aspect  \\\n",
       "0  transmembrane  transmembrane  UniProt_location  generic  locational   \n",
       "1  transmembrane  transmembrane  UniProt_location  generic  locational   \n",
       "2  transmembrane  transmembrane  UniProt_location  generic  locational   \n",
       "3  transmembrane  transmembrane  UniProt_location  generic  locational   \n",
       "4  transmembrane  transmembrane  UniProt_location  generic  locational   \n",
       "5  transmembrane  transmembrane  UniProt_location  generic  locational   \n",
       "6  transmembrane  transmembrane  UniProt_location  generic  locational   \n",
       "7  transmembrane  transmembrane  UniProt_location  generic  locational   \n",
       "8  transmembrane  transmembrane  UniProt_location  generic  locational   \n",
       "9  transmembrane  transmembrane  UniProt_location  generic  locational   \n",
       "\n",
       "              source uniprot genesymbol entity_type  consensus_score  \\\n",
       "0  resource_specific  Q96JP9      CDHR1     protein                6   \n",
       "1  resource_specific  Q9P126     CLEC1B     protein                8   \n",
       "2  resource_specific  Q13585      GPR50     protein                6   \n",
       "3  resource_specific  Q8N9I0       SYT2     protein                7   \n",
       "4  resource_specific  O43614     HCRTR2     protein                6   \n",
       "5  resource_specific  A6NJY1   SLC9B1P1     protein                4   \n",
       "6  resource_specific  Q5RI15      COX20     protein                5   \n",
       "7  resource_specific  Q13948       CUX1     protein                5   \n",
       "8  resource_specific  Q8NGK4     OR52K1     protein                6   \n",
       "9  resource_specific  Q8IYS2   KIAA2013     protein                7   \n",
       "\n",
       "   transmitter  receiver  secreted  plasma_membrane_transmembrane  \\\n",
       "0        False     False     False                           True   \n",
       "1        False     False     False                           True   \n",
       "2        False     False     False                           True   \n",
       "3        False     False     False                          False   \n",
       "4        False     False     False                           True   \n",
       "5        False     False     False                          False   \n",
       "6        False     False     False                          False   \n",
       "7        False     False     False                          False   \n",
       "8        False     False     False                          False   \n",
       "9        False     False     False                           True   \n",
       "\n",
       "   plasma_membrane_peripheral  \n",
       "0                       False  \n",
       "1                       False  \n",
       "2                       False  \n",
       "3                       False  \n",
       "4                       False  \n",
       "5                       False  \n",
       "6                       False  \n",
       "7                       False  \n",
       "8                       False  \n",
       "9                       False  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ic.make_df()\n",
    "ic.df[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Browse intercell categories\n",
    "\n",
    "Use the `select` method to access intercell classes:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:27:15.163519Z",
     "start_time": "2022-12-02T15:27:15.118303Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AnnotationGroup `neurotensin` from HGNC, 2 elements>"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ic.select(definition = 'neurotensin', parent = 'receptor')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Proteins in each category can be listed with their descriptions from UniProt. Loading the UniProt datasheets for each protein is a slow process, so we don't recommend calling this method on more than a few dozen proteins."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:35:58.169943Z",
     "start_time": "2022-12-02T15:35:57.449110Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=====> [2 proteins] <=====\n",
      "╒═══════╤════════╤══════════════╤══════════╤══════════╤═════════════╤══════════════╤════════════╤══════════════╕\n",
      "│   No. │ ac     │ genesymbol   │   length │   weight │ full_name   │ function_o   │ keywords   │ subcellula   │\n",
      "│       │        │              │          │          │             │ r_genecard   │            │ r_location   │\n",
      "│       │        │              │          │          │             │ s            │            │              │\n",
      "╞═══════╪════════╪══════════════╪══════════╪══════════╪═════════════╪══════════════╪════════════╪══════════════╡\n",
      "│     1 │ O95665 │ NTSR2        │      410 │    45385 │ Neurotensi  │ Receptor     │ Cell       │ Cell         │\n",
      "│       │        │              │          │          │ n receptor  │ for the tr   │ membrane,  │ membrane;    │\n",
      "│       │        │              │          │          │ type 2      │ idecapepti   │ Disulfide  │ Multi-pass   │\n",
      "│       │        │              │          │          │             │"
     ]
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 7598 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ic.show('neurotensin', parent = 'receptor')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Gene Ontology <a class=\"anchor\" id=\"gene-ontology\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`pypath.utils.go` is an almost standalone module for management of the Gene Ontology tree and annotations. The main objects here are `GeneOntology` and `GOAnnotation`. The former represents the ontology tree, i.e. terms and their relationships, the latter their assignment to gene products. Both provide many versatile methods for querying."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:36:46.059721Z",
     "start_time": "2022-12-02T15:36:44.796832Z"
    }
   },
   "outputs": [],
   "source": [
    "from pypath.utils import go\n",
    "goa = go.GOAnnotation()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:36:48.863117Z",
     "start_time": "2022-12-02T15:36:48.859703Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pypath.utils.go.GeneOntology at 0x689946b55570>"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "goa.ontology # the GeneOntology object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:36:50.648586Z",
     "start_time": "2022-12-02T15:36:50.644257Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pypath.utils.go.GOAnnotation at 0x68991cdc9b40>"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "goa # the GOAnnotation object"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Among many others, the most versatile method is `select` which is able to select the annotated gene products by various expressions built from GO terms or IDs. It understands `AND`, `OR`, `NOT` and parentheses."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:36:55.072995Z",
     "start_time": "2022-12-02T15:36:54.983381Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['P21333', 'P80108', 'P62258', 'Q9NRX4', 'P54710', 'Q8NER1', 'P01303']\n"
     ]
    }
   ],
   "source": [
    "query = \"\"\"(cell surface OR\n",
    "        external side of plasma membrane OR\n",
    "        extracellular region) AND\n",
    "        (regulation of transmembrane transporter activity OR\n",
    "        channel regulator activity)\"\"\"\n",
    "result = goa.select(query)\n",
    "print(list(result)[:7])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:36:56.986264Z",
     "start_time": "2022-12-02T15:36:56.974560Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'GO:0001507',\n",
       " 'GO:0001527',\n",
       " 'GO:0003351',\n",
       " 'GO:0003355',\n",
       " 'GO:0005201',\n",
       " 'GO:0005576',\n",
       " 'GO:0005577',\n",
       " 'GO:0005582',\n",
       " 'GO:0005583',\n",
       " 'GO:0005584',\n",
       " 'GO:0005585',\n",
       " 'GO:0005586',\n",
       " 'GO:0005587',\n",
       " 'GO:0005588',\n",
       " 'GO:0005590',\n",
       " 'GO:0005591',\n",
       " 'GO:0005592',\n",
       " 'GO:0005595',\n",
       " 'GO:0005596',\n",
       " 'GO:0005599',\n",
       " 'GO:0005601',\n",
       " 'GO:0005602',\n",
       " 'GO:0005604',\n",
       " 'GO:0005606',\n",
       " 'GO:0005607',\n",
       " 'GO:0005608',\n",
       " 'GO:0005609',\n",
       " 'GO:0005610',\n",
       " 'GO:0005611',\n",
       " 'GO:0005612',\n",
       " 'GO:0005614',\n",
       " 'GO:0005615',\n",
       " 'GO:0005616',\n",
       " 'GO:0006858',\n",
       " 'GO:0006859',\n",
       " 'GO:0006860',\n",
       " 'GO:0009519',\n",
       " 'GO:0010367',\n",
       " 'GO:0016914',\n",
       " 'GO:0016942',\n",
       " 'GO:0020003',\n",
       " 'GO:0020004',\n",
       " 'GO:0020005',\n",
       " 'GO:0020006',\n",
       " 'GO:0030020',\n",
       " 'GO:0030021',\n",
       " 'GO:0030023',\n",
       " 'GO:0030197',\n",
       " 'GO:0030345',\n",
       " 'GO:0030934',\n",
       " 'GO:0030935',\n",
       " 'GO:0030938',\n",
       " 'GO:0031012',\n",
       " 'GO:0031395',\n",
       " 'GO:0032311',\n",
       " 'GO:0032579',\n",
       " 'GO:0033165',\n",
       " 'GO:0033166',\n",
       " 'GO:0034358',\n",
       " 'GO:0034359',\n",
       " 'GO:0034360',\n",
       " 'GO:0034361',\n",
       " 'GO:0034362',\n",
       " 'GO:0034363',\n",
       " 'GO:0034364',\n",
       " 'GO:0034365',\n",
       " 'GO:00343"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 3104 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "goa.ontology.get_all_descendants('GO:0005576')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Protein complexes <a class=\"anchor\" id=\"complexes\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `pypath.core.complex` module builds a non-redundant list of complexes from about 12 original resources. Complexes are unique considering their set of components, and optionally carry stoichiometry information. Homomultimers are also included, hence some complexes consist only of a single kind of protein. The database is an instance of the `pypath.core.complex.ComplexAggregator` class and is built by the `pypath.core.complex.get_db` function.\n",
    "\n",
    "<div class=\"alert alert-block alert-warning\"><b>Warning:</b> it is recommended to access databases <a href=\"#co-dbmanager\">by the manager</a>. Running the code below takes really long and does not save or reload the database, it builds a fresh copy each time.</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:39:31.069128Z",
     "start_time": "2022-12-02T15:39:30.708117Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Complex database: 28173 complexes>"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pypath.core import complex\n",
    "co = complex.get_db()\n",
    "co.update_index()\n",
    "co"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To retrieve all complexes containing a specific protein, here MTOR:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:39:42.824102Z",
     "start_time": "2022-12-02T15:39:42.819187Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{Complex: COMPLEX:O00141_O15530_O75879_P23443_P34931_P42345_Q6R327_Q8N122_Q9BPZ7_Q9BVC4_Q9H672,\n",
       " Complex: COMPLEX:O00141_O15530_P07900_P23443_P31749_P31751_P42345_P78527_Q05513_Q05655_Q6R327_Q8N122_Q9BPZ7_Q9BVC4,\n",
       " Complex: COMPLEX:O00141_O15530_P0CG47_P0CG48_P23443_P42345_Q15118_Q6R327_Q8N122_Q96BR1_Q9BPZ7_Q9BVC4,\n",
       " Complex: COMPLEX:O00141_O15530_P23443_P42345_Q15118_Q6R327_Q8N122_Q96BR1_Q96J02_Q9BPZ7_Q9BVC4,\n",
       " Complex: COMPLEX:O00141_O75879_P0CG48_P23443_P34931_P42345_P62753_Q6R327_Q8N122_Q9BPZ7_Q9BVC4_Q9NY26,\n",
       " Complex: COMPLEX:O00141_P0CG48_P23443_P36894_P42345_P62942_P68106_Q15427_Q6R327_Q8N122_Q9BPZ7_Q9BVC4,\n",
       " Complex: COMPLEX:O00141_P0CG48_P23443_P42345_P46781_P62753_Q6R327_Q8N122_Q96KQ7_Q9BPZ7_Q9BVC4_Q9NY26,\n",
       " Complex: COMPLEX:O00141_P0CG48_P23443_P42345_P62753_P62942_Q6R327_Q8N122_Q9BPZ7_Q9BVC4_Q9NY26,\n",
       " Complex: COMPLEX:O00141_P0CG48_P23443_P42345_P62753_Q15172_Q6R327_Q8IW41_Q9BPZ7_Q9BVC4_Q9H672,\n",
       " Complex: COMPLEX:O00141_P0CG48_P23443_P42345_P62753_Q6R327_Q70Z35_Q8N122_Q8TCU6_Q9BPZ7"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 5348 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "co.proteins['P42345']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that some of the complexes have human readable names; these are preferred when printing, if available from any of the databases. Otherwise the complexes are labelled by `COMPLEX:list-of-components`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Protein complex objects\n",
    "\n",
    "Take a closer look at one complex object. The hash of the complex is equivalent to the string representation below, where the UniProt IDs are unique and alphabetically sorted. Hence you can look up complexes using strings as keys even though the dict keys are in fact `pypath.intera.Complex` objects:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:41:36.557282Z",
     "start_time": "2022-12-02T15:41:36.552728Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Complex CBP/p300: COMPLEX:Q09472_Q92793"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cplex = co.complexes['COMPLEX:Q09472_Q92793']\n",
    "cplex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:41:38.271651Z",
     "start_time": "2022-12-02T15:41:38.267131Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Q92793': 1, 'Q09472': 1}"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cplex.components # stoichiometry"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:41:39.693944Z",
     "start_time": "2022-12-02T15:41:39.688511Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Signor'}"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cplex.sources # resources"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Protein complex data frame\n",
    "\n",
    "The database can be exported into a `pandas.DataFrame`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-03T15:47:16.115381Z",
     "start_time": "2022-12-03T15:47:12.712922Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>components</th>\n",
       "      <th>components_genesymbols</th>\n",
       "      <th>stoichiometry</th>\n",
       "      <th>sources</th>\n",
       "      <th>references</th>\n",
       "      <th>identifiers</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NFY</td>\n",
       "      <td>P23511_P25208_Q13952</td>\n",
       "      <td>NFYA_NFYB_NFYC</td>\n",
       "      <td>1:1:1</td>\n",
       "      <td>CORUM;Compleat;PDB;Signor;ComplexPortal;hu.MAP...</td>\n",
       "      <td>15243141;14755292;9372932</td>\n",
       "      <td>Signor:SIGNOR-C1;CORUM:4478;Compleat:HC1449;in...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>mTORC2</td>\n",
       "      <td>P68104_P85299_Q6R327_Q8TB45_Q9BVC4</td>\n",
       "      <td>DEPTOR_EEF1A1_MLST8_PRR5_RICTOR</td>\n",
       "      <td>0:0:0:0:0</td>\n",
       "      <td>Signor</td>\n",
       "      <td></td>\n",
       "      <td>Signor:SIGNOR-C2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>mTORC1</td>\n",
       "      <td>P42345_Q8N122_Q8TB45_Q96B36_Q9BVC4</td>\n",
       "      <td>AKT1S1_DEPTOR_MLST8_MTOR_RPTOR</td>\n",
       "      <td>0:0:0:0:0</td>\n",
       "      <td>Signor</td>\n",
       "      <td></td>\n",
       "      <td>Signor:SIGNOR-C3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>SCF-betaTRCP</td>\n",
       "      <td>P63208_Q13616_Q9Y297</td>\n",
       "      <td>BTRC_CUL1_SKP1</td>\n",
       "      <td>1:1:1</td>\n",
       "      <td>CORUM;Compleat;Signor</td>\n",
       "      <td>9990852</td>\n",
       "      <td>Signor:SIGNOR-C5;CORUM:227;Compleat:HC757</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CBP/p300</td>\n",
       "      <td>Q09472_Q92793</td>\n",
       "      <td>CREBBP_EP300</td>\n",
       "      <td>0:0</td>\n",
       "      <td>Signor</td>\n",
       "      <td></td>\n",
       "      <td>Signor:SIGNOR-C6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28168</th>\n",
       "      <td>Npnt complex 2</td>\n",
       "      <td>Q5SZK8_Q6UXI9_Q86XX4</td>\n",
       "      <td>FRAS1_FREM2_NPNT</td>\n",
       "      <td>0:0:0</td>\n",
       "      <td>CellChatDB</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28169</th>\n",
       "      <td>NRP1_NRP2</td>\n",
       "      <td>O14786_O60462_Q9Y4D7</td>\n",
       "      <td>NRP1_NRP2_PLXND1</td>\n",
       "      <td>0:0:0</td>\n",
       "      <td>CellChatDB</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28170</th>\n",
       "      <td>NRP2_PLXNA2</td>\n",
       "      <td>O60462_O75051</td>\n",
       "      <td>NRP2_PLXNA2</td>\n",
       "      <td>0:0</td>\n",
       "      <td>CellChatDB</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28171</th>\n",
       "      <td>NRP2_PLXNA4</td>\n",
       "      <td>O60462_Q9HCM2</td>\n",
       "      <td>NRP2_PLXNA4</td>\n",
       "      <td>0:0</td>\n",
       "      <td>CellChatDB</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28172</th>\n",
       "      <td>PTCH2_SMO</td>\n",
       "      <td>Q99835_Q9Y6C5</td>\n",
       "      <td>PTCH2_SMO</td>\n",
       "      <td>0:0</td>\n",
       "      <td>CellChatDB</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>28173 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 name                          components  \\\n",
       "0                 NFY                P23511_P25208_Q13952   \n",
       "1              mTORC2  P68104_P85299_Q6R327_Q8TB45_Q9BVC4   \n",
       "2              mTORC1  P42345_Q8N122_Q8TB45_Q96B36_Q9BVC4   \n",
       "3        SCF-betaTRCP                P63208_Q13616_Q9Y297   \n",
       "4            CBP/p300                       Q09472_Q92793   \n",
       "...               ...                                 ...   \n",
       "28168  Npnt complex 2                Q5SZK8_Q6UXI9_Q86XX4   \n",
       "28169       NRP1_NRP2                O14786_O60462_Q9Y4D7   \n",
       "28170     NRP2_PLXNA2                       O60462_O75051   \n",
       "28171     NRP2_PLXNA4                       O60462_Q9HCM2   \n",
       "28172       PTCH2_SMO                       Q99835_Q9Y6C5   \n",
       "\n",
       "                components_genesymbols stoichiometry  \\\n",
       "0                       NFYA_NFYB_NFYC         1:1:1   \n",
       "1      DEPTOR_EEF1A1_MLST8_PRR5_RICTOR     0:0:0:0:0   \n",
       "2       AKT1S1_DEPTOR_MLST8_MTOR_RPTOR     0:0:0:0:0   \n",
       "3                       BTRC_CUL1_SKP1         1:1:1   \n",
       "4                         CREBBP_EP300           0:0   \n",
       "...                                ...           ...   \n",
       "28168                 FRAS1_FREM2_NPNT         0:0:0   \n",
       "28169                 NRP1_NRP2_PLXND1         0:0:0   \n",
       "28170                      NRP2_PLXNA2           0:0   \n",
       "28171                      NRP2_PLXNA4           0:0   \n",
       "28172                        PTCH2_SMO           0:0   \n",
       "\n",
       "                                                 sources  \\\n",
       "0      CORUM;Compleat;PDB;Signor;ComplexPortal;hu.MAP...   \n",
       "1                                                 Signor   \n",
       "2                                                 Signor   \n",
       "3                                  CORUM;Compleat;Signor   \n",
       "4                                                 Signor   \n",
       "...                                                  ...   \n",
       "28168                                         CellChatDB   \n",
       "28169                                         CellChatDB   \n",
       "28170                                         CellChatDB   \n",
       "28171                                         CellChatDB   \n",
       "28172                                         CellChatDB   \n",
       "\n",
       "                      references  \\\n",
       "0      15243141;14755292;9372932   \n",
       "1                                  \n",
       "2                                  \n",
       "3                        9990852   \n",
       "4                                  \n",
       "...                          ...   \n",
       "28168                              \n",
       "28169                              \n",
       "28170                              \n",
       "28171                              \n",
       "28172                              \n",
       "\n",
       "                                             identifiers  \n",
       "0      Signor:SIGNOR-C1;CORUM:4478;Compleat:HC1449;in...  \n",
       "1                                       Signor:SIGNOR-C2  \n",
       "2                                       Signor:SIGNOR-C3  \n",
       "3              Signor:SIGNOR-C5;CORUM:227;Compleat:HC757  \n",
       "4                                       Signor:SIGNOR-C6  \n",
       "...                                                  ...  \n",
       "28168                                                     \n",
       "28169                                                     \n",
       "28170                                                     \n",
       "28171                                                     \n",
       "28172                                                     \n",
       "\n",
       "[28173 rows x 7 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "co.make_df()\n",
    "co.df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Saving datasets as pickles <a class=\"anchor\" id=\"pickle\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The large datasets above are compiled from many resources. Even if these are already available in the cache, the data processing often takes longer than convenient, e.g. from a few minutes up to half an hour. Most of the data integration objects in `pypath` provide methods to save and load their contents as pickle dumps. In fact, the database manager does this all the time, in a coordinated way -- for this reason, the methods below should be used only with good reason, and relying on the database manager is preferred."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for `pypath.annot.AnnotationTable` objects:\n",
    "a.save_to_pickle('myannots.pickle')\n",
    "a = annot.AnnotationTable(pickle_file = 'myannots.pickle')\n",
    "# for `pypath.core.complex.ComplexAggregator` objects:\n",
    "complexdb.save_to_pickle('mycomplexes.pickle')\n",
    "complexdb = complex.ComplexAggregator(pickle_file = 'mycomplexes.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Log messages and sessions <a class=\"anchor\" id=\"log-session\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In `pypath` all modules send messages to a log file named by default after the session ID (a 5 character random string). The default path to the log file is `./pypath_log/pypath-xxxxx.log` where `xxxxx` is the session ID.\n",
    "\n",
    "<div class=\"alert alert-block alert-warning\"><b>Warning:</b> The logger of <em>pypath</em> is really verbose, the log files can grow huge: several tens of thousands of lines, a few MBs. It is recommended to empty the <em>pypath_log</em> directories from time to time.</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Basic info about the session\n",
    "\n",
    "The `info` function prints the most important information about the current session:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:41:55.746575Z",
     "start_time": "2022-12-02T15:41:55.743592Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2022-12-02 16:41:55] [pypath] \n",
      "\t- session ID: `l0n17`\n",
      "\t- working directory: `/home/denes/pypath/notebooks`\n",
      "\t- logfile: `/home/denes/pypath/notebooks/pypath_log/pypath-l0n17.log`\n",
      "\t- pypath version: 0.14.31\n"
     ]
    }
   ],
   "source": [
    "import pypath\n",
    "pypath.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Another function prints a disclaimer about licenses. Until recently this message was printed upon every import. It is still important, but we removed it because in certain situations it can be annoying."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:41:59.555630Z",
     "start_time": "2022-12-02T15:41:59.552243Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\t=== d i s c l a i m e r ===\n",
      "\n",
      "\tAll data accessed through this module,\n",
      "\teither as redistributed copy or downloaded using the\n",
      "\tprogrammatic interfaces included in the present module,\n",
      "\tare free to use at least for academic research or\n",
      "\teducation purposes.\n",
      "\tPlease be aware of the licenses of all the datasets\n",
      "\tyou use in your analysis, and please give appropriate\n",
      "\tcredits for the original sources when you publish your\n",
      "\tresults. To find out more about data sources please\n",
      "\tlook at `pypath/resources/data/resources.json` or\n",
      "\thttps://omnipathdb.org/info and \n",
      "\t`pypath.resources.urls.urls`.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "pypath.disclaimer()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read the log file\n",
    "\n",
    "Calling ``pypath.log`` opens the log file in the default console application for paginating text files (on GNU systems typically ``less``):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:42:08.424260Z",
     "start_time": "2022-12-02T15:42:08.354352Z"
    }
   },
   "outputs": [],
   "source": [
    "pypath.log()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The logger and the log file are bound to the session (the 5 random characters are the session ID):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:42:27.611191Z",
     "start_time": "2022-12-02T15:42:27.605545Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Session l0n17>"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pypath.session"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The logger:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:42:46.724382Z",
     "start_time": "2022-12-02T15:42:46.717248Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Logger [/home/denes/pypath/notebooks/pypath_log/pypath-l0n17.log]"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pypath.session.log"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The path to the log file:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:42:49.629139Z",
     "start_time": "2022-12-02T15:42:49.621082Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/home/denes/pypath/notebooks/pypath_log/pypath-l0n17.log'"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pypath.session.log.fname"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Logging to the console\n",
    "\n",
    "Each log message has a numeric priority level, and messages with lower level than a threshold are printed to the console. By default only important warnings are dispatched to the console. To log everything to the console, set the threshold to a large number:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:42:56.120444Z",
     "start_time": "2022-12-02T15:42:55.787700Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2022-12-02 16:42:55] [curl] Creating Curl object to retrieve data from `https://signor.uniroma2.it/download_complexes.php`\n",
      "[2022-12-02 16:42:55] [curl] Cache file path: `/home/denes/.pypath/cache/d7b8673e83e43a01c533f9de5a2b04b9-download_complexes.php`\n",
      "[2022-12-02 16:42:55] [curl] Cache file found, no need for download.\n",
      "[2022-12-02 16:42:55] [curl] Opening plain text file `/home/denes/.pypath/cache/d7b8673e83e43a01c533f9de5a2b04b9-download_complexes.php`.\n",
      "[2022-12-02 16:42:55] [curl] Creating Curl object to retrieve data from `https://signor.uniroma2.it/download_complexes.php`\n",
      "[2022-12-02 16:42:55] [curl] Cache file path: `/home/denes/.pypath/cache/d7b8673e83e43a01c533f9de5a2b04b9-download_complexes.php`\n",
      "[2022-12-02 16:42:55] [curl] Cache file found, no need for download.\n",
      "[2022-12-02 16:42:55] [curl] Opening plain text file `/home/denes/.pypath/cache/d7b8673e83e43a01c533f9de5a2b04b9-download_complexes.php`.\n",
      "[2022-12-02 16:42:55] [curl] Creating Curl object to retrieve data from `https"
     ]
    },
    {
     "data": {
      "text/html": [
       "<em>Output truncated: showing 1000 of 1046 characters</em>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "pypath.session.log.console_level = 10\n",
    "\n",
    "from pypath.inputs import signor\n",
    "\n",
    "si = signor.signor_interactions()\n",
    "pypath.session.log.console_level = -1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Disable logging\n",
    "\n",
    "To avoid creation of a log file (and the directory `pypath_log`) set the environment variable `PYPATH_LOG` or the `builtins.PYPATH_LOG` attribute:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# shell:\n",
    "export PYPATH_LOG=\"/dev/null\"\n",
    "# then, start Python and use pypath"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:43:10.716053Z",
     "start_time": "2022-12-02T15:43:10.710658Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import builtins\n",
    "builtins.PYPATH_LOG=os.devnull\n",
    "import pypath"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write to the log\n",
    "\n",
    "#### Sending a single message\n",
    "\n",
    "First we change the console level so we can see the log messages. The label is optional. The priority of the message is given by the `level`, notice that the second message won't be printed to the console as its level is higher than 10:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:43:13.777984Z",
     "start_time": "2022-12-02T15:43:13.771286Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2022-12-02 16:43:13] [book] Greetings from the pypath tutorial notebook! :)\n"
     ]
    }
   ],
   "source": [
    "pypath.session.log.console_level = 10\n",
    "pypath.session.log.msg('Greetings from the pypath tutorial notebook! :)', label = 'book')\n",
    "pypath.session.log.msg('Not important, not shown on console but printed to the logfile.', level = 11)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Connect a module or class to the pypath logger\n",
    "\n",
    "The preferred way of connecting to the logger is to make a class inherit from the `Logger` class. Here the `name` will be the default label for all messages coming from the instances of this class:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:43:17.429331Z",
     "start_time": "2022-12-02T15:43:17.425872Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2022-12-02 16:43:17] [child] Have a nice day! :D\n"
     ]
    }
   ],
   "source": [
    "from pypath.share import session\n",
    "\n",
    "class ChildOfLogger(session.Logger):\n",
    "    \n",
    "    def __init__(self):\n",
    "        \n",
    "        session.Logger.__init__(self, name = 'child')\n",
    "    \n",
    "    def say_something(self):\n",
    "        \n",
    "        self._log('Have a nice day! :D')\n",
    "\n",
    "\n",
    "col = ChildOfLogger()\n",
    "col.say_something()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Alternatively, a logger can be created anywhere and used from any module or function:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:43:20.372472Z",
     "start_time": "2022-12-02T15:43:20.367496Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2022-12-02 16:43:20] [mylogger] Message from a stray logger\n"
     ]
    }
   ],
   "source": [
    "from pypath.share import session\n",
    "\n",
    "_logger = session.Logger(name = 'mylogger')\n",
    "_log = _logger._log\n",
    "\n",
    "_log('Message from a stray logger')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally we just set the console level to a lower value, to avoid flooding the rest of this book with log messages:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-12-02T15:43:23.621368Z",
     "start_time": "2022-12-02T15:43:23.615302Z"
    }
   },
   "outputs": [],
   "source": [
    "pypath.session.log.console = -1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BEL export <a class=\"anchor\" id=\"bel\"></a>\n",
    "\n",
    "<div class=\"alert alert-block alert-danger\"><b>Warning:</b> This section hasn't been thoroughly revised for long time, some parts might be outdated or broken.</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Biological Expression Language (BEL, https://bel-commons.scai.fraunhofer.de/) is a versatile description language to capture relationships between various biological entities spanning wide range of the levels of biological organization. `pypath` has a dedicated module to convert the network and the enzyme-substrate interactions to BEL format:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pypath.legacy import main\n",
    "from pypath.resources import data_formats\n",
    "from pypath.omnipath import bel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pa = main.PyPath()\n",
    "pa.init_network(data_formats.pathway)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can provide one or more resources to the `Bel` class. Supported resources currently are `pypath.main.PyPath` and `pypath.ptm.PtmAggregator`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "b = bel.Bel(resource = pa)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "From the resources we compile a `BELGraph` object which provides a Python interface for various operations and you can also export the data in BEL format:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "b.main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "b.bel_graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "b.bel_graph.summarize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "b.export_relationships('omnipath_pathways.bel')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('omnipath_pathways.bel', 'r') as fp:\n",
    "    bel_str = fp.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(bel_str[:333])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CellPhoneDB export <a class=\"anchor\" id=\"cellphonedb\"></a>\n",
    "\n",
    "<div class=\"alert alert-block alert-danger\"><b>Warning:</b> This section hasn't been thoroughly revised for long time, some parts might be outdated or broken.</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "CellPhoneDB is a statistical method and a database for inferring inter-cellular communication pathways between specific cell types from single-cell data. OmniPath/pypath uses CellPhoneDB as a resource for interaction, protein complex and annotation data. Apart from this, pypath is able to export its data in the appropriate format to provide input for the CellPhoneDB Python module. For this you can use the `pypath.cellphonedb` module:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pypath.omnipath import cellphonedb\n",
    "from pypath.share import settings\n",
    "\n",
    "settings.setup(network_expand_complexes = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here you can provide parameters for the network or provide an already built network. Also you can provide the datasets as pickles to make them load really fast. Otherwise this step will take quite long."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = cellphonedb.CellPhoneDB()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can access each of the CellPhoneDB input files as a `pandas.DataFrame` and also they've been exported to csv files. For example the `interaction_input.csv` contains interactions from all the resources used for building the network (here Signor, SingnaLink, etc.):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c.interaction_dataframe[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The proteins and complexes are annotated (transmembrane, peripheral, secreted, etc.) using data from the `pypath.intercell` module (identical to the http://omnipathdb.org/intercell query of the web service):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c.protein_dataframe[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The legacy *igraph*-based network object <a class=\"anchor\" id=\"legacy\"></a>\n",
    "\n",
    "<div class=\"alert alert-block alert-danger\"><b>Warning:</b> This section hasn't been thoroughly revised for long time, some parts might be outdated or broken.</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Until about 2019 (before pypath version 0.9) ``pypath`` used an `igraph.Graph` object ([igraph.org](http://igraph.org/)) to organize all data structures around. This legacy API still present in ``pypath.legacy.main``, however it is not maintained. This section of the book is still here, but will be removed soon, along with the `legacy` module."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No module `cairo` available.\n",
      "Some plotting functionalities won't be accessible.\n"
     ]
    }
   ],
   "source": [
    "from pypath.legacy import main"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pa = main.PyPath()\n",
    "#pa.load_omnipath() # This is commented out because it takes > 1h \n",
    "                    # to run it for the first time due to the vast\n",
    "                    # amount of data download.\n",
    "                    # Once you populated the cache it still takes\n",
    "                    # approx. 30 min to build the entire OmniPath\n",
    "                    # as the process consists of quite some data\n",
    "                    # processing. If you dump it in a pickle, you\n",
    "                    # can load the network in < 1 min"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### I just want a network quickly and play around with *pypath* <a class=\"anchor\" id=\"legacy-quick-start\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can find the predefined formats in the ``pypath.resources.network`` module. For example, to load one resource from there, let's say SIGNOR:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pypath.legacy import main\n",
    "from pypath.resources import network as netres\n",
    "pa = main.PyPath()\n",
    "pa.load_resources({'signor': netres.pathway['signor']})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Or to load all *activity flow* resources with *literature references:*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pypath.legacy import main\n",
    "from pypath.resources import network as netres"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pa = main.PyPath()\n",
    "pa.init_network(netres.pathway)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Or to load all *activity flow* resources, including the ones without literature references:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pa = main.PyPath()\n",
    "pa.init_network(data_formats.pathway_all)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How do I build networks from any data with *pypath*? <a class=\"anchor\" id=\"legacy-quick-start-2\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we show how to build a network from your own files. The advantage of building network with pypath is that you don't need to worry about merging redundant elements, neither about different formats and identifiers. Let's say you have two files with network data:\n",
    "\n",
    "**network1.csv**\n",
    "\n",
    "    entrezA,entrezB,effect\n",
    "    1950,1956,inhibition\n",
    "    5290,207,stimulation\n",
    "    207,2932,inhibition\n",
    "    1956,5290,stimulation\n",
    "\n",
    "**network2.sif**\n",
    "\n",
    "    EGF + EGFR\n",
    "    EGFR + PIK3CA\n",
    "    EGFR + SOS1\n",
    "    PIK3CA + RAC1\n",
    "    RAC1 + MAP3K1\n",
    "    SOS1 + HRAS\n",
    "    HRAS + MAP3K1\n",
    "    PIK3CA + AKT1\n",
    "    AKT1 - GSK3B\n",
    "    \n",
    "*Note: you need to create these files in order to load them.*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Defining input formats <a class=\"anchor\" id=\"input-formats\"></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pypath\n",
    "import pypath.iinput_formats as input_formats\n",
    "\n",
    "input1 = input_formats.ReadSettings(\n",
    "    name = 'egf1',\n",
    "    input = 'network1.csv',\n",
    "    header = True,\n",
    "    separator = ',',\n",
    "    id_col_a = 0,\n",
    "    id_col_b = 1,\n",
    "    id_type_a = 'entrez',\n",
    "    id_type_b = 'entrez',\n",
    "    sign = (2, 'stimulation', 'inhibition'),\n",
    "    ncbi_tax_id = 9606,\n",
    ")\n",
    "\n",
    "input2 = input_formats.ReadSettings(\n",
    "    name = 'egf2',\n",
    "    input = 'network2.sif',\n",
    "    separator = ' ',\n",
    "    id_col_a = 0,\n",
    "    id_col_b = 2,\n",
    "    id_type_a = 'genesymbol',\n",
    "    id_type_b = 'genesymbol',\n",
    "    sign = (1, '+', '-'),\n",
    "    ncbi_tax_id = 9606,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Creating PyPath object and loading the 2 test files <a class=\"anchor\" id=\"toy-example\"></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "inputs = {\n",
    "    'egf1': input1,\n",
    "    'egf2': input2\n",
    "}\n",
    "\n",
    "pa = main.PyPath()\n",
    "pa.reload()\n",
    "pa.init_network(lst = inputs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Structure of the legacy network object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pypath.legacy import main as legacy\n",
    "pa = legacy.PyPath()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pa.graph"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Number of edges and nodes:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pa.ecount, pa.vcount"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The edge and vertex sequences you can access in the `es` and `vs` attributes, you can iterate these or index by integers. The edge and vertex attributes you can access by string keys. E.g. get the sources of edge 0:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pa.graph.es[81]['sources']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Directions and signs <a class=\"anchor\" id=\"directions\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "By default the `igraph` object is undirected but it carries all direction information in Python objects assigned to each edge. Pypath can convert it to a directed `igraph` object, but you still need the `Direction` objects to have the signs, as `igraph` has no signed network representation. Certain methods need the directed `igraph` object and they will automatically create it, but you can create it manually:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pa.get_directed()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You find the directed network in the `pa.dgraph` attribute:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pa.dgraph"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's take a look on the `pypath.main.Direction` objects which contain details about directions and signs. First as an example, select a random edge:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "edge = pa.graph.es[3241]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `Direction` object is in the `dirs` edge attribute:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d = edge['dirs']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It has a method to print its content a human readable way:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(pa.graph.es[3241]['dirs'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "From this we see the databases phosphoELM and Signor agree that protein `P17252` has an effect on `Q15139` and Signor in addition tells us this effect is stimulatory. However in your scripts you can query the `Direction` objects a number of ways. Each `Direction` object calls the two possible directions either straight or reverse:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d.straight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d.reverse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It can tell you if one of these directions is supported by any of the network resources:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d.get_dir(d.straight)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Or it can return those resources:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d.get_dir(d.straight, sources = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The opposite direction is not supported by any resource:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d.get_dir(d.reverse, sources = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Similar way the signs can be queried. The returned pair of boolean values mean if the interaction in this direction is stimulatory or inhibitory, respectively."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d.get_sign(d.straight)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Or you can ask whether it is inhibition:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d.is_inhibition(d.straight)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Or if the interaction is directed at all:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d.is_directed()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sometimes resources don't agree, for example one tells an interaction is inhibition while according to others it is stimulation; or one tells A effects B and another resource the other way around. Here we preserve all these potentially contradicting information in the `Direction` object and at the end you decide what to do with it depending on your purpose. If you want to get rid of ambiguity there is a method to get a consensus direction and sign which returns the attributes the most resources agree on: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d.consensus_edges()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Accessing nodes in the network <a class=\"anchor\" id=\"nodes\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In `igraph` the vertices are numbered but this numbering can change at certain operations. Instead the we can use the vertex attributes. In `PyPath` for proteins the `name` attribute is UniProt ID by default and the `label` is Gene Symbol."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pa.graph.vs['name'][:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pa.graph.vs['label'][:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `PyPath` object offers a number of helper methods to access the nodes by their names. For example, `uniprot` or `up` returns the `igraph.Vertex` for a UniProt ID:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(pa.up('P00533'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Similarly `genesymbol` or `gs` for Gene Symbols:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(pa.gs('ESR1'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each of these has a \"plural\" version:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(list(pa.gss(['MTOR', 'ATG16L2', 'ULK1'])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And a generic method where you can mix UniProts and Gene Symbols:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(list(pa.proteins(['MTOR', 'P00533'])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Querying relationships with our without causality <a class=\"anchor\" id=\"causality\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Above you could see how to query the directions and names of individual edges and nodes. Building on top of these, other methods give a way to query causality, i.e. which proteins are affected by an other one, and which others are its regulators. The example below returns the nodes PIK3CA is stimulated by, the `gs` prefix tells we query by the Gene Symbol:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pa.gs_stimulated_by('PIK3CA')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It returns a so called `_NamedVertexSeq` object, which you can get a series of `igraph.Vertex` objects or Gene Symbols or UniProt IDs from:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(pa.gs_stimulated_by('PIK3CA').gs())[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(pa.gs_stimulated_by('PIK3CA').up())[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note, the names of these methods are a bit contraintuitive, the for example the `gs_stimulates` returns the genes stimulated by PIK3CA:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(pa.gs_stimulates('PIK3CA').gs())[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'PIK3CA' in set(pa.affected_by('AKT1').gs())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are many similary methods, `inhibited_by` returns negative regulators, `affected_by` does not consider +/- signs, without `gs_` and `up_` prefixes you can provide either of these identifiers, `neighbors` does not consider the direction. At the end `.gs()` converts the result for a list of Gene Symbols, `up()` to UniProts, `.ids()` to vertex IDs and by default it yields `igraph.Vertex` objects:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(pa.neighbors('AKT1').ids())[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, with `neighborhood` methods return the indirect neighborhood in custom number of steps (however size of the neighborhood increases rapidly with number of steps):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(list(pa.neighborhood('ATG3', 1).gs()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(list(pa.neighborhood('ATG3', 2).gs()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(list(pa.neighborhood('ATG3', 3).gs()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(list(pa.neighborhood('ATG3', 4).gs()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Accessing edges by identifiers <a class=\"anchor\" id=\"edge-lookup\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Just like nodes also edges can be accessed by identifiers like Gene Symbols. `get_edge` returns an `igraph.Edge` if the edge exists otherwise `None`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(pa.get_edge('EGF', 'EGFR'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(pa.get_edge('EGF', 'P00533'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(pa.get_edge('EGF', 'AKT1'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(pa.get_edge('EGF', 'EGFR')['dirs'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Literature references <a class=\"anchor\" id=\"references\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Select a random edge and in the `references` attribute you find a list of references:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "edge = pa.get_edge( 'MAP1LC3B', 'SQSTM1')\n",
    "edge['references']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each reference has a PubMed ID:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "edge['references'][0].pmid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "edge['references'][0].open()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "These 3 references come from 3 different databases, but there must be 2 overlaps between them:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "edge['refs_by_source']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Plotting the network with *igraph* <a class=\"anchor\" id=\"plot\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we use the network created above (because it is reasonable size, not like the networks we could get from most of the network databases). Igraph has excellent plotting abilities built on top of the *cairo* library."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import igraph\n",
    "plot = igraph.plot(pa.graph, target = 'egf_network.png',\n",
    "            edge_width = 0.3, edge_color = '#777777',\n",
    "            vertex_color = '#97BE73', vertex_frame_width = 0,\n",
    "            vertex_size = 70.0, vertex_label_size = 15,\n",
    "            vertex_label_color = '#FFFFFF',\n",
    "            # due to a bug in either igraph or IPython, \n",
    "            # vertex labels are not visible on inline plots:\n",
    "            inline = False, margin = 120)\n",
    "from IPython.display import Image\n",
    "Image(filename='egf_network.png')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {
    "height": "1278px",
    "width": "347px"
   },
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Contents",
   "title_sidebar": "The pypath book",
   "toc_cell": true,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "511.6px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "toc-autonumbering": true,
  "toc-showcode": false,
  "toc-showmarkdowntxt": false,
  "toc-showtags": false
 },
 "nbformat": 4,
 "nbformat_minor": 4
}