# -*- coding: utf-8 -*-
"""
Created on Mon 08/09/2021
Last updated on 02/15/2022

@author: mmaurya
"""

# Caution: User may have to change the index on the line below (inside fetch_page_from_url):
# html = list(soup.children)[1]

# Import the relevant components
from __future__ import print_function  # use the 3.x print function from a 2.7 interpreter
import sys, os, re, csv;

# set current dir to appropriate folder
# os.chdir('/var/www/html/geneid');

import platform;
osinfo = platform.platform();

# runfile('fetch_php_page.py')             # on Windows (anaconda, spyder)
# exec(open('fetch_php_page.py').read())

import numpy as np;
import pandas as pd;
import datetime;

# if needed, install using pip3 or conda:
# conda install requests
# conda install beautifulsoup4

# Import key libraries
import requests;
from bs4 import BeautifulSoup;

from inspect import currentframe, getframeinfo
# frameinfo = getframeinfo(currentframe()); print("Line No. = {0:d}".format(frameinfo.lineno));

GeneListSplitChar = "__";

# This API returns SYMBOL, GENENAME, ALIAS, ENTREZID, REFSEQ, ENSEMBL and UNIPROT
# baseurl = "https://sc-cfdewebdev.sdsc.edu/geneid/geneid_proc_selcol_GET.php?";
baseurl = "https://bdcw.org/geneid/geneid_proc_selcol_GET.php?";

species = "hsa";
GeneList = ["AIM1", "VWF"];  # for several genes, use __ e.g., AIM1__KLF2__E2F1 (since REFSEQ has _ in it)
GeneIDType = "SYMBOL_OR_ALIAS";

# another example (these assignments override the SYMBOL_OR_ALIAS example above)
GeneList = ["NM_002156", "NM_199440"];
GeneIDType = "REFSEQ";

GeneListStr = GeneListSplitChar.join(GeneList);  # e.g., "AIM1__VWF" for the SYMBOL example; for several genes, use __ (since REFSEQ has _ in it)

# Example complete URL
# php_url = "https://sc-cfdewebdev.sdsc.edu/geneid/geneid_proc_selcol_GET.php?species=hsa&GeneListStr=AIM1__KLF4&GeneIDType=SYMBOL_OR_ALIAS";
php_url = baseurl + "species=" + species + "&" + "GeneListStr=" + GeneListStr + "&" + "GeneIDType=" + GeneIDType;

def fetch_page_from_url(url_str):
    """Fetch the PHP/HTML page at url_str and return the first token of its body text
    (the URL of the tab-separated output file)."""
    debug = 0;
    resp = requests.get(url_str);
    # HTTP response code 200 means OK
    if(resp.status_code == 200):
        if(debug):
            print("Successfully opened the web page")
            print("The contents are as follows:\n")
        # we need a parser; Python's built-in HTML parser is enough
        soup = BeautifulSoup(resp.text, 'html.parser');
        if(debug):
            print("Using html.parser:\n");
            print(soup);  # print(soup.prettify())
        if(debug):
            print("Printing object named soup")
            print(soup);  # print(soup.prettify())
            print("Printing type(soup)")
            print(type(soup));
            print("Printing object named list(soup.children)")
            print(list(soup.children))
            print("Printing elements of list(soup.children)")
            [print(item) for item in list(soup.children)]
        # html = list(soup.children)[0];  # index depends on actual content
        html = list(soup.children)[1];  # updated 2022/08/22: index depends on actual content
        if(debug):
            print("Printing object named html");
            print(html);
        body = html.find_all('body')[0];
        body_text = body.get_text();
        if(debug):
            print(body);
            print(body_text);
        # body_text_lines1 = body_text.split('\n');  # https://pythonexamples.org/python-split-string-by-new-line/
        body_text_lines = re.split('\n+', body_text);  # remove empty lines
        body_text_lines = ' '.join(body_text_lines).split();
        # get the 1st one; if needed, can search: https://www.delftstack.com/howto/python/grep-python/
        oneline = body_text_lines[0];
        # print("Using html5lib:\n"); soup = BeautifulSoup(resp.content, 'html5lib'); print(soup.prettify());
        return oneline;
    else:
        print("Error: Code: " + str(resp.status_code))
        return None;

print("Going to fetch php/html data and extract the output file url")
file_url = fetch_page_from_url(php_url);
print("Going to fetch php/html ... Done")

# read using pandas
print("Going to fetch table using the output file url")
df = pd.read_csv(file_url, sep="\t")
df_cols = list(df.columns);
print("Converted Gene IDs are in the variable df (a pandas dataframe), see also df_cols")

# df[['SYMBOL', 'REFSEQ', 'UNIPROT']]
# a single column as a list:
# df['SYMBOL'].tolist()
# unique elements of a list: via np.array and np.unique
# list(np.unique(np.array(df['SYMBOL'].tolist())))
# list(np.unique(np.array(df['UNIPROT'].tolist())))