# -*- coding: utf-8 -*-
"""
Created on Mon 08/09/2021
Last updated on 02/15/2022

@author: mmaurya
"""

# Caution: User may have to change the index on the line below (inside fetch_page_from_url):
# html = list(soup.children)[1]

# Import the relevant components
from __future__ import print_function  # use the 3.x print function from a 2.7 interpreter
import sys, os, re, csv;

# set current dir to appropriate folder
# os.chdir('/var/www/html/geneid');

import platform;
osinfo = platform.platform();

# runfile('fetch_php_page.py')             # on Windows (anaconda, spyder)
# exec(open('fetch_php_page.py').read())

import numpy as np;
import pandas as pd;
import datetime;

# if needed, install using pip3 or conda:
# conda install requests
# conda install beautifulsoup4

# Import key libraries
import requests;
from bs4 import BeautifulSoup;

from inspect import currentframe, getframeinfo
# frameinfo = getframeinfo(currentframe()); print("Line No. = {0:d}".format(frameinfo.lineno));

GeneListSplitChar = "__";

# This API returns SYMBOL, GENENAME, ALIAS, ENTREZID, REFSEQ, ENSEMBL and UNIPROT
# baseurl = "https://sc-cfdewebdev.sdsc.edu/geneid/geneid_proc_selcol_GET.php?";
baseurl = "https://bdcw.org/geneid/geneid_proc_selcol_GET.php?";

species = "hsa";
GeneList = ["AIM1", "VWF"];  # for several genes, use __ e.g., AIM1__KLF2__E2F1 (since REFSEQ has _ in it)
GeneIDType = "SYMBOL_OR_ALIAS";

# another example (these assignments override the SYMBOL_OR_ALIAS example above)
GeneList = ["NM_002156", "NM_199440"];
GeneIDType = "REFSEQ";

GeneListStr = GeneListSplitChar.join(GeneList);  # e.g., "AIM1__VWF" for the SYMBOL example; for several genes, use __ (since REFSEQ has _ in it)

# Example complete URL
# php_url = "https://sc-cfdewebdev.sdsc.edu/geneid/geneid_proc_selcol_GET.php?species=hsa&GeneListStr=AIM1__KLF4&GeneIDType=SYMBOL_OR_ALIAS";
php_url = baseurl + "species=" + species + "&" + "GeneListStr=" + GeneListStr + "&" + "GeneIDType=" + GeneIDType;

def fetch_page_from_url(url_str):
    """Fetch the PHP/HTML page at url_str and return the first token of its body text
    (the URL of the tab-separated output file)."""
    debug = 0;
    resp = requests.get(url_str);
    # HTTP response code 200 means OK
    if(resp.status_code == 200):
        if(debug):
            print("Successfully opened the web page")
            print("The contents are as follows:\n")
        # we need a parser; Python's built-in HTML parser is enough
        soup = BeautifulSoup(resp.text, 'html.parser');
        if(debug):
            print("Using html.parser:\n");
            print(soup);  # print(soup.prettify())
        if(debug):
            print("Printing object named soup")
            print(soup);  # print(soup.prettify())
            print("Printing type(soup)")
            print(type(soup));
            print("Printing object named list(soup.children)")
            print(list(soup.children))
            print("Printing elements of list(soup.children)")
            [print(item) for item in list(soup.children)]
        # html = list(soup.children)[0];  # index depends on actual content
        html = list(soup.children)[1];  # updated 2022/08/22: index depends on actual content
        if(debug):
            print("Printing object named html");
            print(html);
        body = html.find_all('body')[0];
        body_text = body.get_text();
        if(debug):
            print(body);
            print(body_text);
        # body_text_lines1 = body_text.split('\n');  # https://pythonexamples.org/python-split-string-by-new-line/
        body_text_lines = re.split('\n+', body_text);  # remove empty lines
        body_text_lines = ' '.join(body_text_lines).split();
        # get the 1st one; if needed, can search: https://www.delftstack.com/howto/python/grep-python/
        oneline = body_text_lines[0];
        # print("Using html5lib:\n"); soup = BeautifulSoup(resp.content, 'html5lib'); print(soup.prettify());
        return oneline;
    else:
        print("Error: Code: " + str(resp.status_code))
        return None;

print("Going to fetch php/html data and extract the output file url")
file_url = fetch_page_from_url(php_url);
print("Going to fetch php/html ... Done")

# read using pandas
print("Going to fetch table using the output file url")
df = pd.read_csv(file_url, sep="\t")
df_cols = list(df.columns);
print("Converted Gene IDs are in the variable df (a pandas dataframe), see also df_cols")

# df[['SYMBOL', 'REFSEQ', 'UNIPROT']]
# a single column as a list:
# df['SYMBOL'].tolist()
# unique elements of a list: via np.array and np.unique
# list(np.unique(np.array(df['SYMBOL'].tolist())))
# list(np.unique(np.array(df['UNIPROT'].tolist())))