Source code for BioVirusHost

import pandas as pd
from pandas import DataFrame
from Bio import Entrez
import urllib

# must install BeautifulSoup4 for pandas.read.html

main_url = "https://www.genome.jp/virushostdb/"

[docs]def v_search(v_query):
    """

    Search for virus hosts by virus scientific name

    Args:

        v_query (list): list of virus(es) scientific name

    Returns:

        A generator object for Data frame(s) contains ( virus (species) name, virus lineage, host name, and host lineage )

    Raises:
        TypeError if argument (v_quer) is not a list

    """
    #rais error when the arg. is not a list
    if type (v_query) != list:
        raise TypeError ("v_search takes list as an argument")

    for i_virus in v_query:
        i_virus = i_virus.strip()
        i_virus = i_virus.capitalize()
      
        # creat the url
        sub_url = main_url + "view/?"
        dict_url = {"virus_scientific_name" : i_virus}
        api_request =  urllib.parse.urlencode(dict_url)
        api_request = sub_url + api_request

        #get the table results
        tables = pd.read_html(api_request)

        tables = tables[1]

        #correct the order

        tables = tables.iloc[::-1]

        #rest the index
        tables.reset_index(inplace=True, drop=True)

                #get the total result to view all of them 
        total = tables[1][1].split(" ")
        total = total[1]

        #reload the url to view all the result

        if len( set(tables[0]) ) - 2 != int(total):
            api_request = api_request + "&per_page=" + str(total)
            tables = pd.read_html(api_request)
            
            #get the table results
            tables = tables[1]

            #correct the order
            tables = tables.iloc[::-1]

            #rest the index
            tables.reset_index(inplace=True, drop=True)
        else:
            pass

        #drop unnecessarily row
        tables.drop([1],inplace = True,axis = 0)
        tables.drop([4,5,6],inplace = True,axis = 1)

        #firt row as header
        tables.reset_index(inplace=True, drop=True)
        tables.rename(columns=tables.iloc[0],inplace=True)
        tables.drop([0],inplace = True,axis = 0)

        #rest the index again
        tables.reset_index(inplace=True, drop=True)

        if len(tables) == 0:
            yield (f"No result found for {i_virus}")
        else:
            yield tables

[docs]def h_search(h_query):
    """

    Search for virus hosts by host scientific name

    Args:

        v_query (list): list of host(s) scientific name

    Returns:

        A generator object for Data frame(s) contains ( virus (species) name, virus lineage, host name, and host lineage )

    Raises:

        TypeError if argument (h_query) is not a list

      
    """
    #rais error when the arg. is not a list
    if type (h_query) != list:
        raise TypeError ("h_search takes list as an argument")

    for i_host in h_query:
        i_host = i_host.strip()
        i_host = i_host.capitalize()
      
        # creat the url
        sub_url = main_url + "view/?"
        dict_url = {"host_scientific_name" : i_host}
        api_request =  urllib.parse.urlencode(dict_url)
        api_request = sub_url + api_request

        #get the table results
        tables = pd.read_html(api_request)

        tables = tables[1]

        #correct the order
        tables = tables.iloc[::-1]

        #rest the index
        tables.reset_index(inplace=True, drop=True)

        #get the total result to view all of them 
        total = tables[1][1].split(" ")
        total = total[1]

        #reload the url to view all the result

        if len( set(tables[0]) ) - 2 != int(total):
            api_request = api_request + "&per_page=" + str(total)
            tables = pd.read_html(api_request)

            #get the table results
            tables = tables[1]

            #correct the order
            tables = tables.iloc[::-1]

            #rest the index
            tables.reset_index(inplace=True, drop=True)
        else:
            pass


        #drop unnecessarily row
        tables.drop([1],inplace = True,axis = 0)
        
        tables.drop([4,5,6],inplace = True,axis = 1)

        #firt row as header
        tables.reset_index(inplace=True, drop=True)
        tables.rename(columns=tables.iloc[0],inplace=True)
        tables.drop([0],inplace = True,axis = 0)


        #drop rows with no ( host name ) in host lineage 
        #get the fist name of the host name if it two parts
        host_first_name = i_host.split(" ")
        host_first_name = host_first_name[0]
        tables = tables[ tables["Host name"].str.contains(i_host)  | tables["Host lineage"].str.contains(i_host) |  tables["Host lineage"].str.contains(host_first_name) ] 

        #rest the index again
        tables.reset_index(inplace=True, drop=True)

        if len(tables) == 0:
            yield (f"No result found for {i_host}")
        else:
            yield tables


 
[docs]def v_tax_search(tax_query):
    """

    Search for virus hosts by virus TAX id

    Args:

        tax_query (list): list of virus(s) TAX id

    Returns:

        A generator object for Data frame(s) contains ( virus (species) name, virus lineage, host name, and host lineage )

    Raises:

        TypeError if argument (tax_query) is not a list

      
    """
    #rais error when the arg. is not a list
    if type (tax_query) != list:
        raise TypeError ("h_search takes list as an argument")

    all_virus_name = []
    for tax_id in tax_query:
        tax_id = str(tax_id)
        tax_id = tax_id.strip()
      
        # creat the url
        api_request = main_url + tax_id
       
        #get the table results
        tables = pd.read_html(api_request)
        tables = tables[2]

        #get the virus name
        all_virus_name.append(tables[0][0])

    return v_search(all_virus_name)


[docs]def h_tax_search(tax_query):
    """

    Search for virus hosts by host tax id

    Args:

        tax_query (list): list of host(s) tax id

    Returns:

        A generator object for Data frame(s) contains ( virus (species) name, virus lineage, host name, and host lineage )

    Raises:

        TypeError if argument (tax_query) is not a list

    """
    #rais error when the arg. is not a list
    if type (tax_query) != list:
        raise TypeError ("h_search takes list as an argument")

    all_host_name = []
    for tax_id in tax_query:
        tax_id = str(tax_id)
        tax_id = tax_id.strip()
      
        # creat the url
        api_request = main_url + tax_id
       
        #get the table results
        tables = pd.read_html(api_request)
        tables = tables[2]

        #get the virus name
        all_host_name.append(tables[0][0])

    return h_search(all_host_name)


[docs]def comp_query(host, virus_lineage):
    """

    Advanced search for virus hosts using virus lineage and a specific host

    Args:

        host (str): target host name or tax id

        virus_lineage (str): virus lineage

    Returns:

        A generator object for Data frame(s) contains ( virus (species) name, virus lineage, host name, and host lineage )

    Raises:

        TypeError if argument (host) is not a str

        TypeError if argument (virus_lineage) is not a str
      
    """
    #rais error when the arg. is not a list
    if type (host) != str or type(virus_lineage) != str:
        raise TypeError ("comp_query takes arguments as str")

    # creat the url
    host = host.strip()
    host = host.capitalize()
    virus_lineage = virus_lineage.strip()
    virus_lineage = virus_lineage.capitalize()
    

    sub_url = main_url + "view/?"

    dict_url = {"host_scientific_name" : host, "virus_lineage" : virus_lineage }
    api_request =  urllib.parse.urlencode(dict_url)
    api_request = sub_url + api_request

    #get the table results
    tables = pd.read_html(api_request)

    tables = tables[1]

    #correct the order
    tables = tables.iloc[::-1]

    #rest the index
    tables.reset_index(inplace=True, drop=True)

    #get the total result to view all of them 
    total = tables[1][1].split(" ")
    total = total[1]
    
     #reload the url to view all the result

    if len( set(tables[0]) ) - 2 != int(total):
        api_request = api_request + "&per_page=" + str(total)
        tables = pd.read_html(api_request)

        #get the table results
        tables = tables[1]

        #correct the order
        tables = tables.iloc[::-1]

        #rest the index
        tables.reset_index(inplace=True, drop=True)
    else:
        pass


    #drop unnecessarily row
    tables.drop([1],inplace = True,axis = 0)
    tables.drop([4,5,6],inplace = True,axis = 1)

    #firt row as header
    tables.reset_index(inplace=True, drop=True)
    tables.rename(columns=tables.iloc[0],inplace=True)
    tables.drop([0],inplace = True,axis = 0)


    #drop rows with no ( host name ) in host lineage 
    #get the fist name of the host name if it two parts
    host_first_name = host.split(" ")
    host_first_name = host_first_name[0]
    tables = tables[ tables["Host name"].str.contains(host)  | tables["Host lineage"].str.contains(host) |  tables["Host lineage"].str.contains(host_first_name) ] 
    
    #rest the index again
    tables.reset_index(inplace=True, drop=True)

    if len(tables) == 0:
        yield (f"No result found for {host} and {virus_lineage}")
    else:
        yield tables

[docs]def more_info(v_query, email, host_info= False):

    """
    Get more information about the virus or its host

    Args:

        v_query (str or int): target virus name or tax id 

        email (str): your email address, required by the NCBI server

        host_info (bool): default = False

    Returns:

        A dataframe contains information about a virus (only) as genome type, if host_info = False (default) 

        A generator object for Data frame(s) contains infromation about hosts (only), if host_info = True 

    Raises:

        TypeError if argument (v_query) is not a str

        TypeError if argument (email) is not a str
      
    """
    if type(v_query) != str:
        raise TypeError ("more_info takes v_query argument as str")

    if type(email) != str:
        raise TypeError ("more_info takes email argument as str")

    #see of it tax id or name
    try:
        v_query = int(v_query) #if true so it is tax id
        tax_id = str (v_query)
    except:
        v_query = v_query.strip()
        v_query = v_query.capitalize()
        #get the tax id from the virus name
        Entrez.email = email # Always tell NCBI who you are
        handle = Entrez.esearch(db="taxonomy",term = v_query )
        record = Entrez.read(handle)
        name = record["TranslationStack"]
        name = name[0]["Term"]
        name = name[:name.find("[All Names]")] 
        if name.strip() == v_query:
            tax_id = str ( (record["IdList"][0]) )

    #creat the url 
    api_request = main_url + str(tax_id)

    tables = pd.read_html(api_request)


    if host_info == False:
       yield tables[3] #the virus dataframe

    elif host_info == True:
        for i in range(len(tables))[4:]:
            yield tables[i]