Web Scraping Text and Images with BeautifulSoup: An Example
The code in this notebook comes from the app found here: https://github.com/kenichinakanishi/houseplant_classifier/
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
def getHTMLContent(link):
    html = urlopen(link)
    soup = BeautifulSoup(html, 'html.parser')
    return soup
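The helper above isn't used for the ASPCA pages (which is presumably why the Request calls below set a browser User-Agent), but a quick hypothetical call, with a placeholder URL, shows how it would be used:
# Hypothetical usage of getHTMLContent; the URL below is just a placeholder
example_soup = getHTMLContent('https://example.com')
print(example_soup.title)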
req = Request('https://www.aspca.org/pet-care/animal-poison-control/cats-plant-list', headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
# Soupify the webpage
soup = BeautifulSoup(webpage, 'lxml')
# Search the parse tree for the spans that hold the plant entries;
# the [7:-4] slice trims spans before and after the actual list content
content_list = soup.find_all('span')[7:-4]
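The [7:-4] slice is tied to this particular page layout, so it is worth eyeballing the spans before trusting it; a quick optional check:
# Optional sanity check: inspect the first few spans to confirm where the plant entries start
all_spans = soup.find_all('span')
print(len(all_spans))
for span in all_spans[:10]:
    print(span.get_text(strip=True))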
# Put it in a dataframe for further processing
df_cats = pd.DataFrame(content_list)
# Clean up the strings: pull the plain text out of the span markup (the split/slice offsets are specific to this page's HTML)
df_cats[0] = df_cats[0].apply(lambda x: str(x).split('>')[1][:-3])
df_cats[4] = df_cats[4].apply(lambda x: str(x).split('>')[1][:-3])
df_cats[1] = df_cats[1].apply(lambda x: str(x).split('(')[1][0:-4])
# Get rid of useless columns and rename the columns
df_cats = df_cats.drop(columns=[2,3,5,6]).rename(columns = {0:'Name',1:'Alternative Names',4:'Scientific Name',7:'Family'})
# Separate toxic and non-toxic plants: the page lists toxic plants alphabetically, then restarts the alphabet for non-toxic plants,
# so the first name beginning with 'A' after the toxic block (index > 100) marks where the non-toxic list starts
df_cats['Toxic to Cats'] = True
first_nontoxic_cats = [index for index in df_cats[df_cats['Name'].str.startswith('A')].index if index>100][0]
df_cats.loc[first_nontoxic_cats:,'Toxic to Cats'] = False
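A quick look at the rows around the detected boundary confirms that the alphabet restarts where the non-toxic list begins:
# Sanity check: names just before and after the toxic/non-toxic boundary
print(first_nontoxic_cats)
df_cats.loc[first_nontoxic_cats - 3:first_nontoxic_cats + 3, ['Name', 'Toxic to Cats']]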
df_cats
req = Request('https://www.aspca.org/pet-care/animal-poison-control/dogs-plant-list', headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
soup = BeautifulSoup(webpage, 'lxml') # soupify the webpage
content_list = soup.find_all('span')[7:-4] # Get all the content from the table
df_dogs = pd.DataFrame(content_list) # Put it in a dataframe for processing
# Clean up the strings
df_dogs[0] = df_dogs[0].apply(lambda x: str(x).split('>')[1][:-3])
df_dogs[4] = df_dogs[4].apply(lambda x: str(x).split('>')[1][:-3])
df_dogs[1] = df_dogs[1].apply(lambda x: str(x).split('(')[1][0:-4])
# Get rid of useless columns and rename the columns
df_dogs = df_dogs.drop(columns=[2,3,5,6]).rename(columns = {0:'Name',1:'Alternative Names',4:'Scientific Name',7:'Family'})
# Separate toxic and non-toxic plants (same alphabetical-restart logic as for cats)
df_dogs['Toxic to Dogs'] = True
first_nontoxic_dogs = [index for index in df_dogs[df_dogs['Name'].str.startswith('A')].index if index>100][0]
df_dogs.loc[first_nontoxic_dogs:,'Toxic to Dogs'] = False
# Merge dataframes into one, outer merge used to retain values that only exist on one side
df_catsdogs = df_dogs.merge(df_cats, how='outer', on=['Name','Alternative Names','Scientific Name','Family'])
df_catsdogs = df_catsdogs.fillna('Unknown')
aspca_df = df_catsdogs.copy()
# Assume same toxicity for dogs and cats if unknown
aspca_df['Toxic to Cats'] = aspca_df.apply(lambda x: x['Toxic to Dogs'] if (x['Toxic to Cats'] == 'Unknown') else x['Toxic to Cats'], axis=1)
aspca_df['Toxic to Dogs'] = aspca_df.apply(lambda x: x['Toxic to Cats'] if (x['Toxic to Dogs'] == 'Unknown') else x['Toxic to Dogs'], axis=1)
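To see what the outer merge plus the "assume same toxicity" step does, here is a toy example with made-up plants: one appears only in the dog list, one only in the cat list, and each ends up with the other species' flag copied over.
# Toy illustration (made-up data): outer merge keeps rows that exist on only one side,
# fillna marks the missing flag as 'Unknown', and the apply step copies the known flag across species
toy_dogs = pd.DataFrame({'Name': ['Plant A', 'Plant B'], 'Toxic to Dogs': [True, False]})
toy_cats = pd.DataFrame({'Name': ['Plant B', 'Plant C'], 'Toxic to Cats': [False, True]})
toy = toy_dogs.merge(toy_cats, how='outer', on='Name').fillna('Unknown')
toy['Toxic to Cats'] = toy.apply(lambda x: x['Toxic to Dogs'] if x['Toxic to Cats'] == 'Unknown' else x['Toxic to Cats'], axis=1)
toy['Toxic to Dogs'] = toy.apply(lambda x: x['Toxic to Cats'] if x['Toxic to Dogs'] == 'Unknown' else x['Toxic to Dogs'], axis=1)
toy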
aspca_df.sample(10)
aspca_df = aspca_df.drop_duplicates('Scientific Name') # Get rid of duplicates
aspca_df = aspca_df.reset_index(drop=True).sort_index() # Reset and sort index
aspca_df = aspca_df.drop(aspca_df[aspca_df['Scientific Name'].isin(['','NONE LISTED'])].index,axis=0).reset_index(drop=True).sort_index() # Drop entries whose scientific name is blank or 'NONE LISTED'
# Ensure consistent capitalization for each scientific name
def normalize_capitalization(x):
    first_word, rest = x.split()[0], x.split()[1:]
    first_word = [first_word.capitalize()]
    rest = [word.lower() for word in rest]
    return ' '.join(first_word+rest)
# Collapse repeated species listed under different names by dropping trailing 'sp.'/'spp.' designators
# (note: the '' join concatenates any remaining words; the few names this mangles are handled by the manual fix_name calls later)
def species_normalizer(word):
    if word.split()[-1] in ['sp','species','spp','sp.','spp.']:
        word = ''.join(word.split()[:-1])
    return word
# Remove 'cv' from names, as it is an outdated way of referring to cultivars
def cv_remover(word):
    if ' cv ' in word:
        word = word.replace(' cv ',' ')
    return word
# Remove 'var.' from names
def var_remover(word):
    if ' var. ' in word:
        word = word.replace(' var. ',' ')
    return word
# Apply each of the functions
aspca_df['Scientific Name'] = aspca_df['Scientific Name'].apply(normalize_capitalization)
aspca_df['Scientific Name'] = aspca_df['Scientific Name'].apply(species_normalizer)
aspca_df['Scientific Name'] = aspca_df['Scientific Name'].apply(cv_remover)
aspca_df['Scientific Name'] = aspca_df['Scientific Name'].apply(var_remover)
# Remove special characters
aspca_df['Scientific Name'] = aspca_df['Scientific Name'].apply(lambda x: ''.join([character for character in x if character.isalnum() or character.isspace()]))
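As a quick check of the cleaning chain, here is what the helpers do to a couple of deliberately messy, made-up names, applied in the same order as above:
# Demonstration on made-up raw names (not from the dataset)
for messy in ['DRACAENA Marginata cv Tricolor', 'Neoregelia sp.']:
    cleaned = var_remover(cv_remover(species_normalizer(normalize_capitalization(messy))))
    cleaned = ''.join([character for character in cleaned if character.isalnum() or character.isspace()])
    print(f'{messy!r} -> {cleaned!r}')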
# Reset dataframe for further processing
aspca_df = aspca_df.sort_values('Scientific Name').drop_duplicates('Scientific Name')
aspca_df = aspca_df.reset_index(drop=True).sort_index()
aspca_df.sample(10)
# Load the World Flora Online (WFO) classification backbone
use_cols = ['scientificName','taxonRank','family','genus','taxonomicStatus','taxonID', 'acceptedNameUsageID']
wfo_df = pd.read_csv('../classification.txt', sep='\t', lineterminator='\n', usecols=use_cols)
wfo_df = wfo_df.sort_values('taxonomicStatus')
wfo_df.sample(10)
# Don't need this column, we trust the WFO database more
aspca_df.drop('Family', axis=1, inplace=True)
# Merge dataframes together to get trusted info
aspca_df = aspca_df.merge(wfo_df, how = 'left', left_on = ['Scientific Name'], right_on = ['scientificName'])
# Sort by taxonomicStatus and drop duplicates, keeping the first occurrence so accepted names take priority
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first').reset_index(drop=True)
# Fill NaN's with Unknown
aspca_df = aspca_df.fillna('Unknown')
# Clean up scientific names that remain unknown due to misspellings or other mismatches
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first').reset_index(drop=True)
unknown_idx = aspca_df[aspca_df.taxonomicStatus == 'Unknown'].index
print(len(unknown_idx))
def get_closest_name(unknown_name, name_df=wfo_df, name_col='scientificName', threshold=0.9, verbose=False):
    """ Matches an 'unknown_name' against accepted names in a 'name_df'. Returns the closest name that is above a 'threshold' of closeness.

    Parameters
    ----------
    unknown_name: str
        Name we want to match against accepted names.
    name_df: DataFrame
        DataFrame containing accepted names.
    name_col: str, name of name_df column
        DataFrame column containing accepted names.
    threshold: float
        How closely the unknown_name needs to match an accepted name.
        If above this threshold, the name is added to a dictionary of possible names.
    verbose: bool
        Whether to print the entire dictionary of possible names.

    Returns
    ----------
    str
        Closest name to 'unknown_name' that was above the given 'threshold', or '' if none was found.
    """
    import operator
    from difflib import SequenceMatcher

    def similar(a, b):
        return SequenceMatcher(None, a, b).ratio()

    poss_names = {}
    # Only look through entries with the same first letter to save time
    for true_sciname in name_df[name_df[name_col].str.startswith(unknown_name[0])][name_col].values:
        similar_score = similar(unknown_name, true_sciname)
        if similar_score > threshold:
            poss_names[true_sciname] = similar_score
    if verbose:
        print(poss_names)
    # If the dict is empty, no candidate cleared the threshold
    if not poss_names:
        print(f'No names close enough to {unknown_name}.')
        return ''
    else:
        best_name, best_score = max(poss_names.items(), key=operator.itemgetter(1))
        print(f'{unknown_name} is closest to {best_name}, with a score of {best_score:.2f}')
        return best_name
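A hypothetical call with a made-up misspelling shows the kind of match the function reports:
# Hypothetical example (made-up misspelling, not necessarily in the dataset)
get_closest_name('Dracena marginata')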
def fix_name(unknown_name, true_name):
    """ Fixes the aspca_df entries according to the accepted wfo_df entry.

    Parameters
    ----------
    unknown_name: str
        Name we want to fix.
    true_name: str
        Accepted name to use.
    """
    # Get the rows we're looking to change
    unknown_data = aspca_df[aspca_df['Scientific Name'] == unknown_name]
    # Grab accepted data from the wfo database based on a name lookup
    true_data = wfo_df[wfo_df['scientificName'] == true_name]
    true_sciname = true_data.loc[:,'scientificName'].values[0]
    true_family = true_data.loc[:,'family'].values[0]
    true_genus = true_data.loc[:,'genus'].values[0]
    true_taxonomicStatus = true_data.loc[:,'taxonomicStatus'].values[0]
    # Change scientific name, family, genus and taxonomic status to the accepted versions
    aspca_df.iloc[unknown_data.index,2] = true_sciname
    aspca_df.iloc[unknown_data.index,8] = true_family
    aspca_df.iloc[unknown_data.index,9] = true_genus
    aspca_df.iloc[unknown_data.index,10] = true_taxonomicStatus
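The positional iloc writes above (columns 2, 8, 9 and 10) depend on the exact column order at this point in the notebook; a label-based variant, shown here only as a sketch and not used below, would be more defensive:
# Sketch of a label-based alternative to the positional writes in fix_name
# (assumes the lowercase 'family'/'genus'/'taxonomicStatus' columns from the WFO merge are present)
def fix_name_by_label(unknown_name, true_name):
    mask = aspca_df['Scientific Name'] == unknown_name
    true_data = wfo_df[wfo_df['scientificName'] == true_name].iloc[0]
    aspca_df.loc[mask, 'Scientific Name'] = true_data['scientificName']
    aspca_df.loc[mask, 'family'] = true_data['family']
    aspca_df.loc[mask, 'genus'] = true_data['genus']
    aspca_df.loc[mask, 'taxonomicStatus'] = true_data['taxonomicStatus']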
unknown_idx = aspca_df[aspca_df.taxonomicStatus == 'Unknown'].index
print(f'{len(unknown_idx)} plants currently cannot be matched.')
from tqdm.notebook import tqdm
for i in tqdm(unknown_idx):
    unknown_name = aspca_df.iloc[i,2]
    closest_name = get_closest_name(unknown_name)
    if closest_name == '':
        continue
    fix_name(unknown_name, closest_name)
# Scientific names that don't match anything on record automatically
unknown_df = aspca_df[aspca_df.taxonomicStatus == 'Unknown']
# Synonyms that don't have a database link to the accepted name
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first').reset_index(drop=True)
unknown_ids = aspca_df[(aspca_df.acceptedNameUsageID == 'Unknown') & (aspca_df.taxonomicStatus == 'Synonym')]
len(unknown_ids) + len(unknown_df)
# Manually fix some scientific names that don't match anything on record automatically
fix_name('Nephrolepsis cordifolia plumosa', 'Nephrolepis cordifolia')
fix_name('Nephrolepsis cordifolia duffii', 'Nephrolepis cordifolia')
fix_name('Nephrolepis exalta bostoniensis', 'Nephrolepis exaltata')
fix_name('Neoregalia', 'Neoregelia')
fix_name('Miltonia roezlii alba', 'Miltonia roezlii')
fix_name('Maranta insignis', 'Calathea insignis')
fix_name('Lilium orientalis', 'Lilium japonicum')
fix_name('Lampranthus piquet', 'Lampranthus piquetbergensis')
fix_name('Hoya carnosa krinkle kurl', 'Hoya carnosa')
fix_name('Hemigraphis exotica', 'Hemigraphis alternata')
fix_name('Lilium asiatica', 'Lilium japonicum')
fix_name('Nolina tuberculata', 'Beaucarnea recurvata')
fix_name('Giant dracaena', 'Cordyline australis')
fix_name('Scindapsusphilodendron', 'Philodendron scandens')
fix_name('Schefflera or brassia actinoplylla', 'Schefflera actinophylla')
fix_name('Phoenix robellinii', 'Phoenix roebelenii')
fix_name('Peperomia serpens variegata', 'Peperomia serpens')
fix_name('Bertolonia mosaica', 'Fittonia albivenis')
fix_name('Begonia semperflorens cultivar', 'Begonia semperflorens')
fix_name('Begonia rex peace', 'Begonia rex')
fix_name('Asparagus densiflorus sprengeri', 'Asparagus densiflorus')
fix_name('Albiflora', 'Tradescantia zebrina')
fix_name('Acantha', 'Acanthus')
fix_name('Episcia cultivar', 'Episcia')
fix_name('Echevaria', 'Echeveria')
fix_name('Echeveria puloliver', 'Echeveria harmsii')
fix_name('Dypsis lutescens chrysalidocarpus lutescens alternate scientific name', 'Dypsis lutescens')
fix_name('Draceana', 'Dracaena')
fix_name('Daucus carota sativa', 'Daucus carota')
fix_name('Ceratostigma larpentiae', 'Ceratostigma plumbaginoides')
fix_name('Cycasrevolutazamia', 'Cycas revoluta')
fix_name('Cucurbita maxima turbaniformis', 'Cucurbita maxima')
fix_name('Cucurbita maxima hubbard', 'Cucurbita maxima')
fix_name('Cucurbita maxima butternut', 'Cucurbita maxima')
fix_name('Cucurbita maxima banana', 'Cucurbita maxima')
fix_name('Cucurbita maxima buttercup', 'Cucurbita maxima')
fix_name('Cucurbia pepo zucchini', 'Cucurbita pepo')
fix_name('Cryptanthus bivattus minor', 'Cryptanthus bivittatus')
fix_name('Cycasandzamia', 'Cycas')
# Manually match up synonyms that don't have a database link to the accepted name
fix_name('Chlorophytum bichetii', 'Chlorophytum laxum')
fix_name('Rhapis flabelliformis', 'Rhapis excelsa')
fix_name('Cleome hassleriana', 'Cleome spinosa')
fix_name('Pellionia pulchra', 'Pellionia repens')
fix_name('Cissus discolor', 'Cissus javana')
fix_name('Miltonia roezlii', 'Miltoniopsis roezlii')
fix_name('Sorghum vulgare var. sudanense', 'Sorghum bicolor')
fix_name('Camellia japonica var. japonica', 'Camellia japonica')
fix_name('Onychium japonicum', 'Onychium japonicum')
fix_name('Epidendrum atropurpureum', 'Psychilis atropurpurea')
fix_name('Philodendron scandens', 'Philodendron hederaceum')
fix_name('Origanum vulgare var. hirtum', 'Origanum vulgare subsp. hirtum')
fix_name('Guzmania lingulata var. minor', 'Guzmania lingulata var. concolor')
fix_name('Lavandula angustifolia', 'Lavandula angustifolia')
fix_name('Begonia semperflorens', 'Begonia cucullata')
fix_name('Calathea insignis', 'Calathea crotalifera')
fix_name('Citrus ×limonia', 'Citrus limon')
fix_name('Coleus amboinicus', 'Plectranthus amboinicus')
fix_name('Rhipsalis cassytha', 'Rhipsalis dichotoma')
fix_name('Lycopersicon', 'Solanum lycopersicum')
fix_name('Lachenalia lilacina', 'Iris domestica')
fix_name('Cymopterus watsonii', 'Cymopterus terebinthinus')
# Re-check after the manual fixes: scientific names that still don't match anything on record
unknown_df = aspca_df[aspca_df.taxonomicStatus == 'Unknown']
# Synonyms that still don't have a database link to the accepted name
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first').reset_index(drop=True)
unknown_ids = aspca_df[(aspca_df.acceptedNameUsageID == 'Unknown') & (aspca_df.taxonomicStatus == 'Synonym')]
len(unknown_ids) + len(unknown_df)
synonym_idx = aspca_df[aspca_df['taxonomicStatus'].values == 'Synonym'].index
print(f'{len(synonym_idx)} entries are synonyms of an accepted name')
# Work to update the remaining scientific names that are synonyms for their accepted scientific names
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first').reset_index(drop=True)
synonym_idx = aspca_df[aspca_df['taxonomicStatus'].values == 'Synonym'].index
for i in synonym_idx:
    # Get the series we're looking to change
    synonym_data = aspca_df.iloc[i,:]
    synonym_name = synonym_data.loc['Scientific Name']
    # Grab accepted data from wfo database based on ID lookup
    true_data = wfo_df[wfo_df['taxonID'] == synonym_data.loc['acceptedNameUsageID']]
    true_sciname = true_data.loc[:,'scientificName'].values[0]
    fix_name(synonym_name, true_sciname)
synonym_idx = aspca_df[aspca_df['taxonomicStatus'].values == 'Synonym'].index
print(f'{len(synonym_idx)} entries remain synonyms of an accepted name')
# Sort and drop again
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first')
aspca_df = aspca_df.sort_values('Scientific Name').reset_index(drop=True).sort_index()
# Set genus of one-word names to be the name, rather than NaN
aspca_df.loc[aspca_df.fillna('Unknown')['genus']=='Unknown', 'genus'] = aspca_df.loc[aspca_df.fillna('Unknown')['genus']=='Unknown', 'Scientific Name']
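A toy version of the same backfill pattern, on made-up data, makes the intent clearer:
# Toy illustration (made-up data) of the genus backfill used above
toy = pd.DataFrame({'Scientific Name': ['Acanthus', 'Acanthus mollis'], 'genus': [np.nan, 'Acanthus']})
toy.loc[toy.fillna('Unknown')['genus'] == 'Unknown', 'genus'] = toy.loc[toy.fillna('Unknown')['genus'] == 'Unknown', 'Scientific Name']
toy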
# Drop columns we no longer need
aspca_df = aspca_df.drop(['taxonID', 'scientificName', 'taxonomicStatus', 'acceptedNameUsageID', 'taxonRank'], axis=1)
# Standardize column names
aspca_df.rename(columns = {'genus':'Genus', 'family':'Family'}, inplace=True)
# Reorder columns
cols = ['Name', 'Scientific Name', 'Genus', 'Family', 'Alternative Names', 'Toxic to Dogs', 'Toxic to Cats']
aspca_df = aspca_df[cols]
aspca_df.to_csv('Plant Toxicity - v6.csv')
aspca_df.sample(10)
aspca_df.head()
aspca_df[aspca_df['Toxic to Dogs'] != aspca_df['Toxic to Cats']]
aspca_df[['Family','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Family').sort_values(by='Toxic to Dogs')[70:80]
# How many Families have mixed toxicity
family_pivot = aspca_df[['Family','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Family').sort_values(by='Toxic to Dogs')
len(family_pivot[family_pivot['Toxic to Dogs'].apply(lambda x: 0<x<1)])
# How many Families
len(aspca_df['Family'].unique())
aspca_df[['Genus','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Genus').sort_values(by='Toxic to Dogs')[208:218]
# How many genera have mixed toxicity
genus_pivot = aspca_df[['Genus','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Genus').sort_values(by='Toxic to Dogs')
len(genus_pivot[genus_pivot['Toxic to Dogs'].apply(lambda x: 0<x<1)])
# How many genera in total
len(genus_pivot)
# If running in Colab
!pip install selenium -q
!apt-get update # update Ubuntu's package lists so the apt install below works correctly
!apt install chromium-chromedriver -q
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
# Import and set up the Selenium webdriver
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('chromedriver', options=chrome_options)
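From here the driver is used to scrape plant images; a minimal sketch of pulling image URLs from a rendered page, with a placeholder URL rather than the notebook's actual image source:
# Minimal sketch: load a page with the headless driver and collect image URLs (placeholder URL)
from selenium.webdriver.common.by import By
wd.get('https://example.com')
image_urls = [img.get_attribute('src') for img in wd.find_elements(By.TAG_NAME, 'img')]
print(len(image_urls))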