This notebook code is from the app found here: https://github.com/kenichinakanishi/houseplant_classifier/

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

def getHTMLContent(link):
    """Fetch ``link`` and return its parsed HTML.

    Parameters
    ----------
    link: str or urllib.request.Request
        URL (or prepared request) handed straight to ``urlopen``.

    Returns
    ----------
    BeautifulSoup
        Parse tree built with the ``html.parser`` backend.
    """
    # The context manager closes the HTTP response once the document has
    # been parsed, instead of leaving the connection open.
    with urlopen(link) as html:
        soup = BeautifulSoup(html, 'html.parser')
    return soup
# Request the ASPCA cats plant list, sending a browser-like User-Agent header
req = Request('https://www.aspca.org/pet-care/animal-poison-control/cats-plant-list', headers={'User-Agent': 'Mozilla/5.0'})
# Read the body inside a context manager so the HTTP response gets closed
with urlopen(req) as response:
    webpage = response.read()
# Soupify the webpage
soup = BeautifulSoup(webpage, 'lxml')
# Search through the parse tree to get all the content from the table.
# The first 7 and last 4 <span> elements are dropped (presumably page chrome
# around the plant entries - re-check these offsets if the page layout changes).
content_list = soup.find_all('span')[7:-4]
# Put it in a dataframe for further processing
df_cats = pd.DataFrame(content_list)
/home/gao/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py:305: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
  values = np.array([convert(v) for v in values])
# Clean up the strings
# Columns 0 and 4: keep the text after the first '>' of the stringified cell,
# minus its last three characters
for col in (0, 4):
  df_cats[col] = df_cats[col].apply(lambda cell: str(cell).split('>')[1][:-3])
# Column 1: keep the text after the first '(' minus its last four characters
df_cats[1] = df_cats[1].apply(lambda cell: str(cell).split('(')[1][:-4])
# Get rid of useless columns and rename the columns
cats_column_names = {0:'Name',1:'Alternative Names',4:'Scientific Name',7:'Family'}
df_cats = df_cats.drop(columns=[2,3,5,6]).rename(columns=cats_column_names)
# Separate toxic and non-toxic plants: every row starts out toxic, then all rows
# from the first name beginning with 'A' past row 100 onwards are flagged non-toxic
df_cats['Toxic to Cats'] = True
a_name_rows = df_cats[df_cats['Name'].str.startswith('A')].index
first_nontoxic_cats = [index for index in a_name_rows if index>100][0]
df_cats.loc[first_nontoxic_cats:,'Toxic to Cats'] = False
df_cats
Name Alternative Names Scientific Name Family Toxic to Cats
0 Adam-and-Eve Arum, Lord-and-Ladies, Wake Robin, Starch Root... Arum maculatum Araceae True
1 African Wonder Tree Ricinus communis True
2 Alocasia Elephant's Ear Alocasia spp. Araceae True
3 Aloe Aloe vera Liliaceae True
4 Amaryllis Many, including: Belladonna lily, Saint Joseph... Amaryllis spp. Amaryllidaceae True
... ... ... ... ... ...
980 Yellowrocket Barbarea vulgaris Brassicaceae False
981 Yorba Linda Peperomia rotundifolia Piperaceae False
982 Zebra Haworthia Haworthia fasciata Liliaceae False
983 Zinnia Zinnia species Asteraceae False
984 Zucchini Squash Cucurbia pepo cv zucchini Cucurbitaceae False

985 rows × 5 columns

# Request the ASPCA dogs plant list, sending a browser-like User-Agent header
req = Request('https://www.aspca.org/pet-care/animal-poison-control/dogs-plant-list', headers={'User-Agent': 'Mozilla/5.0'})
# Read the body inside a context manager so the HTTP response gets closed
with urlopen(req) as response:
    webpage = response.read()
soup = BeautifulSoup(webpage, 'lxml')                 # soupify the webpage
# The first 7 and last 4 <span> elements are dropped (presumably page chrome
# around the plant entries - re-check these offsets if the page layout changes).
content_list = soup.find_all('span')[7:-4]            # Get all the content from the table
df_dogs = pd.DataFrame(content_list)                  # Put it in a dataframe for processing
/home/gao/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py:305: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
  values = np.array([convert(v) for v in values])
# Clean up the strings
# Columns 0 and 4: keep the text after the first '>' of the stringified cell,
# minus its last three characters
for col in (0, 4):
  df_dogs[col] = df_dogs[col].apply(lambda cell: str(cell).split('>')[1][:-3])
# Column 1: keep the text after the first '(' minus its last four characters
df_dogs[1] = df_dogs[1].apply(lambda cell: str(cell).split('(')[1][:-4])
# Get rid of useless columns and rename the columns
dogs_column_names = {0:'Name',1:'Alternative Names',4:'Scientific Name',7:'Family'}
df_dogs = df_dogs.drop(columns=[2,3,5,6]).rename(columns=dogs_column_names)
# Separate toxic and non-toxic plants: every row starts out toxic, then all rows
# from the first name beginning with 'A' past row 100 onwards are flagged non-toxic
df_dogs['Toxic to Dogs'] = True
a_name_rows = df_dogs[df_dogs['Name'].str.startswith('A')].index
first_nontoxic_dogs = [index for index in a_name_rows if index>100][0]
df_dogs.loc[first_nontoxic_dogs:,'Toxic to Dogs'] = False
# Merge dataframes into one, outer merge used to retain values that only exist on one side
df_catsdogs = df_dogs.merge(df_cats, how='outer', on=['Name','Alternative Names','Scientific Name','Family'])
# Rows present in only one list have no value for the other animal's flag; mark those 'Unknown'
df_catsdogs = df_catsdogs.fillna('Unknown')
aspca_df = df_catsdogs.copy()
# Assume same toxicity for dogs and cats if unknown
aspca_df['Toxic to Cats'] = aspca_df.apply(lambda x: x['Toxic to Dogs'] if (x['Toxic to Cats'] == 'Unknown') else x['Toxic to Cats'], axis=1)
aspca_df['Toxic to Dogs'] = aspca_df.apply(lambda x: x['Toxic to Cats'] if (x['Toxic to Dogs'] == 'Unknown') else x['Toxic to Dogs'], axis=1)
# (The merge/fill/copy/apply sequence used to be repeated verbatim here; running it
# a second time rebuilt the exact same frame, so the duplicate was removed.)
aspca_df.sample(10)
Name Alternative Names Scientific Name Family Toxic to Dogs Toxic to Cats
810 Pink Splash Flamingo Plant, Polka Dot Plant, Measles Plant... Hypoestes phyllostachya Acanthaceae False False
120 English Ivy Branching Ivy, Glacier Ivy, Needlepoint Ivy, S... Hedera helix Araliaceae True True
564 Crape Myrtle Crepe Myrtle Lagerstroemia indica Lythraceae False False
201 Japanese Yew English Yew, Western Yew, Pacific Yew, Anglo-J... Taxus sp. Taxaceae True True
635 Giant Touch-Me-Not Buzzy Lizzie, Impatience Plant, Patient Lucy, ... Impatiens spp. Balsaminaceae False False
92 Cowbane Water Hemlock, Poison Parsnip Cicuta species Apiaceae True True
277 Ornamental Pepper Natal Cherry, Winter Cherry, Jerusalem Cherry Solanum pseudocapsicum Solanaceae True True
513 Carrot Fern Onychium japonica Polypodiaceae False False
712 Leather Peperomia Peperomia crassifolia Piperaceae False False
493 California Pitcher Plant Cobra Orchid, Cobra Plant, Cobra Lily, Chrysam... Darlingtonia californica Sarraceniaceae False False
# Get rid of duplicates, then reset and sort the index
aspca_df = aspca_df.drop_duplicates('Scientific Name').reset_index(drop=True).sort_index()
# Fix mistakes in database: drop rows whose scientific name is empty or 'NONE LISTED'
placeholder_rows = aspca_df[aspca_df['Scientific Name'].isin(['','NONE LISTED'])].index
aspca_df = aspca_df.drop(placeholder_rows, axis=0).reset_index(drop=True).sort_index()
# Ensure proper capitalization for each scientific name.
def normalize_capitalization(x):
  """Capitalize the first word of ``x`` and lower-case every other word.

  Words are split on whitespace and re-joined with single spaces, so runs of
  whitespace collapse. An empty or whitespace-only string yields '' instead of
  raising IndexError.

  Parameters
  ----------
  x: str
    Scientific name to normalize.

  Returns
  ----------
  str
    Normalized name, e.g. 'ARUM MACULATUM' -> 'Arum maculatum'.
  """
  words = x.split()
  if not words:
    return ''
  first_word, *rest = words
  return ' '.join([first_word.capitalize()] + [word.lower() for word in rest])

# Clean up repeated species that have different names
def species_normalizer(word):
  """Strip a trailing 'sp'/'species'/'spp'/'sp.'/'spp.' token from ``word``.

  NOTE(review): when the suffix is stripped, the remaining words are joined
  WITHOUT spaces ('Cycas revoluta zamia species' -> 'Cycasrevolutazamia').
  The manual fix_name calls later in this notebook use those fused spellings,
  so the behavior is kept as-is here.
  """
  tokens = word.split()
  if tokens[-1] not in ('sp', 'species', 'spp', 'sp.', 'spp.'):
    return word
  return ''.join(tokens[:-1])

# Remove cv from names, as it is an outdated way of referring to cultivars
def cv_remover(word):
  """Return ``word`` with every space-delimited ' cv ' collapsed to one space."""
  # str.replace leaves the string untouched when ' cv ' is absent
  return word.replace(' cv ', ' ')

# Remove var. from names
def var_remover(word):
  """Return ``word`` with every space-delimited ' var. ' collapsed to one space."""
  # str.replace leaves the string untouched when ' var. ' is absent
  return word.replace(' var. ', ' ')

# Apply each of the functions, in this order
for normalizer in (normalize_capitalization, species_normalizer, cv_remover, var_remover):
  aspca_df['Scientific Name'] = aspca_df['Scientific Name'].apply(normalizer)

# Remove special characters (only alphanumerics and whitespace survive)
aspca_df['Scientific Name'] = aspca_df['Scientific Name'].apply(
    lambda name: ''.join(ch for ch in name if ch.isalnum() or ch.isspace())
)

# Reset dataframe for further processing
aspca_df = (
    aspca_df.sort_values('Scientific Name')
    .drop_duplicates('Scientific Name')
    .reset_index(drop=True)
    .sort_index()
)
aspca_df.sample(10)
Name Alternative Names Scientific Name Family Toxic to Dogs Toxic to Cats
108 American Bittersweet Bittersweet, Waxwork, Shrubby Bittersweet, Fal... Celastrus scandens Celastraceae True True
530 Pacific Yew English Yew, Western Yew, Japanese Yew, Anglo-... Taxus brevifolia Taxaceae True True
467 Pie Plant Rhubarb Rheum rhabarbarium Polygonaceae True True
164 Pheasant Plant Zebra Plant Cryptanthus zonatus Bromeliaceae False False
452 Primrose Primula vulgaris Primulaceae True True
506 Jackson Brier Smilax lanceolata Liliaceae False False
407 Ivy Peperomia Plantinum Peperomia, Silver leaf Peperomia, Iv... Peperomia griseoargentea Piperaceae False False
147 Poison Hemlock Poison Parsley, Spotted Hemlock, Winter Fern, ... Conium maculatum Umbelliferae True True
351 Cardinal Flower Lobelia, Indian Pink Lobelia cardinalis Campanulaceae True True
236 Pink Brocade Episcia cultivar Gesneriaceae False False
# Columns of the tab-separated WFO classification file that are used below
use_cols = ['scientificName','taxonRank','family','genus','taxonomicStatus','taxonID', 'acceptedNameUsageID']
# Load only those columns and order the rows by taxonomicStatus
wfo_df = pd.read_csv(
    '../classification.txt',
    sep='\t',
    lineterminator='\n',
    usecols=use_cols,
).sort_values('taxonomicStatus')
/home/gao/.local/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3146: DtypeWarning: Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
# Show 10 random rows of the WFO table as a sanity check
wfo_df.sample(10)
taxonID scientificName taxonRank family genus taxonomicStatus acceptedNameUsageID
796160 wfo-0000798814 Peridium oblongifolium SPECIES Peraceae Peridium Synonym wfo-0000267144
180708 wfo-0000180970 Cracca smallii SPECIES Fabaceae Cracca Synonym wfo-0000178756
911945 wfo-0000914633 Thinopyrum turcicum SPECIES Poaceae Thinopyrum Synonym wfo-0000866236
167159 wfo-0000167369 Indigofera cinerea SPECIES Fabaceae Indigofera Synonym wfo-0000173646
642316 wfo-0000644639 Diaphanoptera khorasanica SPECIES Caryophyllaceae Diaphanoptera Accepted NaN
464965 wfo-0000466716 Phyllocyclus minutiflorus SPECIES Gentianaceae Phyllocyclus Doubtful NaN
740337 wfo-0000742945 Daphne pseudomezereum var. koreana VARIETY Thymelaeaceae Daphne Synonym wfo-0000637684
868404 wfo-0000871073 Festuca montis-aurei SPECIES Poaceae Festuca Synonym wfo-0000869683
186218 wfo-0000186502 Lotononis curvicarpa SPECIES Fabaceae Lotononis Accepted NaN
552490 wfo-0000554468 Specklinia casualis SPECIES Orchidaceae Specklinia Synonym wfo-0000339564
# Don't need this column, we trust the WFO database more
aspca_df.drop(columns='Family', inplace=True)
# Merge dataframes together to get trusted info
aspca_df = aspca_df.merge(wfo_df, how='left', left_on='Scientific Name', right_on='scientificName')
# Sort by taxonomicStatus and drop duplicates keeping the first - keeping accepted names as priority
aspca_df = (
    aspca_df.sort_values('taxonomicStatus')
    .drop_duplicates('Scientific Name', keep='first')
    .reset_index(drop=True)
)
# Fill NaN's with Unknown
aspca_df = aspca_df.fillna('Unknown')
# Clean up and deal with scientific names that are unknown, due to misspellings or otherwise.
aspca_df = (
    aspca_df.sort_values('taxonomicStatus')
    .drop_duplicates('Scientific Name', keep='first')
    .reset_index(drop=True)
)
# Rows that found no WFO match carry the 'Unknown' filler in taxonomicStatus
unknown_idx = aspca_df.index[aspca_df.taxonomicStatus == 'Unknown']
print(len(unknown_idx))
101
def get_closest_name(unknown_name, name_df = wfo_df, name_col = 'scientificName', threshold=0.9, verbose=False):
  """ Matches an 'unknown_name' against accepted names in a 'name_df'. Will return the name that is closest above a 'threshold' of closeness.

  Parameters
  ----------
  unknown_name: str
    Name we want to match against accepted names.
  name_df: DataFrame
    DataFrame containing accepted names.
  name_col: str, name of name_df column
    DataFrame column containing accepted names.
  threshold: float
    How closely does the unknown_name need to match with the accepted name
    (difflib.SequenceMatcher ratio, between 0 and 1).
    If strictly above this threshold, the name is added to a dictionary of possible names.
  verbose: bool
    Should the function print the entire dictionary of possible names.

  Returns:
  ----------
  str
    Closest name to 'unknown_name' that was above the given 'threshold',
    or '' when no name is close enough (including an empty 'unknown_name').
  """
  import operator
  from difflib import SequenceMatcher
  # An empty name has no first letter to filter on and cannot match anything
  if not unknown_name:
    print(f'No names close enough to {unknown_name}.')
    return ''
  poss_names = {}
  # Only look through entries with the same first letter to save time.
  # na=False keeps missing names out of the mask instead of breaking the filter.
  same_initial = name_df[name_col].str.startswith(unknown_name[0], na=False)
  for true_sciname in name_df.loc[same_initial, name_col].values:
    similar_score = SequenceMatcher(None, unknown_name, true_sciname).ratio()
    if similar_score>threshold:
      poss_names[true_sciname]=similar_score
  if verbose:
    print(poss_names)
  # If the dict is empty
  if not poss_names:
    print(f'No names close enough to {unknown_name}.')
    return ''
  # Pick the best candidate once; ties keep the first name encountered
  best_name, best_score = max(poss_names.items(), key=operator.itemgetter(1))
  print(f'{unknown_name} is closest to {best_name}, with a score of {best_score:.2f}')
  return best_name
def fix_name(unknown_name, true_name):
  """ Fixes the aspca_df entries according to the accepted wfo_df entry.

  Every aspca_df row whose 'Scientific Name' equals 'unknown_name' gets its
  'Scientific Name', 'family', 'genus' and 'taxonomicStatus' overwritten with
  the values of the first wfo_df row whose 'scientificName' equals 'true_name'.
  aspca_df is modified in place; nothing happens if 'unknown_name' is absent.

  Parameters
  ----------
  unknown_name: str
    Name we want to fix.
  true_name: str
    Accepted name to use.

  Raises
  ----------
  IndexError
    If 'true_name' does not occur in wfo_df['scientificName'].
  """
  # Select the rows we're looking to change. The mask is built before any
  # assignment, so it keeps pointing at the same rows after the rename below.
  rows_to_fix = aspca_df['Scientific Name'] == unknown_name
  # Grab accepted data from wfo database based on name lookup
  true_data = wfo_df[wfo_df['scientificName'] == true_name]
  if true_data.empty:
    raise IndexError(f'{true_name!r} was not found in wfo_df scientificName column')
  accepted = true_data.iloc[0]
  # Change scientific name, family, genus and taxonomic status to accepted versions.
  # Label-based assignment does not depend on the column order or on aspca_df
  # having a freshly reset integer index.
  aspca_df.loc[rows_to_fix, 'Scientific Name'] = accepted['scientificName']
  aspca_df.loc[rows_to_fix, 'family'] = accepted['family']
  aspca_df.loc[rows_to_fix, 'genus'] = accepted['genus']
  aspca_df.loc[rows_to_fix, 'taxonomicStatus'] = accepted['taxonomicStatus']
# Rows that still have no WFO match
unknown_idx = aspca_df.index[aspca_df.taxonomicStatus == 'Unknown']
print(f'{len(unknown_idx)} plants currently cannot be matched.')
from tqdm.notebook import tqdm
# Try to repair each unmatched name with its closest WFO spelling
for row_pos in tqdm(unknown_idx):
  # Column 2 is read as the scientific name
  misspelled_name = aspca_df.iloc[row_pos,2]
  suggestion = get_closest_name(misspelled_name)
  # '' means nothing was close enough; leave the row for manual fixing
  if suggestion != '':
    fix_name(misspelled_name,suggestion)
101 plants currently cannot be matched.
Malus sylvestrus is closest to Malus sylvestris, with a score of 0.94
No names close enough to Maranta insignis.
No names close enough to Miltonia roezlii alba.
No names close enough to Neoregalia.
No names close enough to Nephrolepis exalta bostoniensis.
Nephrolepsis exalta is closest to Nephrolepis exaltata, with a score of 0.92
No names close enough to Nephrolepsis cordifolia duffii.
No names close enough to Lilium orientalis.
No names close enough to Nephrolepsis cordifolia plumosa.
Nephrolepis exalta is closest to Nephrolepis exaltata, with a score of 0.95
No names close enough to Lilium asiatica.
Hosta plataginea is closest to Hosta plantaginea, with a score of 0.97
No names close enough to Lampranthus piquet.
Kalmia poliifolia is closest to Kalmia polifolia, with a score of 0.97
Kalmia augustifolia is closest to Kalmia angustifolia, with a score of 0.95
Jasminium is closest to Jasminum, with a score of 0.94
Hoya publcalyx is closest to Hoya pubicalyx, with a score of 0.93
No names close enough to Hoya carnosa krinkle kurl.
No names close enough to Hemigraphis exotica.
Gynura aurantica is closest to Gynura aurantiaca, with a score of 0.97
No names close enough to Nolina tuberculata.
Guzmania lingulata minor is closest to Guzmania lingulata var. minor, with a score of 0.91
Lavendula angustifolia is closest to Lavandula angustifolia, with a score of 0.95
Onychium japonica is closest to Onychium japonicum, with a score of 0.91
No names close enough to Schefflera or brassia actinoplylla.
Paeonis officinalis is closest to Paeonia officinalis, with a score of 0.95
No names close enough to Giant dracaena.
Taxus canadensus is closest to Taxus canadensis, with a score of 0.94
Stapelia hirsata is closest to Stapelia hirsuta, with a score of 0.94
Sorghum vulgare var sudanesis is closest to Sorghum vulgare var. sudanense, with a score of 0.92
Smilax walteria is closest to Smilax walteri, with a score of 0.97
Secum weinbergii is closest to Sedum weinbergii, with a score of 0.94
No names close enough to Scindapsusphilodendron.
Santpaulia confusa is closest to Saintpaulia confusa, with a score of 0.97
Rhipsalis cassutha is closest to Rhipsalis cassytha, with a score of 0.94
Rheum rhabarbarium is closest to Rheum rhabarbarum, with a score of 0.97
Origanum vulgare hirtum is closest to Origanum vulgare var. hirtum, with a score of 0.90
Tolmeia menziesii is closest to Tolmiea menziesii, with a score of 0.94
Podocarpus macrophylla is closest to Podocarpus macrophyllus, with a score of 0.93
Ploystichum munitum is closest to Polystichum munitum, with a score of 0.95
Plectranthus oetendahlii is closest to Plectranthus oertendahlii, with a score of 0.98
Plantanus occidentalis is closest to Platanus occidentalis, with a score of 0.98
Pilea cadieri is closest to Pilea cadierei, with a score of 0.96
No names close enough to Phoenix robellinii.
No names close enough to Peperomia serpens variegata.
Peperomia prostata is closest to Peperomia prostrata, with a score of 0.97
Peperomia griseoargentea is closest to Peperomia griseoargentia, with a score of 0.96
Pellonia pulchra is closest to Pellionia pulchra, with a score of 0.97
Rhapis flabelliformus is closest to Rhapis flabelliformis, with a score of 0.95
Fuschsia is closest to Fuchsia, with a score of 0.93
No names close enough to Begonia rex peace.
Eriogonium umbellatum is closest to Eriogonum umbellatum, with a score of 0.98
Citrus aurantifolia is closest to Citrus aurantiifolia, with a score of 0.97
Cissus dicolor is closest to Cissus discolor, with a score of 0.97
Chlorophytum bichetti is closest to Chlorophytum bichetii, with a score of 0.95
No names close enough to Ceratostigma larpentiae.
Cattleya trianaei is closest to Cattleya trianae, with a score of 0.97
Camellia japonica thea japonica is closest to Camellia japonica var. japonica, with a score of 0.90
Caesalpinia gilliessi is closest to Caesalpinia gilliesii, with a score of 0.95
Borage officinalis is closest to Borago officinalis, with a score of 0.94
No names close enough to Bertolonia mosaica.
No names close enough to Begonia semperflorens cultivar.
Begonia scharfii is closest to Begonia scharffii, with a score of 0.97
Begonia cleopatra is closest to Begonia cleopatrae, with a score of 0.97
No names close enough to Asparagus densiflorus sprengeri.
Arum palestinum is closest to Arum palaestinum, with a score of 0.97
Anthurium scherzeranum is closest to Anthurium scherzerianum, with a score of 0.98
Anthirrhinum multiflorum is closest to Antirrhinum multiflorum, with a score of 0.98
Anoectuchilus setaceus is closest to Anoectochilus setaceus, with a score of 0.95
Anethum graveolena is closest to Anethum graveolens, with a score of 0.94
No names close enough to Albiflora.
No names close enough to Acantha.
Tradescantia flumeninsis is closest to Tradescantia fluminensis, with a score of 0.92
Citrus aurantium is closest to Citrus ×aurantium, with a score of 0.97
Euonymus atropurpurea is closest to Euonymus atropurpureus, with a score of 0.93
Citrus limonia is closest to Citrus ×limonia, with a score of 0.97
Cleome hasserlana is closest to Cleome hassleriana, with a score of 0.91
Eriogonium inflatum is closest to Eriogonum inflatum, with a score of 0.97
No names close enough to Episcia cultivar.
Epidendrum atropurpeum is closest to Epidendrum atropurpureum, with a score of 0.96
Eleagnus is closest to Elaeagnus, with a score of 0.94
No names close enough to Echeveria puloliver.
Echeveria pulinata is closest to Echeveria pulvinata, with a score of 0.97
No names close enough to Echevaria.
No names close enough to Dypsis lutescens chrysalidocarpus lutescens alternate scientific name.
No names close enough to Draceana.
No names close enough to Daucus carota sativa.
Citrus paradisii is closest to Citrus paradisi, with a score of 0.97
No names close enough to Cycasrevolutazamia.
No names close enough to Cucurbita maxima turbaniformis.
No names close enough to Cucurbita maxima hubbard.
No names close enough to Cucurbita maxima butternut.
No names close enough to Cucurbita maxima buttercup.
No names close enough to Cucurbita maxima banana.
No names close enough to Cucurbia pepo zucchini.
No names close enough to Cryptanthus bivattus minor.
Coleus ampoinicus is closest to Coleus amboinicus, with a score of 0.94
Clivia minata is closest to Clivia miniata, with a score of 0.96
Clintonia umbelluata is closest to Clintonia umbellulata, with a score of 0.98
No names close enough to Cycasandzamia.
Veitchia merillii is closest to Veitchia merrillii, with a score of 0.97

# Scientific names that don't match anything on record automatically
unknown_df = aspca_df[aspca_df.taxonomicStatus == 'Unknown']
# Synonyms that don't have a database link to the accepted name
aspca_df = (
    aspca_df.sort_values('taxonomicStatus')
    .drop_duplicates('Scientific Name', keep='first')
    .reset_index(drop=True)
)
lacks_accepted_id = aspca_df.acceptedNameUsageID == 'Unknown'
is_synonym = aspca_df.taxonomicStatus == 'Synonym'
unknown_ids = aspca_df[lacks_accepted_id & is_synonym]
# Total number of entries that still need manual attention
len(unknown_ids) + len(unknown_df)
52
# Manually fix some scientific names that don't match anything on record automatically
# Each call is fix_name(<name currently in aspca_df>, <name to look up in wfo_df>).
# The order matters: some calls below produce a name that a later call matches on
# (e.g. 'Miltonia roezlii alba' -> 'Miltonia roezlii' -> 'Miltoniopsis roezlii').
fix_name('Nephrolepsis cordifolia plumosa', 'Nephrolepis cordifolia')
fix_name('Nephrolepsis cordifolia duffii', 'Nephrolepis cordifolia')
fix_name('Nephrolepis exalta bostoniensis', 'Nephrolepis exaltata')
fix_name('Neoregalia', 'Neoregelia')
fix_name('Miltonia roezlii alba', 'Miltonia roezlii')
fix_name('Maranta insignis', 'Calathea insignis')
fix_name('Lilium orientalis', 'Lilium japonicum')
fix_name('Lampranthus piquet', 'Lampranthus piquetbergensis')
fix_name('Hoya carnosa krinkle kurl', 'Hoya carnosa')
fix_name('Hemigraphis exotica', 'Hemigraphis alternata')
fix_name('Lilium asiatica', 'Lilium japonicum')
fix_name('Nolina tuberculata', 'Beaucarnea recurvata')
fix_name('Giant dracaena', 'Cordyline australis')
fix_name('Scindapsusphilodendron', 'Philodendron scandens')
fix_name('Schefflera or brassia actinoplylla', 'Schefflera actinophylla')
fix_name('Phoenix robellinii', 'Phoenix roebelenii')
fix_name('Peperomia serpens variegata', 'Peperomia serpens')
fix_name('Bertolonia mosaica', 'Fittonia albivenis')
fix_name('Begonia semperflorens cultivar', 'Begonia semperflorens')
fix_name('Begonia rex peace', 'Begonia rex')
fix_name('Asparagus densiflorus sprengeri', 'Asparagus densiflorus')
fix_name('Albiflora', 'Tradescantia zebrina')
fix_name('Acantha', 'Acanthus')
fix_name('Episcia cultivar', 'Episcia')
fix_name('Echevaria', 'Echeveria')
fix_name('Echeveria puloliver', 'Echeveria harmsii')
fix_name('Dypsis lutescens chrysalidocarpus lutescens alternate scientific name', 'Dypsis lutescens')
fix_name('Draceana', 'Dracaena')
fix_name('Daucus carota sativa', 'Daucus carota')
fix_name('Ceratostigma larpentiae', 'Ceratostigma plumbaginoides')
fix_name('Cycasrevolutazamia', 'Cycas revoluta')
fix_name('Cucurbita maxima turbaniformis', 'Cucurbita maxima')
fix_name('Cucurbita maxima hubbard', 'Cucurbita maxima')
fix_name('Cucurbita maxima butternut', 'Cucurbita maxima')
fix_name('Cucurbita maxima banana', 'Cucurbita maxima')
fix_name('Cucurbita maxima buttercup', 'Cucurbita maxima')
fix_name('Cucurbia pepo zucchini', 'Cucurbita pepo')
fix_name('Cryptanthus bivattus minor', 'Cryptanthus bivittatus')
fix_name('Cycasandzamia', 'Cycas')
# Manually match up synonyms that don't have a database link to the accepted name
# Calls that map a name to itself re-copy family, genus and taxonomic status from
# the first wfo_df row carrying that name.
fix_name('Chlorophytum bichetii', 'Chlorophytum laxum')
fix_name('Rhapis flabelliformis', 'Rhapis excelsa')
fix_name('Cleome hassleriana', 'Cleome spinosa')
fix_name('Pellionia pulchra', 'Pellionia repens')
fix_name('Cissus discolor', 'Cissus javana')
fix_name('Miltonia roezlii', 'Miltoniopsis roezlii')
fix_name('Sorghum vulgare var. sudanense', 'Sorghum bicolor')
fix_name('Camellia japonica var. japonica', 'Camellia japonica')
fix_name('Onychium japonicum', 'Onychium japonicum')
fix_name('Epidendrum atropurpureum', 'Psychilis atropurpurea')
fix_name('Philodendron scandens', 'Philodendron hederaceum')
fix_name('Origanum vulgare var. hirtum', 'Origanum vulgare subsp. hirtum')
fix_name('Guzmania lingulata var. minor', 'Guzmania lingulata var. concolor')
fix_name('Lavandula angustifolia', 'Lavandula angustifolia')
fix_name('Begonia semperflorens', 'Begonia cucullata')
fix_name('Calathea insignis', 'Calathea crotalifera')
fix_name('Citrus ×limonia', 'Citrus limon')
fix_name('Coleus amboinicus', 'Plectranthus amboinicus')
fix_name('Rhipsalis cassytha', 'Rhipsalis dichotoma')
fix_name('Lycopersicon', 'Solanum lycopersicum')
fix_name('Lachenalia lilacina', 'Iris domestica')
fix_name('Cymopterus watsonii', 'Cymopterus terebinthinus')
# Scientific names that don't match anything on record automatically
unknown_df = aspca_df[aspca_df.taxonomicStatus == 'Unknown']
# Synonyms that don't have a database link to the accepted name
aspca_df = (
    aspca_df.sort_values('taxonomicStatus')
    .drop_duplicates('Scientific Name', keep='first')
    .reset_index(drop=True)
)
lacks_accepted_id = aspca_df.acceptedNameUsageID == 'Unknown'
is_synonym = aspca_df.taxonomicStatus == 'Synonym'
unknown_ids = aspca_df[lacks_accepted_id & is_synonym]
# Total number of entries that still need manual attention
len(unknown_ids) + len(unknown_df)
1
# Index of the rows whose WFO status is 'Synonym'
synonym_idx = aspca_df.index[aspca_df['taxonomicStatus'].values == 'Synonym']
print(f'{len(synonym_idx)} entries have a more acceptable synonym')
71 entries have a more acceptable synonym
# Work to update the remaining scientific names that are synonyms for their accepted scientific names
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first').reset_index(drop=True)
synonym_idx = aspca_df[aspca_df['taxonomicStatus'].values == 'Synonym'].index
for i in synonym_idx:
  # Get the series we're looking to change
  synonym_data = aspca_df.iloc[i,:]
  synonym_name = synonym_data.loc['Scientific Name']
  # Grab accepted data from wfo database based on ID lookup
  true_data = wfo_df[wfo_df['taxonID'] == synonym_data.loc['acceptedNameUsageID']]
  # A synonym whose acceptedNameUsageID matches no taxonID (e.g. the 'Unknown'
  # filler) cannot be resolved here. Skip it instead of aborting the whole loop
  # with an IndexError, so the remaining synonyms still get updated.
  if true_data.empty:
    print(f'No accepted name found for {synonym_name}.')
    continue
  # Look the column up by name rather than by position
  true_sciname = true_data['scientificName'].values[0]
  fix_name(synonym_name,true_sciname)
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-36-d42f5603c8fc> in <module>
      5   # Grab accepted data from wfo database based on ID lookup
      6   true_data = wfo_df[wfo_df['taxonID'] == synonym_data.loc['acceptedNameUsageID']]
----> 7   true_sciname = true_data.iloc[:,1].values[0]
      8   fix_name(synonym_name,true_sciname)

IndexError: index 0 is out of bounds for axis 0 with size 0
# Index of the rows whose WFO status is still 'Synonym'
synonym_idx = aspca_df.index[aspca_df['taxonomicStatus'].values == 'Synonym']
print(f'{len(synonym_idx)} entries have a more acceptable synonym')
31 entries have a more acceptable synonym
# Sort and drop again
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first')
aspca_df = aspca_df.sort_values('Scientific Name').reset_index(drop=True).sort_index()
# Set genus of one-word names to be the name, rather than NaN
genus_missing = aspca_df.fillna('Unknown')['genus'] == 'Unknown'
aspca_df.loc[genus_missing, 'genus'] = aspca_df.loc[genus_missing, 'Scientific Name']
# Drop columns we no longer need
wfo_only_columns = ['taxonID', 'scientificName', 'taxonomicStatus', 'acceptedNameUsageID', 'taxonRank']
aspca_df = aspca_df.drop(columns=wfo_only_columns)
# Standardize column names
aspca_df = aspca_df.rename(columns={'genus':'Genus', 'family':'Family'})
# Reorder columns
cols = ['Name', 'Scientific Name', 'Genus', 'Family', 'Alternative Names', 'Toxic to Dogs', 'Toxic to Cats']
aspca_df = aspca_df[cols]
# Write the cleaned table to disk (the index is written as the first column)
aspca_df.to_csv('Plant Toxicity - v6.csv')
aspca_df.sample(10)
Name Scientific Name Genus Family Alternative Names Toxic to Dogs Toxic to Cats
102 Celosia Globosa Celosia globosa Celosia Amaranthaceae Globe Amarantha, Perpetua False False
18 Alocasia Alocasia Alocasia Araceae Elephant's Ear True True
386 Variegated Philodendron Philodendron hederaceum Philodendron Araceae True True
411 American Mandrake Podophyllum peltatum Podophyllum Berberidaceae Mayapple, Indian Apple Root, Umbrella Leaf, Wi... True True
94 Chestnut Castanea dentata Castanea Fagaceae American Chestnut False False
291 Butterfly Iris Iris spuria Iris Iridaceae Spuria Iris True True
243 Climbing Lily Gloriosa superba Gloriosa Colchicaceae Gloriosa Lily, Glory Lily, Superb Lily True True
4 Measles Plant Acanthus Acanthus Acanthaceae Polka Dot Plant, Flamingo Plant, Baby’s Tears,... False False
246 Orange Star Guzmania lingulata var. concolor Guzmania Bromeliaceae False False
420 Algaroba Prosopis limensis Prosopis Fabaceae Kiawe, Mesquite False False
# Show the first five rows of the final table
aspca_df.head()
Name Scientific Name Genus Family Alternative Names Toxic to Dogs Toxic to Cats
0 Sand Verbena Abronia fragrans Abronia Nyctaginaceae Prairie Snowball, Wild Lantana False False
1 Prayer Bean Abrus precatorius Abrus Fabaceae Rosary Pea, Buddhist Rosary Bead, Indian Bead,... True True
2 Copperleaf Acalypha godseffiana Acalypha Euphorbiaceae Lance Copperleaf False False
3 Chenille Plant Acalypha hispida Acalypha Euphorbiaceae Philippine Medusa, Foxtail, Red-hot Cat Tail False False
4 Measles Plant Acanthus Acanthus Acanthaceae Polka Dot Plant, Flamingo Plant, Baby’s Tears,... False False
# Plants whose toxicity flag differs between dogs and cats
aspca_df[aspca_df['Toxic to Dogs'] != aspca_df['Toxic to Cats']]
Name Scientific Name Genus Family Alternative Names Toxic to Dogs Toxic to Cats
262 Day Lilies (many varieties) Hemerocallis Hemerocallis Xanthorrhoeaceae False True
263 Orange Day Lily Hemerocallis graminea Hemerocallis Xanthorrhoeaceae False True
296 Black Walnut Juglans nigra Juglans Juglandaceae True False
317 Lily Lilium Lilium Liliaceae False True
319 Tiger Lily Lilium lancifolium Lilium Liliaceae False True
320 Easter Lily Lilium longiflorum Lilium Liliaceae False True
321 Red Lily Lilium philadelphicum Lilium Liliaceae False True
322 Japanese Show Lily Lilium speciosum Lilium Liliaceae False True
# Aggregate the two toxicity flags per Family (pivot_table's default aggregation,
# i.e. the mean), sort by the dog value and show rows 70-79
aspca_df[['Family','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Family').sort_values(by='Toxic to Dogs')[70:80]
Toxic to Cats Toxic to Dogs
Family
Lauraceae 0.500000 0.500000
Proteaceae 0.500000 0.500000
Convolvulaceae 0.500000 0.500000
Commelinaceae 0.500000 0.500000
Euphorbiaceae 0.600000 0.600000
Fabaceae 0.600000 0.600000
Berberidaceae 0.666667 0.666667
Polygonaceae 0.666667 0.666667
Apiaceae 0.666667 0.666667
Moraceae 0.666667 0.666667
# How many Families have mixed toxicity
# Build the per-Family table once, then keep the Families whose dog value lies
# strictly between 0 and 1
family_rates = aspca_df[['Family','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Family').sort_values(by='Toxic to Dogs')
family_is_mixed = family_rates['Toxic to Dogs'].apply(lambda rate: 0<rate<1)
len(family_rates[family_is_mixed])
33
# How many distinct values the Family column holds
len(aspca_df['Family'].unique())
111
# Aggregate the two toxicity flags per Genus (pivot_table's default aggregation,
# i.e. the mean), sort by the dog value and show rows 208-217
aspca_df[['Genus','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Genus').sort_values(by='Toxic to Dogs')[208:218]
Toxic to Cats Toxic to Dogs
Genus
Schefflera 0.666667 0.666667
Cordyline 0.666667 0.666667
Iris 0.666667 0.666667
Aloe 0.666667 0.666667
Dracaena 0.800000 0.800000
Aralia 1.000000 1.000000
Ficus 1.000000 1.000000
Apocynum 1.000000 1.000000
Sansevieria 1.000000 1.000000
Rumex 1.000000 1.000000
# How many Genuses have mixed toxicity
# Build the per-Genus table once, then keep the Genuses whose dog value lies
# strictly between 0 and 1
genus_rates = aspca_df[['Genus','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Genus').sort_values(by='Toxic to Dogs')
genus_is_mixed = genus_rates['Toxic to Dogs'].apply(lambda rate: 0<rate<1)
len(genus_rates[genus_is_mixed])
9
# How many Genuses (number of rows in the per-Genus pivot table)
len(aspca_df[['Genus','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Genus').sort_values(by='Toxic to Dogs'))
346
# If running in Colabs
# These are IPython shell escapes; the apt/cp steps need root privileges
# Install the Selenium Python package quietly
!pip install selenium -q
!apt-get update # to update ubuntu to correctly run apt install
# Install the chromedriver package, then copy the driver binary into /usr/bin
!apt install chromium-chromedriver -q
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
WARNING: You are using pip version 20.2.3; however, version 20.2.4 is available.
You should consider upgrading via the '/home/gao/anaconda3/bin/python -m pip install --upgrade pip' command.
Reading package lists... Done
E: Could not open lock file /var/lib/apt/lists/lock - open (13: Permission denied)
E: Unable to lock directory /var/lib/apt/lists/
W: Problem unlinking the file /var/cache/apt/pkgcache.bin - RemoveCaches (13: Permission denied)
W: Problem unlinking the file /var/cache/apt/srcpkgcache.bin - RemoveCaches (13: Permission denied)
E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?
cp: cannot stat '/usr/lib/chromium-browser/chromedriver': No such file or directory
import sys
# Kept from the original setup: puts the chromedriver location at the front of sys.path
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

# Import and setup the Selenium webdriver
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
# Run Chrome without a visible window
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# Pass the options through `options=`; the `chrome_options=` keyword is deprecated
wd = webdriver.Chrome('chromedriver',options=chrome_options)