Web Scraping Text and Images with BeautifulSoup: An Example
The code in this notebook comes from the app found here: https://github.com/kenichinakanishi/houseplant_classifier/
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
def getHTMLContent(link):
    html = urlopen(link)
    soup = BeautifulSoup(html, 'html.parser')
    return soup
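The helper above isn't used for the ASPCA pages (which is presumably why the Request calls below set a browser User-Agent), but a quick hypothetical call, with a placeholder URL, shows how it would be used:
# Hypothetical usage of getHTMLContent; the URL below is just a placeholder
example_soup = getHTMLContent('https://example.com')
print(example_soup.title)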
req = Request('https://www.aspca.org/pet-care/animal-poison-control/cats-plant-list', headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
# Soupify the webpage
soup = BeautifulSoup(webpage, 'lxml')
# Search the parse tree for the spans that hold the plant entries;
# the [7:-4] slice trims spans before and after the actual list content
content_list = soup.find_all('span')[7:-4]
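The [7:-4] slice is tied to this particular page layout, so it is worth eyeballing the spans before trusting it; a quick optional check:
# Optional sanity check: inspect the first few spans to confirm where the plant entries start
all_spans = soup.find_all('span')
print(len(all_spans))
for span in all_spans[:10]:
    print(span.get_text(strip=True))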
# Put it in a dataframe for further processing
df_cats = pd.DataFrame(content_list)
# Clean up the strings: pull the plain text out of the span markup (the split/slice offsets are specific to this page's HTML)
df_cats[0] = df_cats[0].apply(lambda x: str(x).split('>')[1][:-3])
df_cats[4] = df_cats[4].apply(lambda x: str(x).split('>')[1][:-3])
df_cats[1] = df_cats[1].apply(lambda x: str(x).split('(')[1][0:-4])
# Get rid of useless columns and rename the columns
df_cats = df_cats.drop(columns=[2,3,5,6]).rename(columns = {0:'Name',1:'Alternative Names',4:'Scientific Name',7:'Family'})
# Separate toxic and non-toxic plants: the page lists toxic plants alphabetically, then restarts the alphabet for non-toxic plants,
# so the first name beginning with 'A' after the toxic block (index > 100) marks where the non-toxic list starts
df_cats['Toxic to Cats'] = True
first_nontoxic_cats = [index for index in df_cats[df_cats['Name'].str.startswith('A')].index if index>100][0]
df_cats.loc[first_nontoxic_cats:,'Toxic to Cats'] = False
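A quick look at the rows around the detected boundary confirms that the alphabet restarts where the non-toxic list begins:
# Sanity check: names just before and after the toxic/non-toxic boundary
print(first_nontoxic_cats)
df_cats.loc[first_nontoxic_cats - 3:first_nontoxic_cats + 3, ['Name', 'Toxic to Cats']]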
df_cats
req = Request('https://www.aspca.org/pet-care/animal-poison-control/dogs-plant-list', headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
soup = BeautifulSoup(webpage, 'lxml') # soupify the webpage
content_list = soup.find_all('span')[7:-4] # Get all the content from the table
df_dogs = pd.DataFrame(content_list) # Put it in a dataframe for processing
# Clean up the strings
df_dogs[0] = df_dogs[0].apply(lambda x: str(x).split('>')[1][:-3])
df_dogs[4] = df_dogs[4].apply(lambda x: str(x).split('>')[1][:-3])
df_dogs[1] = df_dogs[1].apply(lambda x: str(x).split('(')[1][0:-4])
# Get rid of useless columns and rename the columns
df_dogs = df_dogs.drop(columns=[2,3,5,6]).rename(columns = {0:'Name',1:'Alternative Names',4:'Scientific Name',7:'Family'})
# Separate toxic and non-toxic plants (same alphabetical-restart logic as for cats)
df_dogs['Toxic to Dogs'] = True
first_nontoxic_dogs = [index for index in df_dogs[df_dogs['Name'].str.startswith('A')].index if index>100][0]
df_dogs.loc[first_nontoxic_dogs:,'Toxic to Dogs'] = False
# Merge dataframes into one, outer merge used to retain values that only exist on one side
df_catsdogs = df_dogs.merge(df_cats, how='outer', on=['Name','Alternative Names','Scientific Name','Family'])
df_catsdogs = df_catsdogs.fillna('Unknown')
aspca_df = df_catsdogs.copy()
# Assume same toxicity for dogs and cats if unknown
aspca_df['Toxic to Cats'] = aspca_df.apply(lambda x: x['Toxic to Dogs'] if (x['Toxic to Cats'] == 'Unknown') else x['Toxic to Cats'], axis=1)
aspca_df['Toxic to Dogs'] = aspca_df.apply(lambda x: x['Toxic to Cats'] if (x['Toxic to Dogs'] == 'Unknown') else x['Toxic to Dogs'], axis=1)
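To see what the outer merge plus the "assume same toxicity" step does, here is a toy example with made-up plants: one appears only in the dog list, one only in the cat list, and each ends up with the other species' flag copied over.
# Toy illustration (made-up data): outer merge keeps rows that exist on only one side,
# fillna marks the missing flag as 'Unknown', and the apply step copies the known flag across species
toy_dogs = pd.DataFrame({'Name': ['Plant A', 'Plant B'], 'Toxic to Dogs': [True, False]})
toy_cats = pd.DataFrame({'Name': ['Plant B', 'Plant C'], 'Toxic to Cats': [False, True]})
toy = toy_dogs.merge(toy_cats, how='outer', on='Name').fillna('Unknown')
toy['Toxic to Cats'] = toy.apply(lambda x: x['Toxic to Dogs'] if x['Toxic to Cats'] == 'Unknown' else x['Toxic to Cats'], axis=1)
toy['Toxic to Dogs'] = toy.apply(lambda x: x['Toxic to Cats'] if x['Toxic to Dogs'] == 'Unknown' else x['Toxic to Dogs'], axis=1)
toy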
aspca_df.sample(10)
aspca_df = aspca_df.drop_duplicates('Scientific Name') # Get rid of duplicates
aspca_df = aspca_df.reset_index(drop=True).sort_index() # Reset and sort index
aspca_df = aspca_df.drop(aspca_df[aspca_df['Scientific Name'].isin(['','NONE LISTED'])].index,axis=0).reset_index(drop=True).sort_index() # Drop entries whose scientific name is blank or 'NONE LISTED'
# Ensure consistent capitalization for each scientific name
def normalize_capitalization(x):
    first_word, rest = x.split()[0], x.split()[1:]
    first_word = [first_word.capitalize()]
    rest = [word.lower() for word in rest]
    return ' '.join(first_word+rest)
# Collapse repeated species listed under different names by dropping trailing 'sp.'/'spp.' designators
# (note: the '' join concatenates any remaining words; the few names this mangles are handled by the manual fix_name calls later)
def species_normalizer(word):
    if word.split()[-1] in ['sp','species','spp','sp.','spp.']:
        word = ''.join(word.split()[:-1])
    return word
# Remove 'cv' from names, as it is an outdated way of referring to cultivars
def cv_remover(word):
    if ' cv ' in word:
        word = word.replace(' cv ',' ')
    return word
# Remove 'var.' from names
def var_remover(word):
    if ' var. ' in word:
        word = word.replace(' var. ',' ')
    return word
# Apply each of the functions
aspca_df['Scientific Name'] = aspca_df['Scientific Name'].apply(normalize_capitalization)
aspca_df['Scientific Name'] = aspca_df['Scientific Name'].apply(species_normalizer)
aspca_df['Scientific Name'] = aspca_df['Scientific Name'].apply(cv_remover)
aspca_df['Scientific Name'] = aspca_df['Scientific Name'].apply(var_remover)
# Remove special characters
aspca_df['Scientific Name'] = aspca_df['Scientific Name'].apply(lambda x: ''.join([character for character in x if character.isalnum() or character.isspace()]))
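As a quick check of the cleaning chain, here is what the helpers do to a couple of deliberately messy, made-up names, applied in the same order as above:
# Demonstration on made-up raw names (not from the dataset)
for messy in ['DRACAENA Marginata cv Tricolor', 'Neoregelia sp.']:
    cleaned = var_remover(cv_remover(species_normalizer(normalize_capitalization(messy))))
    cleaned = ''.join([character for character in cleaned if character.isalnum() or character.isspace()])
    print(f'{messy!r} -> {cleaned!r}')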
# Reset dataframe for further processing
aspca_df = aspca_df.sort_values('Scientific Name').drop_duplicates('Scientific Name')
aspca_df = aspca_df.reset_index(drop=True).sort_index()
aspca_df.sample(10)
# Load the World Flora Online (WFO) classification backbone
use_cols = ['scientificName','taxonRank','family','genus','taxonomicStatus','taxonID', 'acceptedNameUsageID']
wfo_df = pd.read_csv('../classification.txt', sep='\t', lineterminator='\n', usecols=use_cols)
wfo_df = wfo_df.sort_values('taxonomicStatus')
wfo_df.sample(10)
# Don't need this column, we trust the WFO database more
aspca_df.drop('Family', axis=1, inplace=True)
# Merge dataframes together to get trusted info
aspca_df = aspca_df.merge(wfo_df, how = 'left', left_on = ['Scientific Name'], right_on = ['scientificName'])
# Sort by taxonomicStatus and drop duplicates, keeping the first occurrence so accepted names take priority
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first').reset_index(drop=True)
# Fill NaN's with Unknown
aspca_df = aspca_df.fillna('Unknown')
# Clean up scientific names that remain unknown due to misspellings or other mismatches
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first').reset_index(drop=True)
unknown_idx = aspca_df[aspca_df.taxonomicStatus == 'Unknown'].index
print(len(unknown_idx))
def get_closest_name(unknown_name, name_df=wfo_df, name_col='scientificName', threshold=0.9, verbose=False):
    """ Matches an 'unknown_name' against accepted names in a 'name_df'. Returns the closest name that is above a 'threshold' of closeness.

    Parameters
    ----------
    unknown_name: str
        Name we want to match against accepted names.
    name_df: DataFrame
        DataFrame containing accepted names.
    name_col: str, name of name_df column
        DataFrame column containing accepted names.
    threshold: float
        How closely the unknown_name needs to match an accepted name.
        If above this threshold, the name is added to a dictionary of possible names.
    verbose: bool
        Whether to print the entire dictionary of possible names.

    Returns
    ----------
    str
        Closest name to 'unknown_name' that was above the given 'threshold', or '' if none was found.
    """
    import operator
    from difflib import SequenceMatcher

    def similar(a, b):
        return SequenceMatcher(None, a, b).ratio()

    poss_names = {}
    # Only look through entries with the same first letter to save time
    for true_sciname in name_df[name_df[name_col].str.startswith(unknown_name[0])][name_col].values:
        similar_score = similar(unknown_name, true_sciname)
        if similar_score > threshold:
            poss_names[true_sciname] = similar_score
    if verbose:
        print(poss_names)
    # If the dict is empty, no candidate cleared the threshold
    if not poss_names:
        print(f'No names close enough to {unknown_name}.')
        return ''
    else:
        best_name, best_score = max(poss_names.items(), key=operator.itemgetter(1))
        print(f'{unknown_name} is closest to {best_name}, with a score of {best_score:.2f}')
        return best_name
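A hypothetical call with a made-up misspelling shows the kind of match the function reports:
# Hypothetical example (made-up misspelling, not necessarily in the dataset)
get_closest_name('Dracena marginata')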
def fix_name(unknown_name, true_name):
    """ Fixes the aspca_df entries according to the accepted wfo_df entry.

    Parameters
    ----------
    unknown_name: str
        Name we want to fix.
    true_name: str
        Accepted name to use.
    """
    # Get the rows we're looking to change
    unknown_data = aspca_df[aspca_df['Scientific Name'] == unknown_name]
    # Grab accepted data from the wfo database based on a name lookup
    true_data = wfo_df[wfo_df['scientificName'] == true_name]
    true_sciname = true_data.loc[:,'scientificName'].values[0]
    true_family = true_data.loc[:,'family'].values[0]
    true_genus = true_data.loc[:,'genus'].values[0]
    true_taxonomicStatus = true_data.loc[:,'taxonomicStatus'].values[0]
    # Change scientific name, family, genus and taxonomic status to the accepted versions
    aspca_df.iloc[unknown_data.index,2] = true_sciname
    aspca_df.iloc[unknown_data.index,8] = true_family
    aspca_df.iloc[unknown_data.index,9] = true_genus
    aspca_df.iloc[unknown_data.index,10] = true_taxonomicStatus
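The positional iloc writes above (columns 2, 8, 9 and 10) depend on the exact column order at this point in the notebook; a label-based variant, shown here only as a sketch and not used below, would be more defensive:
# Sketch of a label-based alternative to the positional writes in fix_name
# (assumes the lowercase 'family'/'genus'/'taxonomicStatus' columns from the WFO merge are present)
def fix_name_by_label(unknown_name, true_name):
    mask = aspca_df['Scientific Name'] == unknown_name
    true_data = wfo_df[wfo_df['scientificName'] == true_name].iloc[0]
    aspca_df.loc[mask, 'Scientific Name'] = true_data['scientificName']
    aspca_df.loc[mask, 'family'] = true_data['family']
    aspca_df.loc[mask, 'genus'] = true_data['genus']
    aspca_df.loc[mask, 'taxonomicStatus'] = true_data['taxonomicStatus']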
unknown_idx = aspca_df[aspca_df.taxonomicStatus == 'Unknown'].index
print(f'{len(unknown_idx)} plants currently cannot be matched.')
from tqdm.notebook import tqdm
for i in tqdm(unknown_idx):
    unknown_name = aspca_df.iloc[i,2]
    closest_name = get_closest_name(unknown_name)
    if closest_name == '':
        continue
    fix_name(unknown_name, closest_name)
# Scientific names that don't match anything on record automatically
unknown_df = aspca_df[aspca_df.taxonomicStatus == 'Unknown']
# Synonyms that don't have a database link to the accepted name
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first').reset_index(drop=True)
unknown_ids = aspca_df[(aspca_df.acceptedNameUsageID == 'Unknown') & (aspca_df.taxonomicStatus == 'Synonym')]
len(unknown_ids) + len(unknown_df)
# Manually fix some scientific names that don't match anything on record automatically
fix_name('Nephrolepsis cordifolia plumosa', 'Nephrolepis cordifolia')
fix_name('Nephrolepsis cordifolia duffii', 'Nephrolepis cordifolia')
fix_name('Nephrolepis exalta bostoniensis', 'Nephrolepis exaltata')
fix_name('Neoregalia', 'Neoregelia')
fix_name('Miltonia roezlii alba', 'Miltonia roezlii')
fix_name('Maranta insignis', 'Calathea insignis')
fix_name('Lilium orientalis', 'Lilium japonicum')
fix_name('Lampranthus piquet', 'Lampranthus piquetbergensis')
fix_name('Hoya carnosa krinkle kurl', 'Hoya carnosa')
fix_name('Hemigraphis exotica', 'Hemigraphis alternata')
fix_name('Lilium asiatica', 'Lilium japonicum')
fix_name('Nolina tuberculata', 'Beaucarnea recurvata')
fix_name('Giant dracaena', 'Cordyline australis')
fix_name('Scindapsusphilodendron', 'Philodendron scandens')
fix_name('Schefflera or brassia actinoplylla', 'Schefflera actinophylla')
fix_name('Phoenix robellinii', 'Phoenix roebelenii')
fix_name('Peperomia serpens variegata', 'Peperomia serpens')
fix_name('Bertolonia mosaica', 'Fittonia albivenis')
fix_name('Begonia semperflorens cultivar', 'Begonia semperflorens')
fix_name('Begonia rex peace', 'Begonia rex')
fix_name('Asparagus densiflorus sprengeri', 'Asparagus densiflorus')
fix_name('Albiflora', 'Tradescantia zebrina')
fix_name('Acantha', 'Acanthus')
fix_name('Episcia cultivar', 'Episcia')
fix_name('Echevaria', 'Echeveria')
fix_name('Echeveria puloliver', 'Echeveria harmsii')
fix_name('Dypsis lutescens chrysalidocarpus lutescens alternate scientific name', 'Dypsis lutescens')
fix_name('Draceana', 'Dracaena')
fix_name('Daucus carota sativa', 'Daucus carota')
fix_name('Ceratostigma larpentiae', 'Ceratostigma plumbaginoides')
fix_name('Cycasrevolutazamia', 'Cycas revoluta')
fix_name('Cucurbita maxima turbaniformis', 'Cucurbita maxima')
fix_name('Cucurbita maxima hubbard', 'Cucurbita maxima')
fix_name('Cucurbita maxima butternut', 'Cucurbita maxima')
fix_name('Cucurbita maxima banana', 'Cucurbita maxima')
fix_name('Cucurbita maxima buttercup', 'Cucurbita maxima')
fix_name('Cucurbia pepo zucchini', 'Cucurbita pepo')
fix_name('Cryptanthus bivattus minor', 'Cryptanthus bivittatus')
fix_name('Cycasandzamia', 'Cycas')
# Manually match up synonyms that don't have a database link to the accepted name
fix_name('Chlorophytum bichetii', 'Chlorophytum laxum')
fix_name('Rhapis flabelliformis', 'Rhapis excelsa')
fix_name('Cleome hassleriana', 'Cleome spinosa')
fix_name('Pellionia pulchra', 'Pellionia repens')
fix_name('Cissus discolor', 'Cissus javana')
fix_name('Miltonia roezlii', 'Miltoniopsis roezlii')
fix_name('Sorghum vulgare var. sudanense', 'Sorghum bicolor')
fix_name('Camellia japonica var. japonica', 'Camellia japonica')
fix_name('Onychium japonicum', 'Onychium japonicum')
fix_name('Epidendrum atropurpureum', 'Psychilis atropurpurea')
fix_name('Philodendron scandens', 'Philodendron hederaceum')
fix_name('Origanum vulgare var. hirtum', 'Origanum vulgare subsp. hirtum')
fix_name('Guzmania lingulata var. minor', 'Guzmania lingulata var. concolor')
fix_name('Lavandula angustifolia', 'Lavandula angustifolia')
fix_name('Begonia semperflorens', 'Begonia cucullata')
fix_name('Calathea insignis', 'Calathea crotalifera')
fix_name('Citrus ×limonia', 'Citrus limon')
fix_name('Coleus amboinicus', 'Plectranthus amboinicus')
fix_name('Rhipsalis cassytha', 'Rhipsalis dichotoma')
fix_name('Lycopersicon', 'Solanum lycopersicum')
fix_name('Lachenalia lilacina', 'Iris domestica')
fix_name('Cymopterus watsonii', 'Cymopterus terebinthinus')
# Re-check after the manual fixes: scientific names that still don't match anything on record
unknown_df = aspca_df[aspca_df.taxonomicStatus == 'Unknown']
# Synonyms that still don't have a database link to the accepted name
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first').reset_index(drop=True)
unknown_ids = aspca_df[(aspca_df.acceptedNameUsageID == 'Unknown') & (aspca_df.taxonomicStatus == 'Synonym')]
len(unknown_ids) + len(unknown_df)
synonym_idx = aspca_df[aspca_df['taxonomicStatus'].values == 'Synonym'].index
print(f'{len(synonym_idx)} entries are synonyms of an accepted name')
# Work to update the remaining scientific names that are synonyms for their accepted scientific names
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first').reset_index(drop=True)
synonym_idx = aspca_df[aspca_df['taxonomicStatus'].values == 'Synonym'].index
for i in synonym_idx:
    # Get the series we're looking to change
    synonym_data = aspca_df.iloc[i,:]
    synonym_name = synonym_data.loc['Scientific Name']
    # Grab accepted data from wfo database based on ID lookup
    true_data = wfo_df[wfo_df['taxonID'] == synonym_data.loc['acceptedNameUsageID']]
    true_sciname = true_data.loc[:,'scientificName'].values[0]
    fix_name(synonym_name, true_sciname)
synonym_idx = aspca_df[aspca_df['taxonomicStatus'].values == 'Synonym'].index
print(f'{len(synonym_idx)} entries remain synonyms of an accepted name')
# Sort and drop again
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first')
aspca_df = aspca_df.sort_values('Scientific Name').reset_index(drop=True).sort_index()
# Set genus of one-word names to be the name, rather than NaN
aspca_df.loc[aspca_df.fillna('Unknown')['genus']=='Unknown', 'genus'] = aspca_df.loc[aspca_df.fillna('Unknown')['genus']=='Unknown', 'Scientific Name']
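A toy version of the same backfill pattern, on made-up data, makes the intent clearer:
# Toy illustration (made-up data) of the genus backfill used above
toy = pd.DataFrame({'Scientific Name': ['Acanthus', 'Acanthus mollis'], 'genus': [np.nan, 'Acanthus']})
toy.loc[toy.fillna('Unknown')['genus'] == 'Unknown', 'genus'] = toy.loc[toy.fillna('Unknown')['genus'] == 'Unknown', 'Scientific Name']
toy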
# Drop columns we no longer need
aspca_df = aspca_df.drop(['taxonID', 'scientificName', 'taxonomicStatus', 'acceptedNameUsageID', 'taxonRank'], axis=1)
# Standardize column names
aspca_df.rename(columns = {'genus':'Genus', 'family':'Family'}, inplace=True)
# Reorder columns
cols = ['Name', 'Scientific Name', 'Genus', 'Family', 'Alternative Names', 'Toxic to Dogs', 'Toxic to Cats']
aspca_df = aspca_df[cols]
aspca_df.to_csv('Plant Toxicity - v6.csv')
aspca_df.sample(10)
aspca_df.head()
aspca_df[aspca_df['Toxic to Dogs'] != aspca_df['Toxic to Cats']]
aspca_df[['Family','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Family').sort_values(by='Toxic to Dogs')[70:80]
# How many Families have mixed toxicity
family_pivot = aspca_df[['Family','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Family').sort_values(by='Toxic to Dogs')
len(family_pivot[family_pivot['Toxic to Dogs'].apply(lambda x: 0<x<1)])
# How many Families
len(aspca_df['Family'].unique())
aspca_df[['Genus','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Genus').sort_values(by='Toxic to Dogs')[208:218]
# How many genera have mixed toxicity
genus_pivot = aspca_df[['Genus','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Genus').sort_values(by='Toxic to Dogs')
len(genus_pivot[genus_pivot['Toxic to Dogs'].apply(lambda x: 0<x<1)])
# How many genera in total
len(genus_pivot)
# If running in Colab
!pip install selenium -q
!apt-get update # update Ubuntu's package lists so the apt install below works correctly
!apt install chromium-chromedriver -q
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
# Import and set up the Selenium webdriver
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('chromedriver', options=chrome_options)
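From here the driver is used to scrape plant images; a minimal sketch of pulling image URLs from a rendered page, with a placeholder URL rather than the notebook's actual image source:
# Minimal sketch: load a page with the headless driver and collect image URLs (placeholder URL)
from selenium.webdriver.common.by import By
wd.get('https://example.com')
image_urls = [img.get_attribute('src') for img in wd.find_elements(By.TAG_NAME, 'img')]
print(len(image_urls))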