Text Summarizer in Python: A Tale of Two Cities
Credit: code from https://github.com/louisteo9/personal-text-summarizer
# Natural Language Tool Kit (NLTK)
import nltk
nltk.download('stopwords')
# Regular Expression for text preprocessing
import re
# Heap (priority) queue algorithm to get the top sentences
import heapq
# NumPy for numerical computing
import numpy as np
# pandas for creating DataFrames
import pandas as pd
# matplotlib for plot
from matplotlib import pyplot as plt
%matplotlib inline
# requests to fetch the book from Project Gutenberg (re was already imported above)
import requests
r = requests.get("https://www.gutenberg.org/files/98/98-0.txt")
r.encoding = 'utf-8'  # the file is UTF-8; set it explicitly in case requests guesses a different charset
raw_text = r.text
print(raw_text[0:1000])
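Note that the raw download includes Project Gutenberg's license header and footer, which would otherwise leak into the word counts. Below is a minimal sketch for trimming them, assuming the standard *** START/END OF THE PROJECT GUTENBERG EBOOK *** markers are present (older editions use slightly different wording, so the check is guarded):
# strip the Gutenberg boilerplate, assuming the standard START/END markers
start = raw_text.find('*** START OF THE PROJECT GUTENBERG EBOOK')
end = raw_text.find('*** END OF THE PROJECT GUTENBERG EBOOK')
if start != -1 and end != -1:
    # skip past the START marker's own line before slicing
    raw_text = raw_text[raw_text.find('\n', start) + 1:end]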
text = raw_text
text = re.sub(r'\[[0-9]*\]', ' ', text) # replace reference numbers, e.g. [1], [10], [20], with a space, if any
text = re.sub(r'\s+', ' ', text) # collapse one or more whitespace characters into a single space
#print(text)
Next, we form a clean, lower-case version of the text (without special characters, digits, and extra spaces) and split it into individual words, for word score computation and formation of the word histogram.
The reason for forming a clean text is so that the algorithm won't treat, e.g., "Understanding" and "understanding" as two different words.
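As a quick illustration before the real cleaning below (the sample string is ours):
# without lower-casing, the same word appears under two keys
sample = "Understanding is rare; understanding matters."
print(set(re.sub(r'\W', ' ', sample).split()))          # both 'Understanding' and 'understanding'
print(set(re.sub(r'\W', ' ', sample.lower()).split()))  # only 'understanding'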
# generate clean text
clean_text = text.lower() # convert all uppercase characters to lowercase
clean_text = re.sub(r'\W', ' ', clean_text) # replace any character other than [a-zA-Z0-9_] with a space
clean_text = re.sub(r'\d', ' ', clean_text) # replace digits with a space
clean_text = re.sub(r'\s+', ' ', clean_text) # collapse one or more spaces into a single space
#print(clean_text)
We split (tokenize) the text into sentences with NLTK's sent_tokenize() method. We will then evaluate the importance of each sentence and decide whether to include it in our summary.
# download the Punkt tokenizer models that sent_tokenize() needs (must happen before tokenizing)
nltk.download('punkt')
# split (tokenize) the text into sentences
sentences = nltk.sent_tokenize(text)
#print(sentences)
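A quick sanity check on the result (exact counts depend on the edition downloaded):
print(len(sentences))   # number of sentences found
print(sentences[:3])    # first few sentences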
Remove stop words
Stop words are common English words that do not add much meaning to a sentence; they can be safely ignored without sacrificing the meaning. We already downloaded the file with English stop words in the first section of the notebook.
Here, we get the list of stop words and store it in the stop_words variable.
# get stop words list
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words)
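To see the effect, here is the filter applied to the book's famous opening (sample usage; output shown in the comment):
sample = "It was the best of times, it was the worst of times"
print([w for w in nltk.word_tokenize(sample.lower()) if w not in stop_words])
# ['best', 'times', ',', 'worst', 'times'] -- punctuation tokens survive, which is
# why the counting loop below tokenizes clean_text rather than the raw text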
# create an empty dictionary to house the word count
word_count = {}
# loop through tokenized words, remove stop words and save word count to dictionary
for word in nltk.word_tokenize(clean_text):
    # skip stop words
    if word not in stop_words:
        # save the word count to the dictionary
        if word not in word_count:
            word_count[word] = 1
        else:
            word_count[word] += 1
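The same histogram can be built more idiomatically with collections.Counter; converting stop_words to a set also makes each membership test O(1) instead of a list scan. Both are optional refinements, not part of the original code:
# equivalent one-liner using Counter and a stop-word set
from collections import Counter

stop_set = set(stop_words)
word_count_alt = Counter(w for w in nltk.word_tokenize(clean_text) if w not in stop_set)
assert word_count_alt == word_count  # same counts as the loop above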
Let's plot the word histogram and see the results.
def plot_top_words(word_count_dict, show_top_n=20):
    word_count_table = pd.DataFrame.from_dict(word_count_dict, orient='index').rename(columns={0: 'score'})
    word_count_table.sort_values(by='score').tail(show_top_n).plot(kind='barh', figsize=(10, 10))
    plt.show()

plot_top_words(word_count, 20)
# create an empty dictionary to house the sentence score
sentence_score = {}
# loop through the tokenized sentences; only sentences with fewer than 30 words are scored, by adding up their word scores
for sentence in sentences:
    for word in nltk.word_tokenize(sentence.lower()):
        # check if the word is in the word_count dictionary
        if word in word_count:
            # only take sentences that have fewer than 30 words
            if len(sentence.split(' ')) < 30:
                # add the word score to the sentence score
                if sentence not in sentence_score:
                    sentence_score[sentence] = word_count[word]
                else:
                    sentence_score[sentence] += word_count[word]
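One caveat of scoring with raw counts is that a handful of very frequent words can dominate. A common refinement (not used above, shown only for comparison) is to normalize each count by the maximum frequency before summing:
# weighted frequencies in [0, 1]; substitute weighted_freq for word_count
# in the scoring loop to try this variant
max_count = max(word_count.values())
weighted_freq = {w: c / max_count for w, c in word_count.items()}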
# tabulate the sentence scores, highest first
df_sentence_score = pd.DataFrame.from_dict(sentence_score, orient='index').rename(columns={0: 'score'})
df_sentence_score.sort_values(by='score', ascending=False)
# use a heap-backed selection to pull the 3 highest-scoring sentences
best_sentences = heapq.nlargest(3, sentence_score, key=sentence_score.get)
print('SUMMARY')
print('------------------------')
# print the best sentences in their original order of appearance
for sentence in sentences:
    if sentence in best_sentences:
        print(sentence)
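Finally, the whole pipeline fits in one reusable function. This is a minimal sketch of the steps above (the function name and defaults are ours; it reuses the re, heapq, and nltk imports from earlier, and prints sentences by score rather than original order):
def summarize(raw, top_n=3, max_words=30):
    # light cleanup for sentence splitting: drop reference numbers, collapse whitespace
    body = re.sub(r'\s+', ' ', re.sub(r'\[[0-9]*\]', ' ', raw))
    # aggressive cleanup for counting: lowercase, strip non-word characters and digits
    clean = re.sub(r'\s+', ' ', re.sub(r'\d', ' ', re.sub(r'\W', ' ', body.lower())))
    # word histogram without stop words
    stops = set(nltk.corpus.stopwords.words('english'))
    counts = {}
    for w in nltk.word_tokenize(clean):
        if w not in stops:
            counts[w] = counts.get(w, 0) + 1
    # score short sentences by summing their word counts
    scores = {}
    for s in nltk.sent_tokenize(body):
        if len(s.split(' ')) < max_words:
            for w in nltk.word_tokenize(s.lower()):
                if w in counts:
                    scores[s] = scores.get(s, 0) + counts[w]
    return heapq.nlargest(top_n, scores, key=scores.get)

print('\n'.join(summarize(raw_text)))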