Using Dask's dask.bag and regex to parse Notes from the Underground from Project Gutenberg
This post includes code from Scalable-Data-Analysis-in-Python-with-Dask and coiled-examples.
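If you want to follow along, the only dependencies are Dask's bag module and spaCy. A minimal setup sketch, assuming a plain pip environment:
python -m pip install "dask[bag]" spacy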
import dask.bag as db
import re
# from_url yields the file as a bag of lines, each one a raw bytes object
book_bag = db.from_url('https://www.gutenberg.org/cache/epub/600/pg600.txt')
book_bag.take(5)
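If you would rather not re-download the book on every run, here is a minimal sketch of the same starting point, assuming you have saved the file locally as pg600.txt. Note that read_text yields str lines, whereas from_url yields raw bytes, so the decode step below would be unnecessary on this bag:
# Assumption: pg600.txt has been downloaded next to this script
local_bag = db.read_text('pg600.txt')
local_bag.take(5)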
# strip leading/trailing whitespace (including newlines) from each bytes line
remove_spaces = book_bag.map(lambda x: x.strip())
remove_spaces.take(10)
def decode_to_ascii(x):
    return x.decode("ascii", "ignore")
ascii_text = remove_spaces.map(decode_to_ascii)
ascii_text.take(10)
def remove_punctuation(x):
    return re.sub(r'[^\w\s]', '', x)
# name the result bag differently so it does not shadow the function
no_punctuation = ascii_text.map(remove_punctuation)
no_punctuation.take(10)
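To see what the regex does on its own, here it is applied to the book's opening words; the character class [^\w\s] matches anything that is neither a word character nor whitespace, so only punctuation is removed:
re.sub(r'[^\w\s]', '', 'I am a sick man.... I am a spiteful man.')
# 'I am a sick man I am a spiteful man'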
lower_text = no_punctuation.map(str.lower)
lower_text.take(10)
# split on single spaces; runs of spaces leave '' entries, cleaned up next
split_word_list = lower_text.map(lambda x: x.split(' '))
split_word_list.take(10)
def remove_empty_words(word_list):
    return list(filter(lambda a: a != '', word_list))
# note: map, not filter -- we want to transform each line's
# word list, not drop whole lines from the bag
non_empty_words = split_word_list.map(remove_empty_words)
non_empty_words.take(10)
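A quick toy-bag sketch of why map is the right method here rather than filter: filter only keeps or drops whole elements, based on the truthiness of the function's return value:
toy = db.from_sequence([['a', '', 'b'], []])
toy.map(remove_empty_words).compute()     # [['a', 'b'], []]
toy.filter(remove_empty_words).compute()  # [['a', '', 'b']] -- the empty words survive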
all_words = non_empty_words.flatten()
type(all_words)
all_words.take(30)
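flatten concatenates the per-line word lists into one long bag of individual words; on a toy bag:
db.from_sequence([['i', 'am'], ['a', 'sick', 'man']]).flatten().compute()
# ['i', 'am', 'a', 'sick', 'man']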
# classic MapReduce-style word count: emit a (word, 1) pair per word
change_to_key_value = all_words.map(lambda x: (x, 1))
change_to_key_value.take(4)
# groupby gathers every occurrence of each word into a
# (word, [word, word, ...]) group, which requires a full shuffle
grouped_words = all_words.groupby(lambda x: x)
grouped_words.take(1)
word_count = grouped_words.map(lambda x: (x[0], len(x[1])))
word_count.take(10)
change_to_key_value.take(10)
# binop: fold each (word, 1) pair into the running
# count for that word; foldby needs the initial value
# of the count to be provided (the 0 in the call below)
def add_bin_op(count, x):
    return count + x[1]
# combine: add the partial counts produced by binop
# on different partitions to get the total count
# for a word
def add_combine_op(x, y):
    return x + y
word_count = change_to_key_value.foldby(lambda x: x[0],
                                        add_bin_op, 0,
                                        add_combine_op)
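To make the binop/combine split concrete, here is a hand-worked sketch (using functools.reduce) of what foldby computes for a single key, assuming its pairs landed in two different partitions:
from functools import reduce
part1 = [('man', 1), ('man', 1)]
part2 = [('man', 1)]
partials = [reduce(add_bin_op, part1, 0), reduce(add_bin_op, part2, 0)]  # [2, 1]
reduce(add_combine_op, partials)  # 3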
word_count.take(10)
# frequencies() does this whole map/foldby word count in one call
much_easier = all_words.frequencies()
much_easier.take(10)
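On a toy bag, frequencies produces the same (word, count) pairs; output order is not guaranteed:
db.from_sequence(['a', 'b', 'a']).frequencies().compute()
# [('a', 2), ('b', 1)]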
from spacy.lang.en.stop_words import STOP_WORDS
without_stopwords = all_words.filter(lambda x: x not in STOP_WORDS)
new_freq = without_stopwords.frequencies()
new_freq.take(20)
# Without a key, topk compares the (word, count) tuples themselves,
# so it ranks alphabetically by word rather than by count
new_freq.topk(10)
# Keying on the count gives the 10 most frequent non-stopwords
new_freq.topk(10, key=lambda x: x[1]).compute()