import dask.bag as db
import re
# Load the Project Gutenberg text of "The Brothers Karamazov" into a dask bag.
book_bag = db.from_url('https://www.gutenberg.org/files/28054/28054-0.txt')
# Peek at the first five elements (raw byte strings, one per line of the file).
book_bag.take(5)
(b'\xef\xbb\xbfThe Project Gutenberg EBook of The Brothers Karamazov by Fyodor\r\n',
 b'Dostoyevsky\r\n',
 b'\r\n',
 b'\r\n',
 b'\r\n')
# Trim leading and trailing whitespace (including the '\r\n' line endings)
# from every raw byte-string line.
remove_spaces = book_bag.map(bytes.strip)
remove_spaces.take(10)
(b'\xef\xbb\xbfThe Project Gutenberg EBook of The Brothers Karamazov by Fyodor',
 b'Dostoyevsky',
 b'',
 b'',
 b'',
 b'This ebook is for the use of anyone anywhere in the United States and most',
 b'other parts of the world at no cost and with almost no restrictions',
 b'whatsoever. You may copy it, give it away or re\xe2\x80\x90use it under the terms of',
 b'the Project Gutenberg License included with this eBook or online at',
 b'http://www.gutenberg.org/license. If you are not located in the United')
def decode_to_ascii(x):
    """Decode a byte string to text, silently dropping non-ASCII bytes.

    Args:
        x: Byte string to decode.

    Returns:
        The decoded ``str``. Any byte that is not valid ASCII (for example
        the bytes of a UTF-8 byte-order mark) is discarded instead of
        raising an error.
    """
    decoded = x.decode(encoding="ascii", errors="ignore")
    return decoded
# Convert every byte-string line to text, discarding non-ASCII bytes.
ascii_text = remove_spaces.map(decode_to_ascii)
ascii_text.take(10)
('The Project Gutenberg EBook of The Brothers Karamazov by Fyodor',
 'Dostoyevsky',
 '',
 '',
 '',
 'This ebook is for the use of anyone anywhere in the United States and most',
 'other parts of the world at no cost and with almost no restrictions',
 'whatsoever. You may copy it, give it away or reuse it under the terms of',
 'the Project Gutenberg License included with this eBook or online at',
 'http://www.gutenberg.org/license. If you are not located in the United')
def remove_punctuation(x):
    """Strip punctuation from a line of text.

    Every character that is not a letter, digit or whitespace is deleted
    (not replaced by a space), so ``"don't"`` becomes ``"dont"``.

    Args:
        x: The text to clean.

    Returns:
        ``x`` with all punctuation characters, including underscores,
        removed.
    """
    # ``\w`` also matches the underscore, so ``[^\w\s]`` alone would leave
    # ``_`` in place; match it explicitly so it is removed like any other
    # punctuation character.
    return re.sub(r'[^\w\s]|_', '', x)
# Apply the punctuation stripper to every line.
# NOTE(review): this assignment rebinds the name `remove_punctuation` from the
# function defined above to the resulting bag, so the function is no longer
# reachable by that name afterwards.
remove_punctuation = ascii_text.map(remove_punctuation)
remove_punctuation.take(10)
('The Project Gutenberg EBook of The Brothers Karamazov by Fyodor',
 'Dostoyevsky',
 '',
 '',
 '',
 'This ebook is for the use of anyone anywhere in the United States and most',
 'other parts of the world at no cost and with almost no restrictions',
 'whatsoever You may copy it give it away or reuse it under the terms of',
 'the Project Gutenberg License included with this eBook or online at',
 'httpwwwgutenbergorglicense If you are not located in the United')
# Lower-case every line so differently-cased spellings of a word compare equal.
lower_text = remove_punctuation.map(str.lower)
lower_text.take(10)
('the project gutenberg ebook of the brothers karamazov by fyodor',
 'dostoyevsky',
 '',
 '',
 '',
 'this ebook is for the use of anyone anywhere in the united states and most',
 'other parts of the world at no cost and with almost no restrictions',
 'whatsoever you may copy it give it away or reuse it under the terms of',
 'the project gutenberg license included with this ebook or online at',
 'httpwwwgutenbergorglicense if you are not located in the united')
# Split each line into a list of words on single space characters; an empty
# line becomes [''].
split_word_list = lower_text.map(lambda x: x.split(' '))
split_word_list.take(10)
(['the',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'the',
  'brothers',
  'karamazov',
  'by',
  'fyodor'],
 ['dostoyevsky'],
 [''],
 [''],
 [''],
 ['this',
  'ebook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'in',
  'the',
  'united',
  'states',
  'and',
  'most'],
 ['other',
  'parts',
  'of',
  'the',
  'world',
  'at',
  'no',
  'cost',
  'and',
  'with',
  'almost',
  'no',
  'restrictions'],
 ['whatsoever',
  'you',
  'may',
  'copy',
  'it',
  'give',
  'it',
  'away',
  'or',
  'reuse',
  'it',
  'under',
  'the',
  'terms',
  'of'],
 ['the',
  'project',
  'gutenberg',
  'license',
  'included',
  'with',
  'this',
  'ebook',
  'or',
  'online',
  'at'],
 ['httpwwwgutenbergorglicense',
  'if',
  'you',
  'are',
  'not',
  'located',
  'in',
  'the',
  'united'])
def remove_empty_words(word_list):
    """Return the entries of ``word_list`` that are not the empty string.

    Args:
        word_list: Iterable of words, possibly containing ``''`` entries.

    Returns:
        A new list with every ``''`` entry left out; the order of the
        remaining words is unchanged.
    """
    return [word for word in word_list if word != '']

# Strip the empty strings out of every line's word list, then drop the lines
# that have no words left.  `remove_empty_words` returns a list, so it has to
# be applied with `map`; used as a `filter` predicate it would only discard
# all-empty lines and leave '' entries inside the remaining ones.
non_empty_words = split_word_list.map(remove_empty_words).filter(bool)
non_empty_words.take(10)
(['the',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'the',
  'brothers',
  'karamazov',
  'by',
  'fyodor'],
 ['dostoyevsky'],
 ['this',
  'ebook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'in',
  'the',
  'united',
  'states',
  'and',
  'most'],
 ['other',
  'parts',
  'of',
  'the',
  'world',
  'at',
  'no',
  'cost',
  'and',
  'with',
  'almost',
  'no',
  'restrictions'],
 ['whatsoever',
  'you',
  'may',
  'copy',
  'it',
  'give',
  'it',
  'away',
  'or',
  'reuse',
  'it',
  'under',
  'the',
  'terms',
  'of'],
 ['the',
  'project',
  'gutenberg',
  'license',
  'included',
  'with',
  'this',
  'ebook',
  'or',
  'online',
  'at'],
 ['httpwwwgutenbergorglicense',
  'if',
  'you',
  'are',
  'not',
  'located',
  'in',
  'the',
  'united'],
 ['states',
  'youll',
  'have',
  'to',
  'check',
  'the',
  'laws',
  'of',
  'the',
  'country',
  'where',
  'you',
  'are',
  'located'],
 ['before', 'using', 'this', 'ebook'],
 ['title', 'the', 'brothers', 'karamazov'])
# Flatten the per-line word lists into a single bag of individual words.
all_words = non_empty_words.flatten()
# Confirm that the flattened result is still a dask bag.
type(all_words)
dask.bag.core.Bag
# Show the first 30 words of the flattened bag.
all_words.take(30)
('the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'the',
 'brothers',
 'karamazov',
 'by',
 'fyodor',
 'dostoyevsky',
 'this',
 'ebook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'in',
 'the',
 'united',
 'states',
 'and',
 'most',
 'other',
 'parts',
 'of',
 'the')
# Pair every word with a count of 1, giving (word, 1) tuples.
change_to_key_value = all_words.map(lambda x: (x, 1))
change_to_key_value.take(4)
(('the', 1), ('project', 1), ('gutenberg', 1), ('ebook', 1))
# Group identical words together: each element becomes
# (word, [word, word, ...]) with one list entry per occurrence.
grouped_words = all_words.groupby(lambda x:x)
grouped_words.take(1)
(('the',
  ['the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   ...]),)
# Turn each (word, occurrences) group into (word, number of occurrences).
word_count = grouped_words.map(lambda x: (x[0], len(x[1])))
word_count.take(10)
(('the', 15379),
 ('project', 89),
 ('gutenberg', 88),
 ('ebook', 14),
 ('of', 7410),
 ('brothers', 82),
 ('karamazov', 170),
 ('by', 1165),
 ('fyodor', 303),
 ('dostoyevsky', 3))
# Re-inspect the (word, 1) pairs that feed the foldby-based count below.
change_to_key_value.take(10)
(('the', 1),
 ('project', 1),
 ('gutenberg', 1),
 ('ebook', 1),
 ('of', 1),
 ('the', 1),
 ('brothers', 1),
 ('karamazov', 1),
 ('by', 1),
 ('fyodor', 1))
def add_bin_op(count, x):
    """Fold one ``(word, n)`` pair into the running count for that word.

    The fold needs a starting value for ``count`` to be supplied by the
    caller.

    Args:
        count: Running total accumulated so far.
        x: A pair whose second item is the amount to add.

    Returns:
        The running total increased by ``x[1]``.
    """
    increment = x[1]
    return count + increment

def add_combine_op(x, y):
    """Merge two partial counts of the same word into one total.

    Each argument is the result of one or more ``add_bin_op`` folds.

    Args:
        x: First partial count.
        y: Second partial count.

    Returns:
        The sum of the two partial counts.
    """
    combined = x + y
    return combined

# Count words with foldby: key each (word, 1) pair by its word, accumulate
# per-key counts with add_bin_op starting from 0, and merge the partial
# counts with add_combine_op.
word_count = change_to_key_value.foldby(lambda x: x[0],
                                       add_bin_op, 0,
                                       add_combine_op)
word_count.take(10)
(('the', 15379),
 ('project', 89),
 ('gutenberg', 88),
 ('ebook', 14),
 ('of', 7410),
 ('brothers', 82),
 ('karamazov', 170),
 ('by', 1165),
 ('fyodor', 303),
 ('dostoyevsky', 3))
# The bag's own frequencies() method yields (word, count) pairs directly.
much_easier = all_words.frequencies()
much_easier.take(10)
(('the', 15379),
 ('project', 89),
 ('gutenberg', 88),
 ('ebook', 14),
 ('of', 7410),
 ('brothers', 82),
 ('karamazov', 170),
 ('by', 1165),
 ('fyodor', 303),
 ('dostoyevsky', 3))

Removing stop words from the top word-frequency counts

# spaCy's English stop-word collection.
from spacy.lang.en import STOP_WORDS
# Drop every word that appears in STOP_WORDS, then recount the rest.
without_stopwords = all_words.filter(lambda x: x not in STOP_WORDS)
new_freq = without_stopwords.frequencies()
new_freq.take(20)
(('project', 89),
 ('gutenberg', 88),
 ('ebook', 14),
 ('brothers', 82),
 ('karamazov', 170),
 ('fyodor', 303),
 ('dostoyevsky', 3),
 ('use', 77),
 ('united', 24),
 ('states', 21),
 ('parts', 19),
 ('world', 182),
 ('cost', 12),
 ('restrictions', 2),
 ('whatsoever', 5),
 ('copy', 16),
 ('away', 445),
 ('reuse', 2),
 ('terms', 33),
 ('license', 14))
# NOTE(review): no key is given here and the result is a lazy bag (nothing is
# computed). Presumably the (word, count) tuples would be ranked by their
# natural tuple order rather than by count; confirm against dask's topk docs.
new_freq.topk(10)
dask.bag<topk-aggregate, npartitions=1>
# Ten most frequent non-stop-words: rank by the count (second tuple item) and
# compute the result.
new_freq.topk(10, key=lambda x: x[1]).compute()
[('alyosha', 1176),
 ('said', 993),
 ('know', 843),
 ('man', 842),
 ('mitya', 814),
 ('dont', 784),
 ('come', 772),
 ('father', 721),
 ('ivan', 677),
 ('time', 669)]