NLP n-grams with Python
"In the fields of computational linguistics and probability, an n-gram is a contiguous sequence of n items from a given sample of text or speech. The items can be phonemes, syllables, letters, words or base pairs according to the application. The n-grams typically are collected from a text or speech corpus." (Source: Wikipedia)
import pandas as pd
# Load the processed dataset; its ``abstract`` column supplies the text
# analysed by the statements that follow.
df = pd.read_csv('../../processed_data/nf_complete.csv')
def preprocessor(text):
    """Strip markup and punctuation from *text* while keeping emoticons.

    HTML-like tags are removed, the text is lower-cased, and every run of
    non-word characters is collapsed to a single space. Emoticons found in
    the tag-free text (for example ``:-)``, ``;D`` or ``=P``) are appended
    to the end, space-separated, with any ``-`` removed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string followed by the collected emoticons.
    """
    # Raw strings keep regex escapes such as \W and \) from being parsed
    # as (invalid) Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    # Collect emoticons now, before the punctuation they are made of is
    # replaced by spaces.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub(r'[\W]+', ' ', text.lower())
            + ' '.join(emoticons).replace('-', ''))
    return text
# Concatenate every abstract into one corpus string. Missing abstracts are
# dropped and the rest coerced to str, because str.join() accepts only
# strings.
text = " ".join(df.abstract.dropna().astype(str))
# Count whitespace-separated words; len(text) alone would report the number
# of characters, which contradicts the message.
print("There are {} words in the combination of all abstracts.".format(len(text.split())))
from urllib.request import urlopen
from random import randint
def wordListSum(wordList):
    """Return the total of all counts stored in *wordList*.

    Args:
        wordList: Mapping of word -> numeric count.

    Returns:
        The sum of the mapping's values (0 for an empty mapping).
    """
    # Delegate to the builtin rather than a local accumulator named
    # ``sum``, which would shadow it.
    return sum(wordList.values())
def retrieveRandomWord(wordList):
    """Pick a random key from *wordList*, weighted by its count.

    Args:
        wordList: Mapping of word -> non-negative integer count.

    Returns:
        One of the keys. A position between 1 and the total count is drawn
        uniformly, so a key with count ``c`` is chosen with probability
        ``c / total``.

    Raises:
        ValueError: If the counts add up to less than 1 (for example when
            *wordList* is empty), because there is nothing to pick.
    """
    total = sum(wordList.values())
    if total < 1:
        # randint(1, 0) would raise an opaque "empty range" ValueError;
        # fail with a message that names the real problem instead.
        raise ValueError('wordList must contain at least one positive count')
    # Walk the counts, subtracting each from the drawn position; the key
    # that brings the remainder to zero or below is the selected one.
    randIndex = randint(1, total)
    for word, value in wordList.items():
        randIndex -= value
        if randIndex <= 0:
            return word
def buildWordDict(text):
    """Build a first-order Markov transition table from *text*.

    Double quotes are removed, and the punctuation marks ``, . ; :`` are
    treated as words of their own so they take part in the chain.

    Args:
        text: The corpus as a single string.

    Returns:
        A dict mapping each word to a dict of ``{next_word: count}``, where
        ``count`` is how many times ``next_word`` directly followed it.
        A word that only occurs at the very end of the text has no entry.
    """
    text = text.replace('"', '')
    # Pad each punctuation mark with spaces so it is split off as its own
    # token below.
    text = text.translate(
        {ord(symbol): ' {} '.format(symbol) for symbol in ',.;:'}
    )
    # split() with no argument breaks on any run of whitespace (spaces,
    # newlines, tabs, carriage returns) and never yields empty strings, so
    # stray tabs or "\r" no longer stay glued to words.
    words = text.split()
    wordDict = {}
    for previous, current in zip(words, words[1:]):
        followers = wordDict.setdefault(previous, {})
        followers[current] = followers.get(current, 0) + 1
    return wordDict
wordDict = buildWordDict(text)

# Generate a Markov chain of up to 100 transitions, starting from 'Vietnam'
length = 100
chain = ['Vietnam']
for _ in range(length):
    # A word that never has a successor in the corpus (the seed may be
    # absent, or the walk may reach the final word of the text) has no
    # entry in wordDict; stop there instead of raising KeyError.
    successors = wordDict.get(chain[-1])
    if not successors:
        break
    newWord = retrieveRandomWord(successors)
    chain.append(newWord)
print(' '.join(chain))
def getFirstSentenceContaining(ngram, text):
    """Return the first sentence of *text* that contains *ngram*.

    The text is upper-cased and split into sentences on ``". "``; matching
    is a case-insensitive substring test.

    Args:
        ngram: The phrase to look for.
        text: The text to search.

    Returns:
        The first matching sentence, upper-cased and followed by a newline,
        or an empty string when no sentence matches.
    """
    # The sentences are upper-cased, so the phrase must be as well;
    # otherwise any lower-case letter in it would rule out every match.
    needle = ngram.upper()
    for sentence in text.upper().split(". "):
        if needle in sentence:
            return sentence + '\n'
    return ""
# Print the first sentence of the corpus that contains 'I'.
# NOTE(review): the lookup is a plain substring test, so this matches any
# sentence containing the letter "I", not only the pronoun -- confirm that
# this is intended.
print(getFirstSentenceContaining('I', text))
#text
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import Counter
def cleanSentence(sentence):
    """Split *sentence* on single spaces and return its cleaned words.

    Each piece has leading and trailing punctuation and whitespace stripped.
    Pieces of length 0 or 1 are dropped, except the one-letter words "a"
    and "i" (in either case).

    Args:
        sentence: The sentence as a string.

    Returns:
        The list of surviving words, in their original order.
    """
    strip_chars = string.punctuation + string.whitespace
    cleaned = []
    for token in sentence.split(' '):
        word = token.strip(strip_chars)
        # Keep multi-character words, plus the real one-letter words.
        if len(word) > 1 or word.lower() in ('a', 'i'):
            cleaned.append(word)
    return cleaned
def cleanInput(content):
    """Normalise *content* and break it into lists of cleaned words.

    The text is upper-cased, newlines become spaces, non-ASCII characters
    are discarded, and the result is split into sentences on ``'. '``.

    Args:
        content: The text to clean, as a string.

    Returns:
        A list with one entry per sentence; each entry is the list of words
        returned by ``cleanSentence`` for that sentence.
    """
    upper_text = content.upper().replace('\n', ' ')
    # Encode to UTF-8 and decode as ASCII while ignoring undecodable bytes:
    # this drops every non-ASCII character.
    ascii_text = bytes(upper_text, 'UTF-8').decode('ascii', 'ignore')
    cleaned_sentences = []
    for sentence in ascii_text.split('. '):
        cleaned_sentences.append(cleanSentence(sentence))
    return cleaned_sentences
def getNgramsFromSentence(content, n):
    """Return every run of *n* consecutive items of *content*, in order.

    Args:
        content: A sequence (here, a list of words).
        n: The length of each n-gram.

    Returns:
        A list of slices of *content*, each of length *n*; empty when
        *content* holds fewer than *n* items.
    """
    window_count = len(content) - n + 1
    return [content[start:start + n] for start in range(window_count)]
def getNgrams(content, n):
    """Count the n-grams that occur in *content*.

    The text is cleaned and split into sentences by ``cleanInput``; n-grams
    are taken per sentence via ``getNgramsFromSentence``, so none spans a
    sentence boundary.

    Args:
        content: The text to analyse, as a string.
        n: The number of words per n-gram.

    Returns:
        A ``Counter`` mapping each n-gram (its words joined by single
        spaces) to the number of times it occurs.
    """
    ngrams = Counter()
    for sentence in cleanInput(content):
        # Feed the counter directly; the former side list of n-grams was
        # filled but never read.
        ngrams.update(
            ' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n)
        )
    return ngrams
# Count the 3-grams of the combined text. str() is kept as a safeguard; it
# returns the value unchanged when text is already a string.
content = str(text)
ngrams = getNgrams(content, 3)
#print(ngrams)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import Counter
# Built once at import time instead of on every call; a frozenset gives
# constant-time membership tests. Words are upper-case because the text is
# upper-cased before n-grams are extracted.
_COMMON_WORDS = frozenset([
    'THE', 'BE', 'AND', 'OF', 'A', 'IN', 'TO', 'HAVE', 'IT', 'I', 'THAT',
    'FOR', 'YOU', 'HE', 'WITH', 'ON', 'DO', 'SAY', 'THIS', 'THEY', 'IS',
    'AN', 'AT', 'BUT', 'WE', 'HIS', 'FROM', 'NOT', 'BY', 'SHE', 'OR', 'AS',
    'WHAT', 'GO', 'THEIR', 'CAN', 'WHO', 'GET', 'IF', 'WOULD', 'HER', 'ALL',
    'MY', 'MAKE', 'ABOUT', 'KNOW', 'WILL', 'UP', 'ONE', 'TIME', 'HAS',
    'BEEN', 'THERE', 'YEAR', 'SO', 'THINK', 'WHEN', 'WHICH', 'THEM', 'SOME',
    'ME', 'PEOPLE', 'TAKE', 'OUT', 'INTO', 'JUST', 'SEE', 'HIM', 'YOUR',
    'COME', 'COULD', 'NOW', 'THAN', 'LIKE', 'OTHER', 'HOW', 'THEN', 'ITS',
    'OUR', 'TWO', 'MORE', 'THESE', 'WANT', 'WAY', 'LOOK', 'FIRST', 'ALSO',
    'NEW', 'BECAUSE', 'DAY', 'USE', 'NO', 'MAN', 'FIND', 'HERE', 'THING',
    'GIVE', 'MANY', 'WELL',
])


def isCommon(ngram):
    """Tell whether *ngram* contains at least one common English word.

    The comparison is case-sensitive: only upper-case words match.

    Args:
        ngram: An iterable of words (strings).

    Returns:
        True if any word of *ngram* is in the common-word set, else False.
    """
    return any(word in _COMMON_WORDS for word in ngram)
def getNgramsFromSentence(content, n):
    """Return the n-grams of *content* that contain no common word.

    Args:
        content: A sequence of words.
        n: The length of each n-gram.

    Returns:
        The slices of *content* of length *n*, in order, leaving out every
        slice for which ``isCommon`` returns a true value.
    """
    candidates = (
        content[start:start + n] for start in range(len(content) - n + 1)
    )
    return [window for window in candidates if not isCommon(window)]
# Recount the 3-grams: getNgramsFromSentence has just been redefined to
# skip n-grams that contain a common word, and getNgrams picks up the new
# definition when it is called.
ngrams = getNgrams(content, 3)
#print(ngrams)
def getFirstSentenceContaining(ngram, content):
    """Return the first sentence of *content* that contains *ngram*.

    The text is upper-cased and split into sentences on ``". "``; matching
    is a case-insensitive substring test.

    Args:
        ngram: The phrase to look for.
        content: The text to search.

    Returns:
        The first matching sentence, upper-cased and followed by a newline,
        or an empty string when no sentence matches.
    """
    # The sentences are upper-cased, so the phrase must be as well;
    # otherwise any lower-case letter in it would rule out every match.
    needle = ngram.upper()
    for sentence in content.upper().split(". "):
        if needle in sentence:
            return sentence + '\n'
    return ""
# For each n-gram of interest, print the first sentence that mentions it.
for ngram_of_interest in (
    'SINO-JAPANESE WAR 1894-1895',
    '2ND VIETNAM WAR',
    'COLD WAR ARMY',
    'WORLD WAR II',
    'ARMS CONTROL AGREEMENTS',
):
    print(getFirstSentenceContaining(ngram_of_interest, content))
from urllib.request import urlopen
from random import randint
def wordListSum(wordList):
    """Return the total of all counts stored in *wordList*.

    Args:
        wordList: Mapping of word -> numeric count.

    Returns:
        The sum of the mapping's values (0 for an empty mapping).
    """
    # Delegate to the builtin rather than a local accumulator named
    # ``sum``, which would shadow it.
    return sum(wordList.values())
def retrieveRandomWord(wordList):
    """Pick a random key from *wordList*, weighted by its count.

    Args:
        wordList: Mapping of word -> non-negative integer count.

    Returns:
        One of the keys. A position between 1 and the total count is drawn
        uniformly, so a key with count ``c`` is chosen with probability
        ``c / total``.

    Raises:
        ValueError: If the counts add up to less than 1 (for example when
            *wordList* is empty), because there is nothing to pick.
    """
    total = sum(wordList.values())
    if total < 1:
        # randint(1, 0) would raise an opaque "empty range" ValueError;
        # fail with a message that names the real problem instead.
        raise ValueError('wordList must contain at least one positive count')
    # Walk the counts, subtracting each from the drawn position; the key
    # that brings the remainder to zero or below is the selected one.
    randIndex = randint(1, total)
    for word, value in wordList.items():
        randIndex -= value
        if randIndex <= 0:
            return word
def buildWordDict(text):
    """Build a first-order Markov transition table from *text*.

    Double quotes are removed, and the punctuation marks ``, . ; :`` are
    treated as words of their own so they take part in the chain.

    Args:
        text: The corpus as a single string.

    Returns:
        A dict mapping each word to a dict of ``{next_word: count}``, where
        ``count`` is how many times ``next_word`` directly followed it.
        A word that only occurs at the very end of the text has no entry.
    """
    text = text.replace('"', '')
    # Pad each punctuation mark with spaces so it is split off as its own
    # token below.
    text = text.translate(
        {ord(symbol): ' {} '.format(symbol) for symbol in ',.;:'}
    )
    # split() with no argument breaks on any run of whitespace (spaces,
    # newlines, tabs, carriage returns) and never yields empty strings, so
    # stray tabs or "\r" no longer stay glued to words.
    words = text.split()
    wordDict = {}
    for previous, current in zip(words, words[1:]):
        followers = wordDict.setdefault(previous, {})
        followers[current] = followers.get(current, 0) + 1
    return wordDict
wordDict = buildWordDict(text)

# Generate a Markov chain of up to 100 transitions, starting from 'I'
length = 100
chain = ['I']
for _ in range(length):
    # A word that never has a successor in the corpus (the seed may be
    # absent, or the walk may reach the final word of the text) has no
    # entry in wordDict; stop there instead of raising KeyError.
    successors = wordDict.get(chain[-1])
    if not successors:
        break
    newWord = retrieveRandomWord(successors)
    chain.append(newWord)
#print(' '.join(chain))
import re
def getNgrams(content, n):
    """Return the word n-grams of *content* as a list of word lists.

    Newlines and bracketed numbers such as ``[12]`` are replaced by spaces,
    non-ASCII characters are dropped, and the text is split on spaces.

    Args:
        content: The text to analyse, as a string.
        n: The number of words per n-gram.

    Returns:
        A list of lists, each holding *n* consecutive words; empty when the
        text has fewer than *n* words.
    """
    # The earlier pattern '[[\d+\]]' was a single character class, so it
    # blanked out every digit, '+', '[' and ']' individually (and triggers
    # a "possible nested set" FutureWarning). Presumably the intent was to
    # remove bracketed citation markers only, which is what this does.
    content = re.sub(r'\n|\[\d+\]', ' ', content)
    # Encode to UTF-8, then decode as ASCII ignoring errors, to drop every
    # non-ASCII character.
    content = bytes(content, 'UTF-8').decode('ascii', 'ignore')
    words = [word for word in content.split(' ') if word != '']
    return [words[i:i + n] for i in range(len(words) - n + 1)]
from collections import Counter
def getNgrams(content, n):
    """Count the n-grams that occur in *content*.

    The text is cleaned and split into sentences by ``cleanInput``; n-grams
    are taken per sentence via ``getNgramsFromSentence``, so none spans a
    sentence boundary.

    Args:
        content: The text to analyse, as a string.
        n: The number of words per n-gram.

    Returns:
        A ``Counter`` mapping each n-gram (its words joined by single
        spaces) to the number of times it occurs.
    """
    ngrams = Counter()
    for sentence in cleanInput(content):
        # Feed the counter directly; the former side list of n-grams was
        # filled but never read.
        ngrams.update(
            ' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n)
        )
    return ngrams
#print(getNgrams(content, 2))
# Built once at import time instead of on every call; a frozenset gives
# constant-time membership tests. Words are upper-case because the text is
# upper-cased before n-grams are extracted.
_COMMON_WORDS = frozenset([
    'THE', 'BE', 'AND', 'OF', 'A', 'IN', 'TO', 'HAVE', 'IT', 'I', 'THAT',
    'FOR', 'YOU', 'HE', 'WITH', 'ON', 'DO', 'SAY', 'THIS', 'THEY', 'IS',
    'AN', 'AT', 'BUT', 'WE', 'HIS', 'FROM', 'NOT', 'BY', 'SHE', 'OR', 'AS',
    'WHAT', 'GO', 'THEIR', 'CAN', 'WHO', 'GET', 'IF', 'WOULD', 'HER', 'ALL',
    'MY', 'MAKE', 'ABOUT', 'KNOW', 'WILL', 'UP', 'ONE', 'TIME', 'HAS',
    'BEEN', 'THERE', 'YEAR', 'SO', 'THINK', 'WHEN', 'WHICH', 'THEM', 'SOME',
    'ME', 'PEOPLE', 'TAKE', 'OUT', 'INTO', 'JUST', 'SEE', 'HIM', 'YOUR',
    'COME', 'COULD', 'NOW', 'THAN', 'LIKE', 'OTHER', 'HOW', 'THEN', 'ITS',
    'OUR', 'TWO', 'MORE', 'THESE', 'WANT', 'WAY', 'LOOK', 'FIRST', 'ALSO',
    'NEW', 'BECAUSE', 'DAY', 'USE', 'NO', 'MAN', 'FIND', 'HERE', 'THING',
    'GIVE', 'MANY', 'WELL',
])


def isCommon(ngram):
    """Tell whether *ngram* contains at least one common English word.

    The comparison is case-sensitive: only upper-case words match.

    Args:
        ngram: An iterable of words (strings).

    Returns:
        True if any word of *ngram* is in the common-word set, else False.
    """
    return any(word in _COMMON_WORDS for word in ngram)
def getNgramsFromSentence(text, n):
    """Return the n-grams of *text* that contain no common word.

    Args:
        text: A sequence of words.
        n: The length of each n-gram.

    Returns:
        The slices of *text* of length *n*, in order, leaving out every
        slice for which ``isCommon`` returns a true value.
    """
    candidates = (
        text[start:start + n] for start in range(len(text) - n + 1)
    )
    return [window for window in candidates if not isCommon(window)]
# Count the 3-grams of the corpus text with the definitions above, which
# skip any n-gram that contains a common word.
ngrams = getNgrams(text, 3)
#print(ngrams)