Text Mining Basics

The document covers text mining basics: extracting text from files, the web, and spreadsheets; tokenization and stopword removal; stemming and lemmatization; and working with WordNet and NLTK corpora. Python code examples demonstrate each technique.


text-mining-basics

December 31, 2022

1 Text Mining
[1]: import pandas
import nltk
from nltk.corpus import stopwords

[4]: print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
"you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he',
'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is',
'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
"couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
"hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
"shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
"wouldn't"]
# Text Extraction from Resources

[5]: path = r"C:\Users\Asus\Desktop\the idiot.txt"

[6]: with open(path, 'r') as myfile:
         text = myfile.read()

[7]: import requests as rq

     # requests.get() needs a full URL; a bare filename like the one below
     # raises MissingSchema unless the host serving the file is prepended
     text = rq.get("the_matrix_synopsis.txt").text

[8]: import urllib.request as ur

     scraped_web = ur.urlopen('https://faculty.elgin.edu/jputz/SampleTextPage.html')
     news = scraped_web.read()

[9]: import pandas as pd


comments = pd.read_excel("Financial Sample.xlsx")

[10]: print(comments.head(2))

      Segment  Country   Product Discount Band  Units Sold  \
0  Government   Canada  Carretera         None      1618.5
1  Government  Germany  Carretera         None      1321.0

   Manufacturing Price  Sale Price  Gross Sales  Discounts    Sales     COGS  \
0                    3          20      32370.0        0.0  32370.0  16185.0
1                    3          20      26420.0        0.0  26420.0  13210.0

    Profit       Date  Month Number Month Name  Year
0  16185.0 2014-01-01             1    January  2014
1  13210.0 2014-01-01             1    January  2014


# Tokenization and Dropping Stopwords

[1]: from nltk.corpus import stopwords
     from nltk.tokenize import word_tokenize, sent_tokenize

[2]: path = "the idiot.txt"

     with open(path, 'r') as myfile:
         text = myfile.read()

[8]: en_stops = stopwords.words('english')

     print(en_stops)
     print('=' * 127)
     print(len(en_stops))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
"you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he',

2
'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is',
'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
"couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
"hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
"shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
"wouldn't"]

===============================================================================================================================

179

[14]: data_tokenize = word_tokenize(text)
      # print(data_tokenize)

[10]: filtered_words = []
      for word in data_tokenize:
          if word not in en_stops:
              filtered_words.append(word)
      print(filtered_words)

['The', 'Project', 'Gutenberg', 'eBook', 'The', 'Idiot', ',', 'Fyodor',
'Dostoyevsky', 'This', 'eBook', 'use', 'anyone', 'anywhere', 'United', 'States',
'parts', 'world', 'cost', 'almost', 'restrictions', 'whatsoever', '.', 'You',
'may', 'copy', ',', 'give', 'away', 're-use', 'terms', 'Project', 'Gutenberg',
'License', 'included', 'eBook', 'online', 'www.gutenberg.org', '.', 'If',
'located', 'United', 'States', ',', 'check', 'laws', 'country', 'located',
'using', 'eBook', '.', 'Title', ':', 'The', 'Idiot', 'Author', ':', 'Fyodor',
'Dostoyevsky', 'Translator', ':', 'Eva', 'Martin', 'Release', 'Date', ':',
'May', ',', '2001', '[', 'eBook', '#', '2638', ']', '[', 'Most', 'recently',
'updated', ':', 'June', '21', ',', '2021', ']', 'Language', ':', 'English',
'Character', 'set', 'encoding', ':', 'UTF-8', 'Produced', ':', 'Martin',
'Adamson', ',', 'David', 'Widger', ',', 'corrections', 'Andrew', 'Sly', '*',
'*', '*', 'START', 'OF', 'THE', 'PROJECT', 'GUTENBERG', 'EBOOK', 'THE', 'IDIOT',
'*', '*', '*', 'The', 'Idiot', 'Towards', 'end', 'November', ',', 'thaw', ',',
'nine', '’', 'clock', 'one', 'morning', ',', 'train', 'Warsaw', 'Petersburg',
'railway', 'approaching', 'latter', 'city', 'full', 'speed', '.', 'The',
'morning', 'damp', 'misty', 'great', 'difficulty', 'day', 'succeeded',
'breaking', ';', 'impossible', 'distinguish', 'anything', 'yards', 'away',
'carriage', 'windows', '.', 'The', 'wearer', 'cloak', 'young', 'fellow', ',',
'also', 'twenty-six', 'twenty-seven', 'years', 'age', ',', 'slightly', 'middle',
'height', ',', 'fair', ',', 'thin', ',', 'pointed', 'light', 'coloured',
'beard', ';', 'eyes', 'large', 'blue', ',', 'intent', 'look', ',', 'yet',
'heavy', 'expression', 'people', 'affirm', 'peculiarity', 'well', 'evidence',
',', 'epileptic', 'subject', '.', 'His', 'face', 'decidedly', 'pleasant', 'one',
';', 'refined', ',', 'quite', 'colourless', ',', 'except', 'circumstance',
'moment', 'blue', 'cold', '.', 'He', 'held', 'bundle', 'made', 'old', 'faded',
'silk', 'handkerchief', 'apparently', 'contained', 'travelling', 'wardrobe',
',', 'wore', 'thick', 'shoes', 'gaiters', ',', 'whole', 'appearance', 'un-
Russian', '.', '“', 'Wheugh', '!', 'goodness', '!', '”', 'The', 'black-haired',
'young', 'fellow', 'whistled', ',', 'laughed', '.', 'Replying', ',', 'made',
'known', 'inquirer', 'certainly', 'long', 'absent', 'Russia', ',', 'four',
'years', ';', 'sent', 'abroad', 'health', ';', 'suffered', 'strange', 'nervous',
'malady—a', 'kind', 'epilepsy', ',', 'convulsive', 'spasms', '.', 'His',
'interlocutor', 'burst', 'laughing', 'several', 'times', 'answers', ';', 'ever',
',', 'question', ',', '“', 'whether', 'cured', '?', '”', 'patient', 'replied',
':', '*', '*', '*', 'END', 'OF', 'THE', 'PROJECT', 'GUTENBERG', 'EBOOK', 'THE',
'IDIOT', '*', '*', '*', 'He', 'seemed', 'suspicion', 'impertinence',
'inappropriateness', 'fact', 'questions', 'put', '.', 'Replying', ',', 'made',
'known', 'inquirer', 'certainly', 'long', 'absent', 'Russia', ',', 'mo']

[15]: filtered_words = " "
      for word in data_tokenize:
          if word not in en_stops:
              filtered_words = filtered_words + word + " "
      # joining a list with " ".join() is the more idiomatic way to build this
      # print(filtered_words)

[12]: print(len(filtered_words))

1978

[16]: # print(sent_tokenize(text))
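sent_tokenize, imported alongside word_tokenize above, splits text into sentences rather than words. A minimal sketch on an illustrative two-sentence string (the sample text is not from the novel):

      sample = "Prince Myshkin returns to Russia. He meets Rogozhin on the train."
      print(sent_tokenize(sample))
      # expected: ['Prince Myshkin returns to Russia.', 'He meets Rogozhin on the train.']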

# Corpus & WordNet

[19]: from nltk.corpus import gutenberg as gt

      print(gt.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt',
'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt',
'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt',
'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt',
'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt',
'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

[20]: print(gt.words('austen-emma.txt'))

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', …]
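The heading above also names WordNet, though no WordNet call appears in the notebook; a minimal lookup sketch, assuming the wordnet corpus has been fetched with nltk.download('wordnet'):

      from nltk.corpus import wordnet as wn

      # synsets() lists the synonym sets a word belongs to
      print(wn.synsets('program'))

      # each synset carries a definition (gloss) and its member lemmas
      first = wn.synsets('program')[0]
      print(first.definition())
      print(first.lemma_names())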


# Stemming & Lemmatization

[21]: from nltk.stem import PorterStemmer
      from nltk.tokenize import word_tokenize

      stemmer = PorterStemmer()
      text = "programmers program with a programming algorithm languages"
      text1 = "there are several books"

[22]: text1 = word_tokenize(text1)

      for word in text1:
          print(stemmer.stem(word))

there
are
sever
book
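The text variable defined above is never actually stemmed; running the same loop over it shows how the Porter stemmer strips suffixes by rule, without consulting a dictionary (a sketch reusing the stemmer and tokenizer from the cell above):

      for word in word_tokenize(text):
          print(stemmer.stem(word))
      # words such as "programmers" and "languages" come back as the
      # non-word stems "programm" and "languag"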

[23]: # Lemmatization
      text = "programmers program with a programming algorithm languages"
      text1 = "there are several books"

      from nltk.stem import WordNetLemmatizer
      from nltk.tokenize import word_tokenize

      lemmi = WordNetLemmatizer()

[24]: text1 = word_tokenize(text1)

      for word in text1:
          print(lemmi.lemmatize(word))

there
are
several
book
# POS Tagging
• POS tagging prior to lemmatization increases accuracy: WordNetLemmatizer treats every word as a noun unless given a part of speech, which is why "are" passed through unchanged above (see the sketch below)
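A minimal sketch of tag-then-lemmatize, mapping Penn Treebank tags onto the WordNet POS constants before calling lemmatize (the helper penn_to_wordnet is illustrative, not part of NLTK):

      from nltk import pos_tag
      from nltk.corpus import wordnet
      from nltk.stem import WordNetLemmatizer
      from nltk.tokenize import word_tokenize

      def penn_to_wordnet(tag):
          # Penn tags starting with J/V/R mark adjectives/verbs/adverbs;
          # everything else falls back to noun, the lemmatizer's default
          if tag.startswith('J'):
              return wordnet.ADJ
          if tag.startswith('V'):
              return wordnet.VERB
          if tag.startswith('R'):
              return wordnet.ADV
          return wordnet.NOUN

      lemmi = WordNetLemmatizer()
      for word, tag in pos_tag(word_tokenize("there are several books")):
          print(lemmi.lemmatize(word, pos=penn_to_wordnet(tag)))

With the tag supplied, "are" lemmatizes to "be" instead of passing through unchanged.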

[26]: import nltk
      from nltk.tokenize import word_tokenize, sent_tokenize

      path = r"C:\Users\Asus\Desktop\Data Science\Other\2- Text Mining-Rabti\the idiot.txt"

      with open(path, 'r') as myfile:
          text = myfile.read()
      text = word_tokenize(text)

[27]: print(nltk.pos_tag(text))

[('The', 'DT'), ('Project', 'NNP'), ('Gutenberg', 'NNP'), ('eBook', 'NN'),
('of', 'IN'), ('The', 'DT'), ('Idiot', 'NNP'), (',', ','), ('by', 'IN'),
('Fyodor', 'NNP'), ('Dostoyevsky', 'NNP'), ('This', 'DT'), ('eBook', 'NN'),
('is', 'VBZ'), ('for', 'IN'), ('the', 'DT'), ('use', 'NN'), ('of', 'IN'),
('anyone', 'NN'), ('anywhere', 'RB'), ('in', 'IN'), ('the', 'DT'), ('United',
'NNP'), ('States', 'NNPS'), ('and', 'CC'), ('most', 'JJS'), ('other', 'JJ'),
('parts', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('world', 'NN'), ('at', 'IN'),
('no', 'DT'), ('cost', 'NN'), ('and', 'CC'), ('with', 'IN'), ('almost', 'RB'),
('no', 'DT'), ('restrictions', 'NNS'), ('whatsoever', 'RB'), ('.', '.'), ('You',
'PRP'), ('may', 'MD'), ('copy', 'VB'), ('it', 'PRP'), (',', ','), ('give',
'VB'), ('it', 'PRP'), ('away', 'RB'), ('or', 'CC'), ('re-use', 'VB'), ('it',
'PRP'), ('under', 'IN'), ('the', 'DT'), ('terms', 'NNS'), ('of', 'IN'), ('the',
'DT'), ('Project', 'NNP'), ('Gutenberg', 'NNP'), ('License', 'NNP'),
('included', 'VBD'), ('with', 'IN'), ('this', 'DT'), ('eBook', 'NN'), ('or',
'CC'), ('online', 'NN'), ('at', 'IN'), ('www.gutenberg.org', 'NN'), ('.', '.'),
('If', 'IN'), ('you', 'PRP'), ('are', 'VBP'), ('not', 'RB'), ('located', 'VBN'),
('in', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), (',', ','),
('you', 'PRP'), ('will', 'MD'), ('have', 'VB'), ('to', 'TO'), ('check', 'VB'),
('the', 'DT'), ('laws', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('country', 'NN'),
('where', 'WRB'), ('you', 'PRP'), ('are', 'VBP'), ('located', 'VBN'), ('before',
'IN'), ('using', 'VBG'), ('this', 'DT'), ('eBook', 'NN'), ('.', '.'), ('Title',
'NN'), (':', ':'), ('The', 'DT'), ('Idiot', 'NNP'), ('Author', 'NNP'), (':',
':'), ('Fyodor', 'NNP'), ('Dostoyevsky', 'NNP'), ('Translator', 'NNP'), (':',
':'), ('Eva', 'NNP'), ('Martin', 'NNP'), ('Release', 'NNP'), ('Date', 'NNP'),
(':', ':'), ('May', 'NNP'), (',', ','), ('2001', 'CD'), ('[', 'NNP'), ('eBook',
'NN'), ('#', '#'), ('2638', 'CD'), (']', 'NNP'), ('[', 'NNP'), ('Most', 'NNP'),
('recently', 'RB'), ('updated', 'VBD'), (':', ':'), ('June', 'NNP'), ('21',
'CD'), (',', ','), ('2021', 'CD'), (']', 'JJ'), ('Language', 'NNP'), (':', ':'),
('English', 'JJ'), ('Character', 'NNP'), ('set', 'VBD'), ('encoding', 'VBG'),
(':', ':'), ('UTF-8', 'NN'), ('Produced', 'VBN'), ('by', 'IN'), (':', ':'),
('Martin', 'NNP'), ('Adamson', 'NNP'), (',', ','), ('David', 'NNP'), ('Widger',
'NNP'), (',', ','), ('with', 'IN'), ('corrections', 'NNS'), ('by', 'IN'),
('Andrew', 'NNP'), ('Sly', 'NNP'), ('*', 'NNP'), ('*', 'NNP'), ('*', 'NNP'),
('START', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('PROJECT', 'NNP'),
('GUTENBERG', 'NNP'), ('EBOOK', 'NNP'), ('THE', 'NNP'), ('IDIOT', 'NNP'), ('*',
'NNP'), ('*', 'NNP'), ('*', 'VBD'), ('The', 'DT'), ('Idiot', 'NNP'), ('Towards',
'NNP'), ('the', 'DT'), ('end', 'NN'), ('of', 'IN'), ('November', 'NNP'), (',',
','), ('during', 'IN'), ('a', 'DT'), ('thaw', 'NN'), (',', ','), ('at', 'IN'),
('nine', 'CD'), ('o', 'JJ'), ('’', 'FW'), ('clock', 'NN'), ('one', 'CD'),
('morning', 'NN'), (',', ','), ('a', 'DT'), ('train', 'NN'), ('on', 'IN'),
('the', 'DT'), ('Warsaw', 'NNP'), ('and', 'CC'), ('Petersburg', 'NNP'),
('railway', 'NN'), ('was', 'VBD'), ('approaching', 'VBG'), ('the', 'DT'),
('latter', 'JJ'), ('city', 'NN'), ('at', 'IN'), ('full', 'JJ'), ('speed', 'NN'),
('.', '.'), ('The', 'DT'), ('morning', 'NN'), ('was', 'VBD'), ('so', 'RB'),
('damp', 'JJ'), ('and', 'CC'), ('misty', 'VBZ'), ('that', 'IN'), ('it', 'PRP'),
('was', 'VBD'), ('only', 'RB'), ('with', 'IN'), ('great', 'JJ'), ('difficulty',
'NN'), ('that', 'IN'), ('the', 'DT'), ('day', 'NN'), ('succeeded', 'VBD'),
('in', 'IN'), ('breaking', 'NN'), (';', ':'), ('and', 'CC'), ('it', 'PRP'),
('was', 'VBD'), ('impossible', 'JJ'), ('to', 'TO'), ('distinguish', 'VB'),
('anything', 'NN'), ('more', 'JJR'), ('than', 'IN'), ('a', 'DT'), ('few', 'JJ'),
('yards', 'NNS'), ('away', 'RB'), ('from', 'IN'), ('the', 'DT'), ('carriage',
'NN'), ('windows', 'VBZ'), ('.', '.'), ('The', 'DT'), ('wearer', 'NN'), ('of',
'IN'), ('this', 'DT'), ('cloak', 'NN'), ('was', 'VBD'), ('a', 'DT'), ('young',
'JJ'), ('fellow', 'NN'), (',', ','), ('also', 'RB'), ('of', 'IN'), ('about',
'IN'), ('twenty-six', 'JJ'), ('or', 'CC'), ('twenty-seven', 'JJ'), ('years',
'NNS'), ('of', 'IN'), ('age', 'NN'), (',', ','), ('slightly', 'RB'), ('above',
'IN'), ('the', 'DT'), ('middle', 'JJ'), ('height', 'NN'), (',', ','), ('very',
'RB'), ('fair', 'JJ'), (',', ','), ('with', 'IN'), ('a', 'DT'), ('thin', 'JJ'),
(',', ','), ('pointed', 'JJ'), ('and', 'CC'), ('very', 'RB'), ('light', 'JJ'),
('coloured', 'VBN'), ('beard', 'NN'), (';', ':'), ('his', 'PRP$'), ('eyes',
'NNS'), ('were', 'VBD'), ('large', 'JJ'), ('and', 'CC'), ('blue', 'JJ'), (',',
','), ('and', 'CC'), ('had', 'VBD'), ('an', 'DT'), ('intent', 'JJ'), ('look',
'NN'), ('about', 'IN'), ('them', 'PRP'), (',', ','), ('yet', 'RB'), ('that',
'DT'), ('heavy', 'JJ'), ('expression', 'NN'), ('which', 'WDT'), ('some', 'DT'),
('people', 'NNS'), ('affirm', 'VBP'), ('to', 'TO'), ('be', 'VB'), ('a', 'DT'),
('peculiarity', 'NN'), ('as', 'RB'), ('well', 'RB'), ('as', 'IN'), ('evidence',
'NN'), (',', ','), ('of', 'IN'), ('an', 'DT'), ('epileptic', 'JJ'), ('subject',
'NN'), ('.', '.'), ('His', 'PRP$'), ('face', 'NN'), ('was', 'VBD'),
('decidedly', 'RB'), ('a', 'DT'), ('pleasant', 'JJ'), ('one', 'NN'), ('for',
'IN'), ('all', 'DT'), ('that', 'DT'), (';', ':'), ('refined', 'VBN'), (',',
','), ('but', 'CC'), ('quite', 'RB'), ('colourless', 'JJ'), (',', ','),
('except', 'IN'), ('for', 'IN'), ('the', 'DT'), ('circumstance', 'NN'), ('that',
'WDT'), ('at', 'IN'), ('this', 'DT'), ('moment', 'NN'), ('it', 'PRP'), ('was',
'VBD'), ('blue', 'JJ'), ('with', 'IN'), ('cold', 'JJ'), ('.', '.'), ('He',
'PRP'), ('held', 'VBD'), ('a', 'DT'), ('bundle', 'NN'), ('made', 'VBD'), ('up',
'RB'), ('of', 'IN'), ('an', 'DT'), ('old', 'JJ'), ('faded', 'JJ'), ('silk',
'NN'), ('handkerchief', 'NN'), ('that', 'WDT'), ('apparently', 'RB'),
('contained', 'VBD'), ('all', 'DT'), ('his', 'PRP$'), ('travelling', 'NN'),
('wardrobe', 'NN'), (',', ','), ('and', 'CC'), ('wore', 'VBD'), ('thick', 'NN'),
('shoes', 'NNS'), ('and', 'CC'), ('gaiters', 'NNS'), (',', ','), ('his',
'PRP$'), ('whole', 'JJ'), ('appearance', 'NN'), ('being', 'VBG'), ('very',
'RB'), ('un-Russian', 'JJ'), ('.', '.'), ('“', 'CC'), ('Wheugh', 'IN'), ('!',
'.'), ('my', 'PRP$'), ('goodness', 'NN'), ('!', '.'), ('”', 'VB'), ('The',
'DT'), ('black-haired', 'JJ'), ('young', 'JJ'), ('fellow', 'NN'), ('whistled',
'VBD'), (',', ','), ('and', 'CC'), ('then', 'RB'), ('laughed', 'VBD'), ('.',
'.'), ('Replying', 'VBG'), ('to', 'TO'), ('them', 'PRP'), (',', ','), ('he',
'PRP'), ('made', 'VBD'), ('known', 'VBN'), ('to', 'TO'), ('the', 'DT'),
('inquirer', 'NN'), ('that', 'IN'), ('he', 'PRP'), ('certainly', 'RB'), ('had',
'VBD'), ('been', 'VBN'), ('long', 'RB'), ('absent', 'JJ'), ('from', 'IN'),
('Russia', 'NNP'), (',', ','), ('more', 'JJR'), ('than', 'IN'), ('four', 'CD'),
('years', 'NNS'), (';', ':'), ('that', 'IN'), ('he', 'PRP'), ('had', 'VBD'),
('been', 'VBN'), ('sent', 'VBN'), ('abroad', 'RB'), ('for', 'IN'), ('his',
'PRP$'), ('health', 'NN'), (';', ':'), ('that', 'IN'), ('he', 'PRP'), ('had',
'VBD'), ('suffered', 'VBN'), ('from', 'IN'), ('some', 'DT'), ('strange', 'JJ'),
('nervous', 'JJ'), ('malady—a', 'NN'), ('kind', 'NN'), ('of', 'IN'),
('epilepsy', 'NN'), (',', ','), ('with', 'IN'), ('convulsive', 'JJ'), ('spasms',
'NNS'), ('.', '.'), ('His', 'PRP$'), ('interlocutor', 'NN'), ('burst', 'NN'),
('out', 'IN'), ('laughing', 'VBG'), ('several', 'JJ'), ('times', 'NNS'), ('at',
'IN'), ('his', 'PRP$'), ('answers', 'NNS'), (';', ':'), ('and', 'CC'), ('more',
'RBR'), ('than', 'IN'), ('ever', 'RB'), (',', ','), ('when', 'WRB'), ('to',
'TO'), ('the', 'DT'), ('question', 'NN'), (',', ','), ('“', 'VBZ'), ('whether',
'IN'), ('he', 'PRP'), ('had', 'VBD'), ('been', 'VBN'), ('cured', 'VBN'), ('?',
'.'), ('”', 'VB'), ('the', 'DT'), ('patient', 'NN'), ('replied', 'VBD'), (':',
':'), ('*', 'NN'), ('*', 'VBZ'), ('*', 'JJ'), ('END', 'NN'), ('OF', 'IN'),
('THE', 'NNP'), ('PROJECT', 'NNP'), ('GUTENBERG', 'NNP'), ('EBOOK', 'NNP'),
('THE', 'NNP'), ('IDIOT', 'NNP'), ('*', 'NNP'), ('*', 'NNP'), ('*', 'NNP'),
('He', 'PRP'), ('seemed', 'VBD'), ('to', 'TO'), ('have', 'VB'), ('no', 'DT'),
('suspicion', 'NN'), ('of', 'IN'), ('any', 'DT'), ('impertinence', 'NN'), ('or',
'CC'), ('inappropriateness', 'NN'), ('in', 'IN'), ('the', 'DT'), ('fact', 'NN'),
('of', 'IN'), ('such', 'JJ'), ('questions', 'NNS'), ('being', 'VBG'), ('put',
'VBN'), ('to', 'TO'), ('him', 'PRP'), ('.', '.'), ('Replying', 'VBG'), ('to',
'TO'), ('them', 'PRP'), (',', ','), ('he', 'PRP'), ('made', 'VBD'), ('known',
'VBN'), ('to', 'TO'), ('the', 'DT'), ('inquirer', 'NN'), ('that', 'IN'), ('he',
'PRP'), ('certainly', 'RB'), ('had', 'VBD'), ('been', 'VBN'), ('long', 'RB'),
('absent', 'JJ'), ('from', 'IN'), ('Russia', 'NNP'), (',', ','), ('mo', 'NN')]
# POS Tagging (Named Entity Recognition & Chunking)

[28]: import nltk
      from nltk.tokenize import word_tokenize
      from nltk.tag import pos_tag

      # "find" and "Wedensday" are kept exactly as typed so the tagged
      # output below still matches this input
      news = "European Authorities find Google a record $5.2 billion on Wedensday"

      def preprocess(sent):
          sent = word_tokenize(sent)
          sent = pos_tag(sent)
          return sent

      sent = preprocess(news)
      print(sent)

[('European', 'JJ'), ('Authorities', 'NNP'), ('find', 'VBP'), ('Google', 'NNP'),
('a', 'DT'), ('record', 'NN'), ('$', '$'), ('5.2', 'CD'), ('billion', 'CD'),
('on', 'IN'), ('Wedensday', 'NN')]

[29]: # chunk grammar: a noun phrase (NP) is an optional determiner, any
      # number of adjectives, then a singular or plural noun
      pattern = 'NP: {<DT>?<JJ>*<NN> | <DT>?<JJ>*<NNS>}'
      cp = nltk.RegexpParser(pattern)
      cs = cp.parse(sent)
      print(cs)

(S
  European/JJ
  Authorities/NNP
  find/VBP
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.2/CD
  billion/CD
  on/IN
  (NP Wedensday/NN))

[30]: # cs.draw()
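The heading also promises named entity recognition; NLTK ships a built-in NE chunker that runs on POS-tagged tokens. A minimal sketch, assuming the maxent_ne_chunker and words resources have been fetched with nltk.download():

      from nltk import ne_chunk

      # 'sent' is the POS-tagged sentence produced by preprocess(news) above
      ne_tree = ne_chunk(sent)
      print(ne_tree)

Tokens the model recognizes as entities come back wrapped in labelled subtrees such as (ORGANIZATION Google/NNP); the exact labels depend on the model and on the tags of this (misspelled) input.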

[31]: from nltk.corpus import conll2000

      print(conll2000.chunked_sents('train.txt')[99])
      print(conll2000.chunked_sents('train.txt'))

(S
  (PP Over/IN)
  (NP a/DT cup/NN)
  (PP of/IN)
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  (VP told/VBD)
  (NP his/PRP$ story/NN)
  ./.)

[Tree('S', [Tree('NP', [('Confidence', 'NN')]), Tree('PP', [('in', 'IN')]),
Tree('NP', [('the', 'DT'), ('pound', 'NN')]), Tree('VP', [('is', 'VBZ'),
('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB')]),
Tree('NP', [('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN')]), ('if', 'IN'),
Tree('NP', [('trade', 'NN'), ('figures', 'NNS')]), Tree('PP', [('for', 'IN')]),
Tree('NP', [('September', 'NNP')]), (',', ','), ('due', 'JJ'), Tree('PP',
[('for', 'IN')]), Tree('NP', [('release', 'NN')]), Tree('NP', [('tomorrow',
'NN')]), (',', ','), Tree('VP', [('fail', 'VB'), ('to', 'TO'), ('show', 'VB')]),
Tree('NP', [('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN')]),
Tree('PP', [('from', 'IN')]), Tree('NP', [('July', 'NNP'), ('and', 'CC'),
('August', 'NNP')]), Tree('NP', [("'s", 'POS'), ('near-record', 'JJ'),
('deficits', 'NNS')]), ('.', '.')]), Tree('S', [('Chancellor', 'NNP'),
Tree('PP', [('of', 'IN')]), Tree('NP', [('the', 'DT'), ('Exchequer', 'NNP')]),
Tree('NP', [('Nigel', 'NNP'), ('Lawson', 'NNP')]), Tree('NP', [("'s", 'POS'),
('restated', 'VBN'), ('commitment', 'NN')]), Tree('PP', [('to', 'TO')]),
Tree('NP', [('a', 'DT'), ('firm', 'NN'), ('monetary', 'JJ'), ('policy', 'NN')]),
Tree('VP', [('has', 'VBZ'), ('helped', 'VBN'), ('to', 'TO'), ('prevent',
'VB')]), Tree('NP', [('a', 'DT'), ('freefall', 'NN')]), Tree('PP', [('in',
'IN')]), Tree('NP', [('sterling', 'NN')]), Tree('PP', [('over', 'IN')]),
Tree('NP', [('the', 'DT'), ('past', 'JJ'), ('week', 'NN')]), ('.', '.')]), …]
# N-grams

[22]: import nltk
      from nltk.tokenize import word_tokenize
      from nltk import ngrams

      text = "A wonderful serenity has taken possession of my entire soul, like these sweet mornings of spring which I enjoy with my whole heart. I am alone, and feel the charm of existence in this spot, which was created for the bliss of souls like mine. I am so happy, my dear friend, so absorbed in the exquisite sense of mere tranquil existence, that I neglect my talents. I should be incapable of drawing a single stroke at the present moment; and yet I feel that I never was a greater artist than now. When, while the lovely valley teems with vapour around me, and the meridian sun strikes the upper surface of the impenetrable foliage of my trees, and but a few stray gleams steal into the inner sanctuary, I throw myself down among the tall grass by the trickling stream; and, as I lie close to the earth, a thousand unknown plants are noticed by me: when I hear the buzz of the little world among the stalks, and grow familiar with the countless indescribable forms of the insects and flies, then I feel the presence of the Almighty, who formed us in his own image, and the breath"

[23]: from nltk.corpus import stopwords
en_stops = stopwords.words('english')

[24]: def preprocess(sent):
          text = word_tokenize(sent)
          filtered_words = []
          for word in text:
              if word.lower() not in en_stops:
                  filtered_words.append(word)
          return filtered_words

      text = preprocess(text)
      n = 3
      threegrams = ngrams(text, n)

      for grams in threegrams:
          print(grams)

('wonderful', 'serenity', 'taken')
('serenity', 'taken', 'possession')
('taken', 'possession', 'entire')
('possession', 'entire', 'soul')
('entire', 'soul', ',')
('soul', ',', 'like')
(',', 'like', 'sweet')
('like', 'sweet', 'mornings')
('sweet', 'mornings', 'spring')
('mornings', 'spring', 'enjoy')
('spring', 'enjoy', 'whole')
('enjoy', 'whole', 'heart')
('whole', 'heart', '.')
('heart', '.', 'alone')
('.', 'alone', ',')
('alone', ',', 'feel')
(',', 'feel', 'charm')
('feel', 'charm', 'existence')
('charm', 'existence', 'spot')
('existence', 'spot', ',')
('spot', ',', 'created')
(',', 'created', 'bliss')
('created', 'bliss', 'souls')
('bliss', 'souls', 'like')
('souls', 'like', 'mine')
('like', 'mine', '.')
('mine', '.', 'happy')
('.', 'happy', ',')
('happy', ',', 'dear')
(',', 'dear', 'friend')
('dear', 'friend', ',')
('friend', ',', 'absorbed')
(',', 'absorbed', 'exquisite')
('absorbed', 'exquisite', 'sense')
('exquisite', 'sense', 'mere')
('sense', 'mere', 'tranquil')
('mere', 'tranquil', 'existence')
('tranquil', 'existence', ',')
('existence', ',', 'neglect')
(',', 'neglect', 'talents')
('neglect', 'talents', '.')
('talents', '.', 'incapable')
('.', 'incapable', 'drawing')
('incapable', 'drawing', 'single')
('drawing', 'single', 'stroke')
('single', 'stroke', 'present')
('stroke', 'present', 'moment')
('present', 'moment', ';')
('moment', ';', 'yet')
(';', 'yet', 'feel')
('yet', 'feel', 'never')
('feel', 'never', 'greater')
('never', 'greater', 'artist')
('greater', 'artist', '.')
('artist', '.', ',')
('.', ',', 'lovely')
(',', 'lovely', 'valley')
('lovely', 'valley', 'teems')
('valley', 'teems', 'vapour')
('teems', 'vapour', 'around')
('vapour', 'around', ',')
('around', ',', 'meridian')
(',', 'meridian', 'sun')
('meridian', 'sun', 'strikes')
('sun', 'strikes', 'upper')
('strikes', 'upper', 'surface')
('upper', 'surface', 'impenetrable')
('surface', 'impenetrable', 'foliage')
('impenetrable', 'foliage', 'trees')
('foliage', 'trees', ',')
('trees', ',', 'stray')
(',', 'stray', 'gleams')
('stray', 'gleams', 'steal')
('gleams', 'steal', 'inner')
('steal', 'inner', 'sanctuary')
('inner', 'sanctuary', ',')
('sanctuary', ',', 'throw')
(',', 'throw', 'among')
('throw', 'among', 'tall')
('among', 'tall', 'grass')
('tall', 'grass', 'trickling')
('grass', 'trickling', 'stream')
('trickling', 'stream', ';')
('stream', ';', ',')
(';', ',', 'lie')
(',', 'lie', 'close')
('lie', 'close', 'earth')
('close', 'earth', ',')
('earth', ',', 'thousand')
(',', 'thousand', 'unknown')
('thousand', 'unknown', 'plants')
('unknown', 'plants', 'noticed')
('plants', 'noticed', ':')
('noticed', ':', 'hear')
(':', 'hear', 'buzz')
('hear', 'buzz', 'little')
('buzz', 'little', 'world')
('little', 'world', 'among')
('world', 'among', 'stalks')
('among', 'stalks', ',')
('stalks', ',', 'grow')
(',', 'grow', 'familiar')
('grow', 'familiar', 'countless')
('familiar', 'countless', 'indescribable')
('countless', 'indescribable', 'forms')
('indescribable', 'forms', 'insects')
('forms', 'insects', 'flies')
('insects', 'flies', ',')
('flies', ',', 'feel')
(',', 'feel', 'presence')
('feel', 'presence', 'Almighty')
('presence', 'Almighty', ',')
('Almighty', ',', 'formed')
(',', 'formed', 'us')
('formed', 'us', 'image')
('us', 'image', ',')
('image', ',', 'breath')
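A short follow-on sketch: the same trigrams can be counted with nltk.FreqDist. Since ngrams() returns a generator, it is rebuilt here after the loop above exhausted it (reusing the filtered text token list from the cell above):

      from nltk import FreqDist

      freq = FreqDist(ngrams(text, 3))   # rebuild the generator and tally
      print(freq.most_common(5))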
