Experiment-01
Apply various text preprocessing techniques to a given text: Tokenization, Filtration, and Script Validation.
import nltk, re, string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
text = "Hello! This is a sample text for NLP preprocessing. नमस्ते!. i am Kajol"
# Tokenization
tokens = word_tokenize(text)
sentences = sent_tokenize(text)
# Stopword & Punctuation Removal
stops = set(stopwords.words('english'))
filtered = [w for w in tokens
            if w.lower() not in stops and w not in string.punctuation]
# Script Validation (Only English)
english_words = [w for w in filtered if re.fullmatch(r'[A-Za-z]+', w)]
print("Tokens:", tokens)
print("Sentences:", sentences)
print("Filtered:", filtered)
print("Script Validation:", english_words)
Experiment-02
Apply various other text preprocessing techniques to a given text: Stop Word Removal, Stemming, and Lemmatization.
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# Sample text
text = "The striped bats are hanging on their feet for best."
# Tokenize and filter out stopwords and non-alphabetic words
filtered = [w for w in word_tokenize(text)
            if w.isalpha() and w.lower() not in stopwords.words('english')]
# Get POS tags for the filtered words
pos_tags = pos_tag(filtered)
# Stemming
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in filtered]
# Lemmatization using POS tags mapped to WordNet POS codes
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w, pos={'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}.get(tag[0], 'n'))
              for w, tag in pos_tags]
# Output
print("Filtered:", filtered)
print("Stemmed:", stemmed)
print("Lemmatized:", lemmatized)
Experiment-03
Perform morphological analysis and word generation for a given text.
import nltk
import string
# Download necessary NLTK resources (only once)
nltk.download('punkt')
nltk.download('punkt_tab')
# Function to extract root and suffix
def extract_root_and_suffix(word):
suffixes = ["ing", "ed", "ly", "ness", "s", "es", "ment", "able", "tion"]
for suffix in suffixes:
if word.endswith(suffix) and len(word) > len(suffix) + 1:
return word[:-len(suffix)], suffix
return word, "None"
# Sample input text
input_text = "I am enjoying the study of happiness."
# Tokenize words
words = nltk.word_tokenize(input_text)
# Process each word
for word in words:
    word_cleaned = word.strip(string.punctuation)  # Remove punctuation
    if word_cleaned:
        root, suffix = extract_root_and_suffix(word_cleaned.lower())
        print(f"Word: {word_cleaned}")
        print(f"Root word: {root}")
        print(f"Suffix: {suffix}")
        print(f"Length of root word: {len(root)}")
        print("=" * 40)
Experiment-04
Implement an N-gram model for sentence probability.
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
# Sample text
text = "This is a sample sentence for N-gram language modeling."
tokens = word_tokenize(text.lower()) # Tokenize and convert to lowercase
# Generate and compute frequencies for unigrams, bigrams, and trigrams
for n in [1, 2, 3]:
    ngrams_list = list(ngrams(tokens, n))
    print(f"\n{['Unigram', 'Bigram', 'Trigram'][n-1]} Frequencies:")
    for ngram, freq in FreqDist(ngrams_list).items():
        print(f"{ngram}: {freq}")
Experiment-05
Implement edit distance for spell correction.
import nltk
from nltk.metrics import edit_distance
def spell_correct(word, dictionary):
"""Returns the closest word from the dictionary based on edit distance."""
candidates = [(w, edit_distance(word, w)) for w in dictionary]
return min(candidates, key=lambda x: x[1])[0]
# Sample dictionary of correct words
dictionary = {
"language", "modeling", "sentence", "sample",
"market", "stock", "computer", "science"
}
# List of misspelled words to correct
misspelled_words = ["langauge", "modling", "sentece", "sampel", "marcket"]
# Run spell correction
for word in misspelled_words:
    corrected = spell_correct(word, dictionary)
    print(f"Misspelled: {word} -> Corrected: {corrected}")
Experiment-06
Implement Part-of-Speech (POS) tagging for a given text.
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def pos_tagging(text):
    """Performs POS tagging on the given text."""
    tokens = word_tokenize(text)
    return pos_tag(tokens)
# Example text
text = "This is a sample sentence for POS tagging."
# Perform POS tagging
tagged_words = pos_tagging(text)
# Print results
for word, tag in tagged_words:
print(f"{word}: {tag}")
Experiment-07
Implement Named Entity Recognition (NER) using a pre-trained transformer model.
from transformers import pipeline
# Load NER pipeline using a pre-trained BERT model
ner_pipeline = pipeline("ner",
                        model="dbmdz/bert-large-cased-finetuned-conll03-english",
                        aggregation_strategy="simple")
# Example text
text = "Barack Obama was the 44th President of the United States. He was born in
Hawaii."
# Perform NER
entities = ner_pipeline(text)
# Print the named entities
for entity in entities:
print(f"{entity['word']} --> {entity['entity_group']}")
Experiment-08
Implement a supervised classifier for Word Sense Disambiguation (WSD).
import nltk
from nltk.corpus import semcor, wordnet as wn
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
nltk.download('semcor')
nltk.download('wordnet')
# Build a small dataset: each sample is a sentence containing an ambiguous word,
# labelled with that word's first WordNet synset
data, labels = zip(*[(" ".join(s), wn.synsets(w)[0].name())
                     for s in semcor.sents()[:100]
                     for w in s if len(wn.synsets(w)) > 1])
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)
# Initialize and fit TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=500)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
# Train Logistic Regression model
clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
# Evaluate the model
print(classification_report(y_test, clf.predict(X_test_tfidf)))
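Once trained, the same vectorizer and classifier can label unseen text; a minimal sketch with an arbitrary example sentence:
# Predict a synset label for a new, unseen sentence
new_sentence = "The children sat on the bank of the river"
new_tfidf = vectorizer.transform([new_sentence])
print("Predicted sense:", clf.predict(new_tfidf)[0])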