Experiment-01
Apply various text preprocessing techniques to a given text: Tokenization, Filtration, and Script Validation.
import nltk, re, string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
text = "Hello! This is a sample text for NLP preprocessing. नमस्ते!. i am Kajol"
# Tokenization
tokens = word_tokenize(text)
sentences = sent_tokenize(text)
# Stopword & Punctuation Removal
stops = set(stopwords.words('english'))
filtered = [w for w in tokens
            if w.lower() not in stops and w not in string.punctuation]
# Script Validation (Only English)
english_words = [w for w in filtered if re.fullmatch(r'[A-Za-z]+', w)]
print("Tokens:", tokens)
print("Sentences:", sentences)
print("Filtered:", filtered)
print("Script Validation:", english_words)
Experiment-02
Apply various other text preprocessing techniques to a given text: Stop Word Removal, Stemming, and Lemmatization.
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# Sample text
text = "The striped bats are hanging on their feet for best."
# Tokenize and filter out stopwords and non-alphabetic words
filtered = [w for w in word_tokenize(text)
            if w.isalpha() and w.lower() not in stopwords.words('english')]
# Get POS tags for the filtered words
pos_tags = pos_tag(filtered)
# Stemming
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in filtered]
# Lemmatization using POS tags mapped to WordNet POS codes
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w, pos={'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}.get(tag[0], 'n'))
              for w, tag in pos_tags]
# Output
print("Filtered:", filtered)
print("Stemmed:", stemmed)
print("Lemmatized:", lemmatized)
Experiment-03
Perform morphological analysis and word generation for a given text.
import nltk
import string
# Download necessary NLTK resources (only once)
nltk.download('punkt')
nltk.download('punkt_tab')
# Function to extract root and suffix
def extract_root_and_suffix(word):
suffixes = ["ing", "ed", "ly", "ness", "s", "es", "ment", "able", "tion"]
for suffix in suffixes:
if word.endswith(suffix) and len(word) > len(suffix) + 1:
return word[:-len(suffix)], suffix
return word, "None"
# Sample input text
input_text = "I am enjoying the study of happiness."
# Tokenize words
words = nltk.word_tokenize(input_text)
# Process each word
for word in words:
    word_cleaned = word.strip(string.punctuation)  # Remove punctuation
    if word_cleaned:
        root, suffix = extract_root_and_suffix(word_cleaned.lower())
        print(f"Word: {word_cleaned}")
        print(f"Root word: {root}")
        print(f"Suffix: {suffix}")
        print(f"Length of root word: {len(root)}")
        print("=" * 40)
Experiment-04
Implement an N-gram model for sentence probability.
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
# Sample text
text = "This is a sample sentence for N-gram language modeling."
tokens = word_tokenize(text.lower()) # Tokenize and convert to lowercase
# Generate and compute frequencies for unigrams, bigrams, and trigrams
for n in [1, 2, 3]:
    ngrams_list = list(ngrams(tokens, n))
    print(f"\n{['Unigram', 'Bigram', 'Trigram'][n-1]} Frequencies:")
    for ngram, freq in FreqDist(ngrams_list).items():
        print(f"{ngram}: {freq}")
Experiment-05
Implement edit distance for spell correction.
import nltk
from nltk.metrics import edit_distance
def spell_correct(word, dictionary):
"""Returns the closest word from the dictionary based on edit distance."""
candidates = [(w, edit_distance(word, w)) for w in dictionary]
return min(candidates, key=lambda x: x[1])[0]
# Sample dictionary of correct words
dictionary = {
"language", "modeling", "sentence", "sample",
"market", "stock", "computer", "science"
}
# List of misspelled words to correct
misspelled_words = ["langauge", "modling", "sentece", "sampel", "marcket"]
# Run spell correction
for word in misspelled_words:
    corrected = spell_correct(word, dictionary)
    print(f"Misspelled: {word} -> Corrected: {corrected}")
Experiment-06
Implement Part-of-Speech (POS) tagging for a given text.
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def pos_tagging(text):
    """Performs POS tagging on the given text."""
    tokens = word_tokenize(text)
    return pos_tag(tokens)
# Example text
text = "This is a sample sentence for POS tagging."
# Perform POS tagging
tagged_words = pos_tagging(text)
# Print results
for word, tag in tagged_words:
print(f"{word}: {tag}")
Experiment-07
Implement Named Entity Recognition (NER) using a pre-trained transformer model.
from transformers import pipeline
# Load NER pipeline using a pre-trained BERT model
ner_pipeline = pipeline("ner",
                        model="dbmdz/bert-large-cased-finetuned-conll03-english",
                        aggregation_strategy="simple")
# Example text
text = "Barack Obama was the 44th President of the United States. He was born in
Hawaii."
# Perform NER
entities = ner_pipeline(text)
# Print the named entities
for entity in entities:
print(f"{entity['word']} --> {entity['entity_group']}")
Experiment-08
Implement a supervised classifier for Word Sense Disambiguation (WSD).
import nltk
from nltk.corpus import semcor, wordnet as wn
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
nltk.download('semcor')
nltk.download('wordnet')
# Build a small dataset: each sample is a sentence containing an ambiguous word,
# labelled with that word's first WordNet synset
data, labels = zip(*[(" ".join(s), wn.synsets(w)[0].name())
                     for s in semcor.sents()[:100]
                     for w in s if len(wn.synsets(w)) > 1])
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)
# Initialize and fit TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=500)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
# Train Logistic Regression model
clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
# Evaluate the model
print(classification_report(y_test, clf.predict(X_test_tfidf)))
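Once trained, the same vectorizer and classifier can label unseen text; a minimal sketch with an arbitrary example sentence:
# Predict a synset label for a new, unseen sentence
new_sentence = "The children sat on the bank of the river"
new_tfidf = vectorizer.transform([new_sentence])
print("Predicted sense:", clf.predict(new_tfidf)[0])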