The document outlines a series of experiments focused on various text preprocessing techniques in natural language processing (NLP). It covers methods such as tokenization, stop word removal, lemmatization, morphological analysis, N-gram modeling, spell correction, part-of-speech tagging, named entity recognition, and supervised classifiers for word sense disambiguation. Each experiment includes code snippets demonstrating the application of these techniques using the NLTK library and other tools.


Experiment-01

Apply various text preprocessing techniques to any given text: tokenization, filtration, and script validation.
import nltk, re, string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

text = "Hello! This is a sample text for NLP preprocessing. नमस्ते!. i am Kajol"

# Tokenization
tokens = word_tokenize(text)
sentences = sent_tokenize(text)

# Stopword & punctuation removal
stops = set(stopwords.words('english'))
filtered = [w for w in tokens if w.lower() not in stops and w not in string.punctuation]

# Script validation (keep only English/Latin-script tokens)
english_words = [w for w in filtered if re.fullmatch(r'[A-Za-z]+', w)]

print("Tokens:", tokens)
print("Sentences:", sentences)
print("Filtered:", filtered)
print("Script Validation:", english_words)

Experiment-02
Apply various other text preprocessing techniques to any given text: stop word removal and lemmatization / stemming.

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Sample text
text = "The striped bats are hanging on their feet for best."

# Tokenize and filter out stopwords and non-alphabetic words
filtered = [w for w in word_tokenize(text) if w.isalpha() and w.lower() not in stopwords.words('english')]

# Get POS tags for the filtered words
pos_tags = pos_tag(filtered)

# Stemming
stemmed = [PorterStemmer().stem(w) for w in filtered]
# Lemmatization using POS tags
lemmatized = [WordNetLemmatizer().lemmatize(w, pos={'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}.get(tag[0], 'n')) for w, tag in pos_tags]

# Output
print("Filtered:", filtered)
print("Stemmed:", stemmed)
print("Lemmatized:", lemmatized)

Experiment-03
Perform morphological analysis and word generation for any given text.

import nltk
import string

# Download necessary NLTK resources (only once)
nltk.download('punkt')
nltk.download('punkt_tab')

# Function to extract root and suffix
def extract_root_and_suffix(word):
    suffixes = ["ing", "ed", "ly", "ness", "s", "es", "ment", "able", "tion"]
    for suffix in suffixes:
        if word.endswith(suffix) and len(word) > len(suffix) + 1:
            return word[:-len(suffix)], suffix
    return word, "None"

# Sample input text
input_text = "I am enjoying the study of happiness."

# Tokenize words
words = nltk.word_tokenize(input_text)

# Process each word
for word in words:
    word_cleaned = word.strip(string.punctuation)  # Remove punctuation
    if word_cleaned:
        root, suffix = extract_root_and_suffix(word_cleaned.lower())
        print(f"Word: {word_cleaned}")
        print(f"Root word: {root}")
        print(f"Suffix: {suffix}")
        print(f"Length of root word: {len(root)}")
        print("=" * 40)

Experiment-04
Implement N-Gram model for sentence probability.

import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Sample text
text = "This is a sample sentence for N-gram language modeling."
tokens = word_tokenize(text.lower()) # Tokenize and convert to lowercase

# Generate and compute frequencies for unigrams, bigrams, and trigrams
for n in [1, 2, 3]:
    ngrams_list = list(ngrams(tokens, n))
    print(f"\n{['Unigrams', 'Bigrams', 'Trigrams'][n-1]} Frequencies:")
    for ngram, freq in FreqDist(ngrams_list).items():
        print(f"{ngram}: {freq}")

Experiment-05
Implement edit distance for spell correction.

import nltk
from nltk.metrics import edit_distance

def spell_correct(word, dictionary):
    """Return the closest word from the dictionary based on edit distance."""
    candidates = [(w, edit_distance(word, w)) for w in dictionary]
    return min(candidates, key=lambda x: x[1])[0]

# Sample dictionary of correct words
dictionary = {
    "language", "modeling", "sentence", "sample",
    "market", "stock", "computer", "science"
}

# List of misspelled words to correct
misspelled_words = ["langauge", "modling", "sentece", "sampel", "marcket"]

# Run spell correction
for word in misspelled_words:
    corrected = spell_correct(word, dictionary)
    print(f"Misspelled: {word} -> Corrected: {corrected}")

Experiment-06
Implement part-of-speech (POS) tagging for any given text.

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Download required NLTK resources (no-op if already present)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def pos_tagging(text):
    """Perform POS tagging on the given text."""
    tokens = word_tokenize(text)
    return pos_tag(tokens)

# Example text
text = "This is a sample sentence for POS tagging."

# Perform POS tagging
tagged_words = pos_tagging(text)

# Print results
for word, tag in tagged_words:
    print(f"{word}: {tag}")

Experiment-07
Perform named entity recognition (NER) on any given text using a pre-trained model.

from transformers import pipeline

# Load NER pipeline using a pre-trained BERT model
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")

# Example text
text = "Barack Obama was the 44th President of the United States. He was born in
Hawaii."

# Perform NER
entities = ner_pipeline(text)

# Print the named entities
for entity in entities:
    print(f"{entity['word']} --> {entity['entity_group']}")

Experiment-08
Implement supervised classifiers for word sense disambiguation (WSD).

import nltk
from nltk.corpus import semcor, wordnet as wn
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

nltk.download('semcor')
nltk.download('wordnet')

# Build a small dataset: sentence contexts paired with the first WordNet sense of each ambiguous word
data, labels = zip(*[(" ".join(s), wn.synsets(w)[0].name())
                     for s in semcor.sents()[:100]
                     for w in s if len(wn.synsets(w)) > 1])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)

# Initialize and fit TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=500)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Logistic Regression model
clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Evaluate the model
print(classification_report(y_test, clf.predict(X_test_tfidf)))
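For comparison, NLTK also provides the Lesk algorithm, a knowledge-based baseline that picks the synset whose dictionary gloss overlaps most with the surrounding context. This is not the supervised classifier above, just a quick per-word sanity check; the example sentence and the target word "bank" are illustrative.

from nltk.wsd import lesk

# Lesk baseline: choose the sense whose definition shares the most words with the context
context = "I went to the bank to deposit my money".split()
sense = lesk(context, "bank")
if sense:
    print(sense.name(), "-", sense.definition())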
