Steps to install the NLTK library
Install the latest version of Python (e.g., Python 3.9.6).
Open a command prompt, type `python --version`, and press Enter.
Type `pip --version` and press Enter.
Type `pip install nltk` and press Enter.
Open the IDLE shell, click on File, and select New File.
Click on Save As and give the program a name with the extension .py.
Click on Run and select Run Module.
If any package from the NLTK library is not found, go to the IDLE shell, type the commands below, and press Enter:
import nltk
nltk.download()
Example:
nltk.download('stopwords')
1. Write a Python program to perform following tasks on text
a) Tokenization
Word tokenization:
import nltk

# Sample text to split into word tokens.
# BUG FIX: the string literal was broken across two source lines (extraction
# artifact); reconstructed as a single literal, matching the printed output.
word_data = ("It originated from the idea that there are readers who prefer "
             "learning new skills from the comforts of their drawing rooms")

# word_tokenize splits the text into words and punctuation tokens
# (requires the 'punkt' tokenizer data to be downloaded).
nltk_tokens = nltk.word_tokenize(word_data)
print(nltk_tokens)
Output:
['It', 'originated', 'from', 'the', 'idea', 'that', 'there', 'are', 'readers', 'who', 'prefer', 'learning',
'new', 'skills', 'from', 'the', 'comforts', 'of', 'their', 'drawing', 'rooms']
Sentence tokenization:
import nltk

# Sample text to split into sentence tokens.
# BUG FIX: the string literal was broken across two source lines (extraction
# artifact); reconstructed as a single literal, matching the printed output.
# (The 'Ananlysis' spelling is kept as-is from the original sample data.)
sentence_data = ("The First sentence is about Python. The Second: about Django. "
                 "You can learn Python,Django and Data Ananlysis here. ")

# sent_tokenize splits the text into sentences using the Punkt sentence tokenizer.
nltk_tokens = nltk.sent_tokenize(sentence_data)
print(nltk_tokens)
Output:
['The First sentence is about Python.', 'The Second: about Django.', 'You can learn
Python,Django and Data Ananlysis here.']
Character tokenization
# BUG FIX: the original line `Import nltk` is a NameError (capital I), and
# nltk is not needed here — Python's built-in list() tokenizes by character.
charact_data = " Python programming"

# list() on a string yields one single-character token per character,
# including the space characters.
charact_tokens = list(charact_data)
print(charact_tokens)
Output:
[' ', 'P', 'y', 't', 'h', 'o', 'n', ' ', 'p', 'r', 'o', 'g', 'r', 'a', 'm', 'm', 'i', 'n', 'g']
b) Stop word Removal
from nltk.corpus import stopwords

# Build the English stop-word set once for O(1) membership tests.
en_stops = set(stopwords.words('english'))

all_words = ['There', 'is', 'a', 'tree', 'near', 'the', 'river']

# Print only the words that are not English stop words.
# NOTE: the NLTK stop-word list is lowercase, so the capitalised 'There'
# is kept even though 'there' would have been removed.
# (Indentation restored — it was lost in the original extraction.)
for word in all_words:
    if word not in en_stops:
        print(word)
Output:
There
tree
near
river
2) Write a Python program to implement Porter stemmer algorithm for stemming
import nltk
# BUG FIX: the original read `from nltk.stem import Porter Stemmer`
# (space inside the class name) — a SyntaxError.
from nltk.stem import PorterStemmer

nltk.download('punkt')

# One stemmer instance reused for every word.
stemmer = PorterStemmer()
words = ["running", "beautifulness", "rivers", "caresses", "happily", "studies", "banking"]

# Reduce each word to its Porter stem.
stemmed_words = [stemmer.stem(word) for word in words]
print("Original Words:", words)
print("Stemmed Words", stemmed_words)
Output:
Original Words: ['running', 'beautifulness', 'rivers', 'caresses', 'happily', 'studies', 'banking']
Stemmed Words ['run', 'beauti', 'river', 'caress', 'happili', 'studi', 'bank']
3) Write a Python program for
a) Word Analysis
import re
from collections import Counter
def word_analysis(text):
    """Analyse a text and return word statistics.

    Returns a 3-tuple:
      word_freq         -- Counter mapping each lowercase word to its frequency
      word_len          -- dict mapping each word to its character length
      most_common_words -- up to 10 (word, count) pairs, most frequent first

    (Indentation restored — it was lost in the original extraction.)
    """
    # Convert text to lowercase and remove punctuation so 'This' and 'this.' match.
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    # Split text into words on whitespace.
    words = text.split()
    # Count frequency of each word.
    word_freq = Counter(words)
    # Calculate length of each word (duplicates just overwrite with the same value).
    word_len = {word: len(word) for word in words}
    # Identify the (up to) 10 most common words.
    most_common_words = word_freq.most_common(10)
    return word_freq, word_len, most_common_words
# Demonstrate word_analysis on a short sample sentence.
text = "This is an example sentence for word analysis. This sentence is just an example."
word_freq, word_len, most_common_words = word_analysis(text)

# (Loop indentation restored — it was lost in the original extraction.)
print("Word Frequency:")
for word, freq in word_freq.items():
    print(f"{word}: {freq}")

print("\nWord Length:")
for word, length in word_len.items():
    print(f"{word}: {length}")

print("\nMost Common Words:")
for word, freq in most_common_words:
    print(f"{word}: {freq}")
Output:
Word Frequency:
this: 2
is: 2
an: 2
example: 2
sentence: 2
for: 1
word: 1
analysis: 1
just: 1
Word Length:
this: 4
is: 2
an: 2
example: 7
sentence: 8
for: 3
word: 4
analysis: 8
just: 4
Most Common Words:
this: 2
is: 2
an: 2
example: 2
sentence: 2
for: 1
word: 1
analysis: 1
just: 1
b) Word Generation
import random
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')
def generate_meaningful_words(part_of_speech, num_words):
    """Return num_words random lemma names from WordNet for the given POS.

    part_of_speech -- WordNet POS code: 'n', 'v', 'a', or 'r'
    num_words      -- how many words to sample (duplicates are possible)

    (Indentation restored — it was lost in the original extraction.)
    """
    # Materialise all synsets for this POS so random.choice can index them.
    synsets = list(wordnet.all_synsets(part_of_speech))
    words = []
    for _ in range(num_words):
        # Pick a random synset, then a random lemma within it.
        synset = random.choice(synsets)
        lemma = random.choice(synset.lemmas())
        words.append(lemma.name())
    return words
# Sample 10 random words for each major part of speech.
nouns = generate_meaningful_words('n', 10)
verbs = generate_meaningful_words('v', 10)
adjectives = generate_meaningful_words('a', 10)
adverbs = generate_meaningful_words('r', 10)

# (Loop indentation restored — it was lost in the original extraction.)
print("Nouns:")
for noun in nouns:
    print(noun)
print("\nVerbs:")
for verb in verbs:
    print(verb)
print("\nAdjectives:")
for adjective in adjectives:
    print(adjective)
print("\nAdverbs:")
for adverb in adverbs:
    print(adverb)
Output:
Nouns:
Haastia_pulvinaris
televangelist
genus_Estrilda
E._H._Weber
insidiousness
Evangelical_and_Reformed_Church
garnishee
semigloss
powder_keg
townspeople
Verbs:
encapsulate
remain
salve
cruise
credit
charge
drone_on
up
fume
sandblast
Adjectives:
stipendiary
reportable
stilly
live
adscititious
bindable
upper-class
god-awful
organized
untechnical
Adverbs:
pitty-patty
naturally
managerially
smartly
providently
dumbly
worse
tight
magniloquently
pointlessly
4. Create a sample list of at least 5 words with ambiguous sense and write a python program
to implement WSD.
import nltk
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
# Ensure that NLTK resources are downloaded
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# List of ambiguous words
ambiguous_words = ["bank", "bat", "bark", "pitch", "lead"]

# Example context sentences for each word.
# BUG FIX: the original dict literal was never closed (missing '}'),
# which made everything after it a SyntaxError.
contexts = {
    "bank": "I went to the river bank to relax by the water.",
    "bat": "The bat flew out of the cave at dusk.",
    "bark": "The dog barked loudly at the stranger.",
    "pitch": "He gave a brilliant pitch to the investors.",
    "lead": "He decided to lead the team on the project.",
}
# Function to disambiguate word senses using the Lesk algorithm
def disambiguate_word(word, context):
    """Return the WordNet sense name Lesk picks for word in context.

    Falls back to the string "No sense found" when Lesk returns None.
    (Indentation restored — it was lost in the original extraction.)
    """
    # lesk expects the context as a list of tokens.
    sense = lesk(context.split(), word)
    if sense:
        return sense.name()  # Return the sense (meaning) of the word
    else:
        return "No sense found"
# Iterate over each ambiguous word and print its disambiguated sense based on context.
# (Loop indentation restored — it was lost in the original extraction.)
for word in ambiguous_words:
    context = contexts[word]
    print(f"Word: {word}")
    print(f"Context: {context}")
    print(f"Disambiguated Sense: {disambiguate_word(word, context)}")
    print("-" * 50)
Output:
Word: bank
Context: I went to the river bank to relax by the water.
Disambiguated Sense: bank.v.07
--------------------------------------------------
Word: bat
Context: The bat flew out of the cave at dusk.
Disambiguated Sense: bat.v.03
--------------------------------------------------
Word: bark
Context: The dog barked loudly at the stranger.
Disambiguated Sense: bark.n.04
--------------------------------------------------
Word: pitch
Context: He gave a brilliant pitch to the investors.
Disambiguated Sense: pitch.v.04
--------------------------------------------------
Word: lead
Context: He decided to lead the team on the project.
Disambiguated Sense: spark_advance.n.01
--------------------------------------------------
5. Install NLTK tool kit and perform stemming
import nltk
from nltk.stem import PorterStemmer

# Single stemmer instance shared across all words.
stemmer = PorterStemmer()

# Sample words to reduce to their Porter stems.
words = ['running', 'jumping', 'hiking', 'swimming']
stemmed_words = list(map(stemmer.stem, words))
print(stemmed_words)
Output:
['run', 'jump', 'hike', 'swim']
6. Create Sample list of at least 10 words POS tagging and find the POS for any given word
import nltk
from nltk import pos_tag, word_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def find_pos_tag(word):
    """Return the Penn Treebank POS tag for a single word.

    Tags the word in isolation (no sentence context), so the tag is the
    tagger's best guess for a standalone token.
    (Indentation restored — it was lost in the original extraction.)
    """
    tokens = word_tokenize(word)
    pos_tags = pos_tag(tokens)
    # pos_tags is a list of (token, tag) pairs; take the tag of the first token.
    return pos_tags[0][1]
word = input("Enter a word: ")
# BUG FIX: the original assigned to `pos_tag`, shadowing nltk's pos_tag
# function imported above; use a distinct name instead.
word_tag = find_pos_tag(word)
print("The POS tag for '{}' is '{}'".format(word, word_tag))
Output:
Enter a word: say
The POS tag for 'say' is 'VB'
Enter a word: the
The POS tag for 'the' is 'DT'
Enter a word: karimnagar
The POS tag for 'karimnagar' is 'NN'
Enter a word: good
The POS tag for 'good' is 'JJ'
7. Write a Python program to
a) Perform Morphological Analysis using NLTK library
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# Single lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

def morphological_analysis(text):
    """Tokenize text, POS-tag it, and lemmatize each token by its POS.

    Returns the list of lemmatized tokens; tokens whose Penn Treebank tag
    has no WordNet counterpart are returned unchanged.
    (Indentation restored — it was lost in the original extraction.)
    """
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    lemmatized_tokens = []
    for token, tag in tagged_tokens:
        # Map Penn Treebank tag prefixes to WordNet POS codes.
        if tag.startswith('J'):
            wordnet_tag = 'a'   # adjective
        elif tag.startswith('V'):
            wordnet_tag = 'v'   # verb
        elif tag.startswith('N'):
            wordnet_tag = 'n'   # noun
        elif tag.startswith('R'):
            wordnet_tag = 'r'   # adverb
        else:
            wordnet_tag = ''    # no WordNet POS: leave the token as-is
        if wordnet_tag:
            lemmatized_token = lemmatizer.lemmatize(token, wordnet_tag)
        else:
            lemmatized_token = token
        lemmatized_tokens.append(lemmatized_token)
    return lemmatized_tokens
# Demonstrate morphological analysis on a sample sentence.
text = "The quick brown fox jumps over the lazy dog."
lemmas = morphological_analysis(text)

print("Original Text:")
print(text)
print("\nLemmatized Tokens:")
print(lemmas)
Output:
Original Text:
The quick brown fox jumps over the lazy dog.
Lemmatized Tokens:
['The', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '.']
b) Generate n-grams using NLTK N-Grams library