Ds File


1. Write Python code to flatten and evaluate a deep tree in NLP.

Code:
# Function for flattening a deep tree
from nltk.tree import Tree

def flatten_childtrees(trees):
    children = []
    for t in trees:
        if t.height() < 3:
            children.extend(t.pos())
        elif t.height() == 3:
            children.append(Tree(t.label(), t.pos()))
        else:
            children.extend(flatten_childtrees(t))
    return children

def flatten_deeptree(tree):
    return Tree(tree.label(), flatten_childtrees(tree))

# Evaluating flatten_deeptree()
from nltk.corpus import treebank

deep_tree = treebank.parsed_sents()[0]
flattened_tree = flatten_deeptree(deep_tree)
print("Deep Tree:\n", deep_tree)
print("\nFlattened Tree:\n", flattened_tree)

# Height of small example trees
from nltk.tree import Tree

print("Height:", Tree('NNP', ['Pierre']).height())

print("\nHeight:", Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]).height())
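
As a quick sanity check (assuming deep_tree and flattened_tree from the evaluation above are still in scope), comparing the two heights shows how much the transformation flattens the tree:

print("Height of deep tree:", deep_tree.height())
print("Height of flattened tree:", flattened_tree.height())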

Output:

2. Create a shallow tree in NLP and print its height.
Code:
# Understanding shallow_tree()
from nltk.tree import Tree

def shallow_tree(tree):
    children = []
    for t in tree:
        if t.height() < 3:
            children.extend(t.pos())
        else:
            children.append(Tree(t.label(), t.pos()))
    return Tree(tree.label(), children)

# Evaluating
from nltk.corpus import treebank

print("Deep Tree:\n", treebank.parsed_sents()[0])

print("\nShallow Tree:\n", shallow_tree(treebank.parsed_sents()[0]))

print("Height of tree:", treebank.parsed_sents()[0].height())

print("\nHeight of shallow tree:", shallow_tree(treebank.parsed_sents()[0]).height())

Output:

3. Download the wine quality data set from the UCI Machine Learning Repository, which is available for free. Print the first five rows of the red and white wine data, check for NULL values in the red wine data, create a histogram showing the distribution of alcohol, and finally split the data for training and validation.
Code:
import pandas as pd

# Read red wine data


ds_red = pd.read_csv("winequality-red.csv", sep=';')

# Display five rows of red wine data


print("Red Wine Data:")

print(ds_red.head())

# Read white wine data


ds_wht = pd.read_csv("winequality-white.csv", sep=';')

# Display five rows of white wine data


print("\nWhite Wine Data:")

print(ds_wht.head())

# Check for NULL values in red wine data


print("\nNull Values in Red Wine Data:")
print(ds_red.isnull().sum())

# Check for NULL values in white wine data


print("\nNull Values in White Wine Data:")
print(ds_wht.isnull().sum())

import matplotlib.pyplot as plt

# Create a histogram for red wine's alcohol content


plt.hist(ds_red['alcohol'], bins=20, color="red", alpha=0.7, label='Red Wine')
plt.xlabel('Alcohol Content')
plt.ylabel('Frequency')
plt.title('Distribution of Alcohol Content in Red Wine')
plt.legend()
plt.show()

import matplotlib.pyplot as plt

# Create a histogram for White wine's alcohol content


plt.hist(ds_wht['alcohol'], bins=20, color="blue", alpha=0.7, label='White Wine')
plt.xlabel('Alcohol Content')
plt.ylabel('Frequency')
plt.title('Distribution of Alcohol Content in White Wine')
plt.legend()
plt.show()

from sklearn.model_selection import train_test_split

# Assuming 'quality' is the target variable for red wine


X_red = ds_red.drop('quality', axis=1)
y_red = ds_red['quality']

# Split red wine data into training and validation sets


X_train_red, X_valid_red, y_train_red, y_valid_red = train_test_split(X_red, y_red, test_size=0.2, random_state=42)

# Assuming 'quality' is the target variable for white wine

X_wht = ds_wht.drop('quality', axis=1)
y_wht = ds_wht['quality']

# Split white wine data into training and validation sets


X_train_wht, X_valid_wht, y_train_wht, y_valid_wht = train_test_split(X_wht, y_wht, test_size=0.2, random_state=42)
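
A quick check of the resulting shapes confirms the 80/20 split; this is an added sanity check, not part of the original listing.

# Confirm the 80/20 split for both wine datasets
print("Red wine train/valid shapes:", X_train_red.shape, X_valid_red.shape)
print("White wine train/valid shapes:", X_train_wht.shape, X_valid_wht.shape)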

Output:

4. Avengers Endgame and deep learning: write Python code to implement image caption generation using Avengers: Endgame characters.
Code:
import torch
import matplotlib.pyplot as plt
import numpy as np
import argparse
import pickle
import os
from torchvision import transforms
from PIL import Image

# build_vocab and model are part of the pytorch-tutorial image-captioning
# example, which we pull from GitHub
from build_vocab import Vocabulary
from model import EncoderCNN, DecoderRNN

# Model paths (make sure the paths are correct)
ENCODER_PATH = 'content/encoder-5-3000.pkl'
DECODER_PATH = 'content/decoder-5-3000.pkl'
VOCAB_PATH = 'content/vocab.pkl'

# Constants dictated by the architecture we are using
EMBED_SIZE = 256
HIDDEN_SIZE = 512
NUM_LAYERS = 1

# Device configuration (fall back to CPU when no GPU is available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Function to load and resize the image
def load_image(image_path, transform=None):
    image = Image.open(image_path)
    image = image.resize([224, 224], Image.LANCZOS)
    if transform is not None:
        image = transform(image).unsqueeze(0)
    return image

def PretrainedResNet(image_path, encoder_path=ENCODER_PATH,
                     decoder_path=DECODER_PATH,
                     vocab_path=VOCAB_PATH,
                     embed_size=EMBED_SIZE,
                     hidden_size=HIDDEN_SIZE,
                     num_layers=NUM_LAYERS):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(embed_size).eval()
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))

    # Prepare an image
    image = load_image(image_path, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)

    # (1, max_seq_length) -> (max_seq_length)
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    # Strip the <start>/<end> tokens from the caption
    sentence = ' '.join(sampled_caption)[8:-5].title()

    # Return the generated caption together with the image
    image = Image.open(image_path)
    return sentence, image

# Test image: Tony
plt.figure(figsize=(24, 24))
predicted_label, image = PretrainedResNet(image_path='tony.jpg')
plt.imshow(image)
print(predicted_label)

# Test image: Thor
plt.figure(figsize=(24, 24))
predicted_label, image = PretrainedResNet(image_path='thor.jpg')
plt.imshow(image)
print(predicted_label)

Output:

5. Create a Neural network using Python (you can use NumPy to implement this).
Code:
# Step 1: Import the required libraries
import string  # needed by clean_description below
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import cv2
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.layers import Flatten, Dense, LSTM, Dropout, Embedding, Activation
from keras.layers import concatenate, BatchNormalization, Input
from keras.layers.merge import add
from keras.utils import to_categorical, plot_model
from keras.applications.inception_v3 import InceptionV3, preprocess_input
import matplotlib.pyplot as plt
import glob

# Step 2: Load the descriptions
def load_description(text):
    mapping = dict()
    for line in text.split("\n"):
        token = line.split("\t")
        if len(line) < 2:  # skip short/empty lines
            continue
        img_id = token[0].split('.')[0]  # name of the image
        img_des = token[1]               # description of the image
        if img_id not in mapping:
            mapping[img_id] = list()
        mapping[img_id].append(img_des)
    return mapping

token_path = '/kaggle/input/flickr8k/flickr_data/Flickr_Data/Flickr_TextData/Flickr8k.token.txt'
text = open(token_path, 'r', encoding='utf-8').read()
descriptions = load_description(text)
print(descriptions['1000268201_693b08cb0e'])

# Step 3: Cleaning the text
def clean_description(desc):
    for key, des_list in desc.items():
        for i in range(len(des_list)):
            caption = des_list[i]
            caption = [ch for ch in caption if ch not in string.punctuation]
            caption = ''.join(caption)
            caption = caption.split(' ')
            caption = [word.lower() for word in caption if len(word) > 1 and word.isalpha()]
            caption = ' '.join(caption)
            des_list[i] = caption

clean_description(descriptions)
print(descriptions['1000268201_693b08cb0e'])

# Step 4: Generate the vocabulary
def to_vocab(desc):
    words = set()
    for key in desc.keys():
        for line in desc[key]:
            words.update(line.split())
    return words

vocab = to_vocab(descriptions)

# Step 5: Load the images
images = '/kaggle/input/flickr8k/flickr_data/Flickr_Data/Images/'
img = glob.glob(images + '*.jpg')
train_path = '/kaggle/input/flickr8k/flickr_data/Flickr_Data/Flickr_TextData/Flickr_8k.trainImages.txt'
train_images = open(train_path, 'r', encoding='utf-8').read().split("\n")
train_img = []  # list of all images in the training set
for im in img:
    if im[len(images):] in train_images:
        train_img.append(im)

# Step 6: Extract the feature vector from all images
from keras.preprocessing.image import load_img, img_to_array

def preprocess_img(img_path):
    img = load_img(img_path, target_size=(299, 299))
    x = img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return x

def encode(image):
    image = preprocess_img(image)
    vec = model.predict(image)
    vec = np.reshape(vec, (vec.shape[1]))
    return vec

base_model = InceptionV3(weights='imagenet')
model = Model(base_model.input, base_model.layers[-2].output)

encoding_train = {}
for img in train_img:
    encoding_train[img[len(images):]] = encode(img)

# Step 7: Tokenizing the vocabulary
# Assumed step (not shown in the original listing): keep only descriptions of training images
train_descriptions = {k: v for k, v in descriptions.items() if k + '.jpg' in train_images}

all_train_captions = []
for key, val in train_descriptions.items():
    for caption in val:
        all_train_captions.append(caption)

vocabulary = vocab
threshold = 10
word_counts = {}
for cap in all_train_captions:
    for word in cap.split(' '):
        word_counts[word] = word_counts.get(word, 0) + 1

vocab = [word for word in word_counts if word_counts[word] >= threshold]

ixtoword = {}
wordtoix = {}
ix = 1
for word in vocab:
    wordtoix[word] = ix
    ixtoword[ix] = word
    ix += 1

vocab_size = len(ixtoword) + 1  # one extra slot for the padding token

max_length = max(len(des.split()) for des in all_train_captions)
max_length

# Step 8: GloVe vector embeddings
train_features = encoding_train  # image-name -> feature-vector mapping built in Step 6

X1, X2, y = list(), list(), list()
for key, des_list in train_descriptions.items():
    pic = train_features[key + '.jpg']
    for cap in des_list:
        seq = [wordtoix[word] for word in cap.split(' ') if word in wordtoix]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            X1.append(pic)
            X2.append(in_seq)
            y.append(out_seq)

X2 = np.array(X2)
X1 = np.array(X1)
y = np.array(y)

embeddings_index = {}
glove_path = '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.200d.txt'
glove = open(glove_path, 'r', encoding='utf-8').read()
for line in glove.split("\n"):
    values = line.split(" ")
    word = values[0]
    indices = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = indices

emb_dim = 200

emb_matrix = np.zeros((vocab_size, emb_dim))
for word, i in wordtoix.items():
    emb_vec = embeddings_index.get(word)
    if emb_vec is not None:
        emb_matrix[i] = emb_vec

emb_matrix.shape

# Step 9: Define the model
ip1 = Input(shape=(2048,))
fe1 = Dropout(0.2)(ip1)
fe2 = Dense(256, activation='relu')(fe1)

ip2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, emb_dim, mask_zero=True)(ip2)
se2 = Dropout(0.2)(se1)
se3 = LSTM(256)(se2)

decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[ip1, ip2], outputs=outputs)

# Step 10: Training the model


model.layers[2].set_weights([emb_matrix])
model.layers[2].trainable = False
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit([X1, X2], y, epochs=50, batch_size=256)

# Step 11: Predicting the output
def greedy_search(pic):
    start = 'startseq'
    for i in range(max_length):
        seq = [wordtoix[word] for word in start.split() if word in wordtoix]
        seq = pad_sequences([seq], maxlen=max_length)
        yhat = model.predict([pic, seq])
        yhat = np.argmax(yhat)
        word = ixtoword[yhat]
        start += ' ' + word
        if word == 'endseq':
            break
    final = start.split()
    final = final[1:-1]
    final = ' '.join(final)
    return final
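
A quick way to try the trained model is to caption one of the encoded training images. The sketch below is illustrative only: it assumes encoding_train from Step 6 and the trained model are still in memory, and example_key/example_pic are hypothetical names introduced here.

# Hypothetical usage sketch: caption the first encoded training image
example_key = list(encoding_train.keys())[0]
example_pic = encoding_train[example_key].reshape((1, 2048))
print(example_key, '->', greedy_search(example_pic))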

Output:

6. Implement Word Embedding using Word2Vec.
Code:
import re

def clean_text(string,
               punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
               stop_words=['the', 'a', 'and', 'is', 'be', 'will']):
    # Cleaning the urls
    string = re.sub(r'https?://\S+|www\.\S+', '', string)
    # Cleaning the html elements
    string = re.sub(r'<.*?>', '', string)
    # Removing the punctuations
    for x in string.lower():
        if x in punctuations:
            string = string.replace(x, "")
    # Converting the text to lower case
    string = string.lower()
    # Removing stop words
    string = ' '.join([word for word in string.split() if word not in stop_words])
    # Cleaning the whitespaces
    string = re.sub(r'\s+', ' ', string).strip()
    return string
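
The steps below expect a list of raw strings called texts, as well as NumPy, tqdm and Keras. A minimal setup along the following lines (the sample sentences are placeholders, not part of the original) makes the rest of the pipeline runnable.

import numpy as np
from tqdm import tqdm
from keras.models import Model
from keras.layers import Input, Dense

# Placeholder corpus; replace with your own list of strings
texts = [
    "the future king is the prince",
    "daughter is the princess",
    "son is the prince",
    "the prince is a strong man",
    "the princess is a beautiful woman",
]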

# Step 2: The full pipeline to create the (X, Y) word pairs given a list of strings texts:
window = 2
word_lists = []
all_text = []

for text in texts:
    # Cleaning the text
    text = clean_text(text)
    # Tokenizing the text (splitting on whitespace)
    text = text.split()
    # Appending to the all text list
    all_text += text
    # Creating a context dictionary
    for i, word in enumerate(text):
        for w in range(window):
            # Getting the context that is ahead by *window* words
            if i + 1 + w < len(text):
                word_lists.append([word] + [text[(i + 1 + w)]])
            # Getting the context that is behind by *window* words
            if i - w - 1 >= 0:
                word_lists.append([word] + [text[(i - w - 1)]])

# Step 3: Creation of a unique word dictionary
def create_unique_word_dict(text):
    """
    A method that creates a dictionary where the keys are unique words
    and key values are indices
    """
    words = list(set(text))
    words.sort()
    unique_word_dict = {}
    for i, word in enumerate(words):
        unique_word_dict.update({word: i})
    return unique_word_dict

# Step 4: Creating the X and Y matrices
unique_word_dict = create_unique_word_dict(all_text)
n_words = len(unique_word_dict)
words = list(unique_word_dict.keys())
X = []
Y = []

for i, word_list in tqdm(enumerate(word_lists)):
    main_word_index = unique_word_dict.get(word_list[0])
    context_word_index = unique_word_dict.get(word_list[1])
    X_row = np.zeros(n_words)
    Y_row = np.zeros(n_words)
    X_row[main_word_index] = 1
    Y_row[context_word_index] = 1
    X.append(X_row)
    Y.append(Y_row)

X = np.asarray(X)
Y = np.asarray(Y)

# Step 5: Training and obtaining weights


embed_size = 2
inp = Input(shape=(X.shape[1],))
x = Dense(units=embed_size, activation='linear')(inp)
x = Dense(units=Y.shape[1], activation='softmax')(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(x=X, y=Y, batch_size=256, epochs=1000)

# Obtaining the weights from the neural network.


weights = model.get_weights()[0]
embedding_dict = {}
for word in words:
embedding_dict.update({word: weights[unique_word_dict.get(word)]})

# Step 6: Obtain the weights and plot the results
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
for word in list(unique_word_dict.keys()):
    coord = embedding_dict.get(word)
    plt.scatter(coord[0], coord[1])
    plt.annotate(word, (coord[0], coord[1]))

plt.show()

Output:

7. Collocations are two or more words that tend to appear frequently together, for
example – United States. Implement this using Python.
Code:
#Loading libraries
from nltk.corpus import webtext
# use to find bigrams, which are pairs of words
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# 2: Let's find the collocations

# Loading the data
file_path = 'C:/Users/harsh/OneDrive/Desktop/grail.txt'
words = [w.lower() for w in webtext.words(file_path)]
bigram_collocation = BigramCollocationFinder.from_words(words)
bigram_collocation.nbest(BigramAssocMeasures.likelihood_ratio, 15)

# 3: Filter out stopwords and short words
from nltk.corpus import stopwords

stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
bigram_collocation.apply_word_filter(filter_stops)
bigram_collocation.nbest(BigramAssocMeasures.likelihood_ratio, 15)

from nltk.corpus import webtext, stopwords
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

# Define the file path
file_path = 'C:/Users/harsh/OneDrive/Desktop/grail.txt'
words = [w.lower() for w in webtext.words(file_path)]
trigram_collocation = TrigramCollocationFinder.from_words(words)
trigram_collocation.apply_word_filter(filter_stops)
trigram_collocation.apply_freq_filter(3)
trigram_collocation.nbest(TrigramAssocMeasures.likelihood_ratio, 15)

Output:

8. WordNet is a lexical database (i.e. a dictionary) for the English language, specifically designed for natural language processing. Synset is a simple interface in NLTK for looking up words in WordNet. Synset instances are groupings of synonymous words that express the same concept. Show the working of these using Python.
Code:
# 1: Understanding Synset
from nltk.corpus import wordnet

syn = wordnet.synsets("hello")[0]
print("Synset name:", syn.name())
# Defining the word
print("\nSynset meaning:", syn.definition())
# List of phrases that use the word in context
print("\nSynset example:", syn.examples())

# 2: Understanding Hypernyms and Hyponyms
syn = wordnet.synsets('hello')[0]
print("Synset name:", syn.name())
print("\nSynset abstract term:", syn.hypernyms())
print("\nSynset specific term:", syn.hypernyms()[0].hyponyms())
print("\nSynset root hypernym:", syn.root_hypernyms())

# 3: Part of Speech (POS) in Synset
syn = wordnet.synsets('hello')[0]
print("Syn tag:", syn.pos())
syn = wordnet.synsets('doing')[0]
print("Syn tag:", syn.pos())
syn = wordnet.synsets('beautiful')[0]
print("Syn tag:", syn.pos())
syn = wordnet.synsets('quickly')[0]
print("Syn tag:", syn.pos())
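
Since synsets group synonymous words, it also helps to print the lemmas of a synset; this short addition (not in the original listing) shows the synonym grouping directly.

# Synonyms grouped under a synset, via its lemmas
syn = wordnet.synsets('hello')[0]
print("Lemmas:", [lemma.name() for lemma in syn.lemmas()])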

Output:

9. Implement a Naïve Bayes classifier using Python.
Code:
# Importing libraries
import math
import random
import csv

# Encode class names to numeric values (e.g., yes and no encoded to 1 and 0)
def encode_class(mydata):
    classes = []
    for i in range(len(mydata)):
        if mydata[i][-1] not in classes:
            classes.append(mydata[i][-1])
    for i in range(len(classes)):
        for j in range(len(mydata)):
            if mydata[j][-1] == classes[i]:
                mydata[j][-1] = i
    return mydata

# Splitting the data
def splitting(mydata, ratio):
    train_num = int(len(mydata) * ratio)
    train = []
    test = list(mydata)  # initially the test set holds the whole dataset
    while len(train) < train_num:
        index = random.randrange(len(test))  # index chosen randomly from 0 to len(test)
        train.append(test.pop(index))
    return train, test

# Group the data rows under each class (e.g., yes or no) in a dictionary (dict[yes] and dict[no])
def groupUnderClass(mydata):
    dict = {}
    for i in range(len(mydata)):
        if mydata[i][-1] not in dict:
            dict[mydata[i][-1]] = []
        dict[mydata[i][-1]].append(mydata[i])
    return dict

# Calculating Mean
def mean(numbers):
    return sum(numbers) / float(len(numbers))

# Calculating Standard Deviation
def std_dev(numbers):
    if len(numbers) < 2:
        return 0
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)

# Calculate Mean and Standard Deviation for each attribute
def MeanAndStdDev(mydata):
    info = [(mean(attribute), std_dev(attribute)) for attribute in zip(*mydata)]
    del info[-1]  # drop the summary of the class column
    return info

# Find Mean and Standard Deviation under each class
def MeanAndStdDevForClass(mydata):
    info = {}
    dict = groupUnderClass(mydata)
    for classValue, instances in dict.items():
        info[classValue] = MeanAndStdDev(instances)
    return info

# Calculate Gaussian Probability Density Function
def calculateGaussianProbability(x, mean, stdev):
    expo = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * expo

# Calculate Class Probabilities
def calculateClassProbabilities(info, test):
    probabilities = {}
    for classValue, classSummaries in info.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, std_dev = classSummaries[i]
            x = test[i]
            probabilities[classValue] *= calculateGaussianProbability(x, mean, std_dev)
    return probabilities

# Make prediction - the class with the highest probability is the prediction
def predict(info, test):
    probabilities = calculateClassProbabilities(info, test)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

# Returns predictions for a set of examples
def getPredictions(info, test):
    predictions = []
    for i in range(len(test)):
        result = predict(info, test[i])
        predictions.append(result)
    return predictions

# Accuracy score
def accuracy_rate(test, predictions):
    correct = 0
    for i in range(len(test)):
        if test[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(test))) * 100.0

# Driver code
filename = 'nb.csv'
# Load the file and store it in the mydata list
mydata = csv.reader(open(filename, "rt"))
mydata = list(mydata)
mydata = encode_class(mydata)
for i in range(len(mydata)):
    mydata[i] = [float(x) for x in mydata[i]]

# Split ratio = 0.7 (70% of the data is used for training and 30% for testing)
ratio = 0.7
train_data, test_data = splitting(mydata, ratio)
print('Total number of examples are: ', len(mydata))
print('Out of these, training examples are: ', len(train_data))

print("Test examples are: ", len(test_data))

# Prepare model
info = MeanAndStdDevForClass(train_data)

# Test model
predictions = getPredictions(info, test_data)
accuracy = accuracy_rate(test_data, predictions)
print("Accuracy of your model is: ", accuracy)

Output:

10. Twitter sentiment analysis using Python: fetch tweets from Twitter using Python and classify their sentiment.
Code:
import re
import tweepy
from tweepy import OAuthHandler
from textblob import TextBlob

class TwitterClient(object):
    '''Generic Twitter class for sentiment analysis.'''

    def __init__(self):
        '''
        Class constructor or initialization method.
        '''
        # keys and tokens from the Twitter Dev Console
        consumer_key = '1697506000599678976-S1MLs2RXogBYypr3hlXrOPpeRelRnx'
        consumer_secret = 'rlQ2ZyrsiIrF1cwEnKZshNQAdCfZX5gIiKsDJajppgNfb'
        access_token = '1697506000599678976-S1MLs2RXogBYypr3hlXrOPpeRelRnx'
        access_token_secret = 'rlQ2ZyrsiIrF1cwEnKZshNQAdCfZX5gIiKsDJajppgNfb'

        # attempt authentication
        try:
            # create OAuthHandler object
            self.auth = OAuthHandler(consumer_key, consumer_secret)
            # set access token and secret
            self.auth.set_access_token(access_token, access_token_secret)
            # create tweepy API object to fetch tweets
            self.api = tweepy.API(self.auth)
        except:
            print("Error: Authentication Failed")

    def clean_tweet(self, tweet):
        '''
        Utility function to clean tweet text by removing links and special
        characters using simple regex statements.
        '''
        return ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

    def get_tweet_sentiment(self, tweet):
        '''
        Utility function to classify the sentiment of a passed tweet
        using textblob's sentiment method.
        '''
        # create TextBlob object of passed tweet text
        analysis = TextBlob(self.clean_tweet(tweet))
        # set sentiment
        if analysis.sentiment.polarity > 0:
            return 'positive'
        elif analysis.sentiment.polarity == 0:
            return 'neutral'
        else:
            return 'negative'

    def get_tweets(self, query, count=10):
        '''
        Main function to fetch tweets and parse them.
        '''
        # empty list to store parsed tweets
        tweets = []
        try:
            # call twitter api to fetch tweets
            fetched_tweets = self.api.search(q=query, count=count)
            # parsing tweets one by one
            for tweet in fetched_tweets:
                # empty dictionary to store required params of a tweet
                parsed_tweet = {}
                # saving text of tweet
                parsed_tweet['text'] = tweet.text
                # saving sentiment of tweet
                parsed_tweet['sentiment'] = self.get_tweet_sentiment(tweet.text)
                # appending parsed tweet to tweets list
                if tweet.retweet_count > 0:
                    # if tweet has retweets, ensure that it is appended only once
                    if parsed_tweet not in tweets:
                        tweets.append(parsed_tweet)
                else:
                    tweets.append(parsed_tweet)
            # return parsed tweets
            return tweets
        except tweepy.TweepError as e:
            # print error (if any)
            print("Error: " + str(e))

def main():
    # creating object of TwitterClient Class
    api = TwitterClient()
    # calling function to get tweets
    tweets = api.get_tweets(query='Donald Trump', count=200)

    # picking positive tweets from tweets
    ptweets = [tweet for tweet in tweets if tweet['sentiment'] == 'positive']
    # percentage of positive tweets
    print("Positive tweets percentage: {} %".format(100 * len(ptweets) / len(tweets)))

    # picking negative tweets from tweets
    ntweets = [tweet for tweet in tweets if tweet['sentiment'] == 'negative']
    # percentage of negative tweets
    print("Negative tweets percentage: {} %".format(100 * len(ntweets) / len(tweets)))
    # percentage of neutral tweets
    print("Neutral tweets percentage: {} %".format(
        100 * (len(tweets) - (len(ntweets) + len(ptweets))) / len(tweets)))

    # printing first 10 positive tweets
    print("\n\nPositive tweets:")
    for tweet in ptweets[:10]:
        print(tweet['text'])

    # printing first 10 negative tweets
    print("\n\nNegative tweets:")
    for tweet in ntweets[:10]:
        print(tweet['text'])

if __name__ == "__main__":
    # calling main function
    main()

Output:

11. Build a predictive model to forecast sales based on historical data?

Code:

# Import necessary libraries

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt

# Generate sample data (you would replace this with your actual dataset)

np.random.seed(42)

data = {'Month': pd.date_range(start='2022-01-01', periods=12, freq='M'),

'Sales': np.random.randint(50, 200, size=12)}

df = pd.DataFrame(data)

# Feature engineering (you might have more features in a real dataset)

df['Month_Num'] = df['Month'].dt.month

df['Month_Num^2'] = df['Month_Num'] ** 2

# Split the data into training and testing sets

X = df[['Month_Num', 'Month_Num^2']]

y = df['Sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the linear regression model

model = LinearRegression()

model.fit(X_train, y_train)

# Make predictions on the test set

y_pred = model.predict(X_test)

# Evaluate the model

mse = mean_squared_error(y_test, y_pred)

r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')

print(f'R-squared: {r2}')

# Visualize the predictions

plt.scatter(X_test['Month_Num'], y_test, color='black', label='Actual')

plt.plot(X_test['Month_Num'], y_pred, color='blue', linewidth=3, label='Predicted')

plt.title('Sales Prediction')

plt.xlabel('Month')

plt.ylabel('Sales')

plt.legend()

plt.show()
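
To actually forecast a future month, which is the point of the exercise, the fitted model can be fed the engineered features for an unseen month. A minimal sketch, assuming the model trained above is in memory (month 13 is a hypothetical next period):

# Forecast sales for a hypothetical 13th month using the same feature engineering
next_month = pd.DataFrame({'Month_Num': [13], 'Month_Num^2': [13 ** 2]})
forecast = model.predict(next_month)
print(f'Forecasted sales for month 13: {forecast[0]:.2f}')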

Output:

12. Predict the outcome of a binary classification problem using a
machine learning algorithm?

Code:

# Import necessary libraries

from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the Iris dataset

iris = load_iris()

X = iris.data

y = (iris.target == 0).astype(int) # 1 if the species is setosa, 0 otherwise

# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the logistic regression model

model = LogisticRegression()

model.fit(X_train, y_train)

# Make predictions on the test set

y_pred = model.predict(X_test)

# Evaluate the model

accuracy = accuracy_score(y_test, y_pred)

conf_matrix = confusion_matrix(y_test, y_pred)

classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')

print(f'Confusion Matrix:\n{conf_matrix}')

print(f'Classification Report:\n{classification_rep}')
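
To see the model predict the outcome for one new observation, a single feature vector can be passed to predict. The measurements below are made-up values in the same format as the Iris features.

# Predict the class of a single hypothetical flower (sepal/petal measurements in cm)
sample = [[5.1, 3.5, 1.4, 0.2]]
print('Predicted class (1 = setosa, 0 = other):', model.predict(sample)[0])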

13. Perform sentiment analysis on a set of text data to understand the overall sentiment.

Code:

# pip install nltk  (run this in your shell or notebook first)

import nltk

from nltk.sentiment import SentimentIntensityAnalyzer

# Download the VADER lexicon for sentiment analysis

nltk.download('vader_lexicon')

# Sample text data

text_data = [
    "I love this product! It's amazing.",
    "The customer service was terrible.",
    "Neutral comment without much emotion.",
    "This is not bad, but could be better."
]

# Initialize the VADER sentiment intensity analyzer

sid = SentimentIntensityAnalyzer()

# Analyze sentiment for each text

for text in text_data:

    sentiment_scores = sid.polarity_scores(text)

    print(f"Text: {text}")

    print(f"Sentiment Scores: {sentiment_scores}")

    # Determine the overall sentiment

    if sentiment_scores['compound'] >= 0.05:

        sentiment = 'Positive'

    elif sentiment_scores['compound'] <= -0.05:

        sentiment = 'Negative'

    else:

        sentiment = 'Neutral'

    print(f"Overall Sentiment: {sentiment}\n")
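
The exercise asks about the overall sentiment of the whole set, so it can also help to average the compound scores across all texts; this short addition is not in the original listing.

# Average compound score across the whole set as a rough overall sentiment
avg_compound = sum(sid.polarity_scores(t)['compound'] for t in text_data) / len(text_data)
print(f"Average compound score for the set: {avg_compound:.3f}")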

14. Identify distinct customer segments based on their behaviour or
characteristics?

Code:

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

# Generate sample data (replace this with your actual customer data)

np.random.seed(42)

data = pd.DataFrame({

'Feature1': np.random.randint(1, 100, 100),

'Feature2': np.random.randint(1, 100, 100),

'Feature3': np.random.randint(1, 100, 100)

})

# Standardize the data

scaler = StandardScaler()

scaled_data = scaler.fit_transform(data)

# Apply k-means clustering

kmeans = KMeans(n_clusters=3, random_state=42)

data['Cluster'] = kmeans.fit_predict(scaled_data)

# Reduce dimensions for visualization

pca = PCA(n_components=2)

reduced_data = pca.fit_transform(scaled_data)

# Visualize the clusters

plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=data['Cluster'], cmap='viridis')

plt.title('Customer Segmentation')

plt.xlabel('Principal Component 1')

plt.ylabel('Principal Component 2')

plt.show()
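
To understand what distinguishes the segments, it is useful to inspect the average feature values and size of each cluster; a brief sketch, assuming the data DataFrame built above:

# Average feature values and size of each customer segment
print(data.groupby('Cluster').mean())
print(data['Cluster'].value_counts())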

15. Optimize hyperparameters of a model to achieve better performance.

Code:

# Import necessary libraries

import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

# Generate a hypothetical dataset (replace this with your actual dataset)

np.random.seed(42)

X = np.random.rand(100, 5) # Features

y = np.random.randint(0, 2, 100) # Binary labels

# Split the dataset into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the SVM model

svm_model = SVC()

# Define hyperparameters and their possible values for grid search

param_grid = {

'C': [0.1, 1, 10],

'kernel': ['linear', 'rbf', 'poly'],

'gamma': ['scale', 'auto'],
}

# Perform grid search with 5-fold cross-validation

grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search

best_params = grid_search.best_params_

# Use the best model to make predictions on the test set

y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate the model performance

accuracy = accuracy_score(y_test, y_pred)

print(f'Best Hyperparameters: {best_params}')

print(f'Test Set Accuracy: {accuracy}')
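
Besides the test-set accuracy, GridSearchCV also stores the mean cross-validated score of the best parameter combination, which is worth printing when judging the tuning.

# Mean cross-validated accuracy achieved by the best hyperparameter combination
print(f'Best Cross-Validation Accuracy: {grid_search.best_score_}')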

Output:

16. Create informative and visually appealing plots to represent patterns and
relationships in the data.

Code:

import numpy as np

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

# Generate a sample dataset

np.random.seed(42)

data_size = 100

feature1 = np.random.rand(data_size)

feature2 = 2 * feature1 + np.random.normal(scale=0.2, size=data_size)

category = np.random.choice(['A', 'B'], size=data_size)

df = pd.DataFrame({'Feature1': feature1, 'Feature2': feature2, 'Category': category})

# Set the style for the plots

sns.set(style="whitegrid")

# Pairplot to visualize relationships between numerical features

sns.pairplot(df, hue='Category', palette='Set1')

plt.suptitle('Pairplot of Numerical Features', y=1.02)

plt.show()

# Scatter plot with regression line

plt.figure(figsize=(8, 6))

sns.regplot(x='Feature1', y='Feature2', data=df, scatter_kws={'s': 50, 'alpha': 0.7}, line_kws={'color': 'red'})

plt.title('Scatter Plot with Regression Line')

plt.show()

# Box plot to visualize distributions

plt.figure(figsize=(8, 6))

sns.boxplot(x='Category', y='Feature1', data=df, palette='Set2')

plt.title('Box Plot of Feature1 by Category')

plt.show()

# Violin plot to show the distribution of a numerical variable across different categories

plt.figure(figsize=(10, 8))

sns.violinplot(x='Category', y='Feature2', data=df, inner='quartile', palette='Pastel1')

plt.title('Violin Plot of Feature2 by Category')

plt.show()

Output:

17. Explore techniques to handle missing data and outliers in a dataset.

Code:

import pandas as pd

import numpy as np

import seaborn as sns

from scipy import stats

from scipy.stats.mstats import winsorize

import matplotlib.pyplot as plt

# Generate a sample dataset with missing data and outliers

np.random.seed(42)

data = {

'Feature1': np.random.randint(1, 100, 50),

'Feature2': np.concatenate([np.random.normal(50, 10, 25), [np.nan] * 25]),

'Feature3': np.concatenate([np.random.normal(100, 20, 25), np.random.normal(20, 5, 25)])
}

df = pd.DataFrame(data)

# Handling Missing Data

df.fillna(df.mean(), inplace=True) # Impute missing values with mean

# Handling Outliers

# Using Winsorizing to clip extreme values

df['Feature2'] = winsorize(df['Feature2'], limits=[0.05, 0.05])

# Using Z-scores to identify and remove outliers

z_scores = np.abs(stats.zscore(df['Feature3']))

df_no_outliers = df[(z_scores < 3)]

# Visualizing the original and cleaned data

plt.figure(figsize=(12, 6))

plt.subplot(2, 2, 1)

sns.boxplot(x=df['Feature2'])

plt.title('Original Data - Feature2')

plt.subplot(2, 2, 2)

sns.boxplot(x=df_no_outliers['Feature2'])

plt.title('Data without Outliers - Feature2')

plt.subplot(2, 2, 3)

sns.histplot(df['Feature3'], kde=True)

plt.title('Original Data - Feature3')

plt.subplot(2, 2, 4)

sns.histplot(df_no_outliers['Feature3'], kde=True)

plt.title('Data without Outliers - Feature3')

plt.tight_layout()

plt.show()

Output:

18. Extract named entities from a piece of text using NLTK and spaCy.

Code:

import nltk

from nltk import word_tokenize, pos_tag, ne_chunk

import spacy

nltk.download('punkt')

nltk.download('maxent_ne_chunker')

nltk.download('words')

nltk.download('averaged_perceptron_tagger')  # needed by pos_tag

# Define a function to extract entities using NLTK

def extract_entities_nltk(text):

    words = word_tokenize(text)

    tagged_words = pos_tag(words)

    named_entities = ne_chunk(tagged_words)

    entities = []

    for entity in named_entities:

        if isinstance(entity, nltk.Tree):

            entities.append(" ".join([word for word, tag in entity.leaves()]))

    return entities

# Define a function to extract entities using spaCy

def extract_entities_spacy(text):

    nlp = spacy.load("en_core_web_sm")

    doc = nlp(text)

    entities = [ent.text for ent in doc.ents]

    return entities

# Sample text

text = "Apple Inc. is planning to open a new store in New York City."

# Extract entities using NLTK

entities_nltk = extract_entities_nltk(text)

print("Named Entities (NLTK):", entities_nltk)

# Extract entities using spaCy

entities_spacy = extract_entities_spacy(text)

print("Named Entities (spaCy):", entities_spacy)

Output:

19. Build a forecasting model for a time series dataset, and evaluate its
accuracy.

Code:

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from statsmodels.tsa.arima.model import ARIMA

from sklearn.metrics import mean_squared_error

from math import sqrt

# Generate a hypothetical time series dataset

np.random.seed(42)

date_rng = pd.date_range(start='2022-01-01', end='2022-12-31', freq='D')

time_series_data = np.random.randn(len(date_rng)) + np.arange(len(date_rng)) * 0.2

# Create a DataFrame

df = pd.DataFrame(index=date_rng, data={'Value': time_series_data})

# Split the data into training and test sets

train_size = int(len(df) * 0.8)

train, test = df.iloc[:train_size], df.iloc[train_size:]

# Plot the training and test sets

plt.figure(figsize=(12, 6))

plt.plot(train.index, train['Value'], label='Training Set')

plt.plot(test.index, test['Value'], label='Test Set')

plt.title('Hypothetical Time Series Data - Training and Test Sets')

plt.xlabel('Date')

plt.ylabel('Value')

plt.legend()

plt.show()

# Train the ARIMA model on the training set

order = (1, 1, 1) # ARIMA(p, d, q) order

model = ARIMA(train['Value'], order=order)

fit_model = model.fit()

# Make predictions on the test set

predictions = fit_model.forecast(steps=len(test))

# Evaluate the accuracy of the model using Root Mean Squared Error (RMSE)

rmse = sqrt(mean_squared_error(test['Value'], predictions))

# Plot the predicted values against the actual values

plt.figure(figsize=(12, 6))

plt.plot(test.index, test['Value'], label='Actual Values')

plt.plot(test.index, predictions, label='Predicted Values', linestyle='dashed')

plt.title(f'ARIMA Forecasting - RMSE: {rmse:.4f}')

plt.xlabel('Date')

plt.ylabel('Value')

plt.legend()

plt.show()

print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')

Output:

20. Design and implement a time series forecasting model to predict future
temperatures. Evaluate the model's performance and visualize the predictions
against the actual temperatures.

Code:

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from statsmodels.tsa.arima.model import ARIMA

from sklearn.metrics import mean_squared_error

from math import sqrt

# Generate a hypothetical time series dataset of daily temperatures

np.random.seed(42)

date_rng = pd.date_range(start='2010-01-01', end='2022-12-31', freq='D')

temperature_data = (25 + 0.5 * np.sin(np.arange(len(date_rng)) * (2 * np.pi / 365))
                    + np.random.normal(scale=2, size=len(date_rng)))

# Create a DataFrame

temperature_df = pd.DataFrame(index=date_rng, data={'Temperature': temperature_data})

# Split the data into training and test sets

train_size = int(len(temperature_df) * 0.8)

train, test = temperature_df.iloc[:train_size], temperature_df.iloc[train_size:]

# Plot the training and test sets

plt.figure(figsize=(12, 6))

plt.plot(train.index, train['Temperature'], label='Training Set')

plt.plot(test.index, test['Temperature'], label='Test Set')

plt.title('Hypothetical Daily Temperature Time Series Data - Training and Test Sets')

plt.xlabel('Date')

plt.ylabel('Temperature')

plt.legend()

plt.show()

# Train the ARIMA model on the training set

order = (1, 1, 1) # ARIMA(p, d, q) order

model = ARIMA(train['Temperature'], order=order)

fit_model = model.fit()

# Make predictions on the test set

predictions = fit_model.forecast(steps=len(test))

# Evaluate the accuracy of the model using Root Mean Squared Error (RMSE)

rmse = sqrt(mean_squared_error(test['Temperature'], predictions))

# Plot the predicted temperatures against the actual temperatures

plt.figure(figsize=(12, 6))

plt.plot(test.index, test['Temperature'], label='Actual Temperatures')

plt.plot(test.index, predictions, label='Predicted Temperatures', linestyle='dashed')

plt.title(f'ARIMA Forecasting for Daily Temperatures - RMSE: {rmse:.4f}')

plt.xlabel('Date')

plt.ylabel('Temperature')

plt.legend()

plt.show()

print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')

Output:

