Ds File


1. Write Python code to flatten and evaluate a deep tree in NLP.

Code:
# Function for flattening a deep tree
from nltk.tree import Tree

def flatten_childtrees(trees):
    children = []
    for t in trees:
        if t.height() < 3:
            children.extend(t.pos())
        elif t.height() == 3:
            children.append(Tree(t.label(), t.pos()))
        else:
            children.extend(flatten_childtrees(t))
    return children

def flatten_deeptree(tree):
    return Tree(tree.label(), flatten_childtrees(tree))

# Evaluating flatten_deeptree()
from nltk.corpus import treebank

deep_tree = treebank.parsed_sents()[0]
flattened_tree = flatten_deeptree(deep_tree)
print("Deep Tree:\n", deep_tree)
print("\nFlattened Tree:\n", flattened_tree)

# Height of small example trees
from nltk.tree import Tree

print("Height:", Tree('NNP', ['Pierre']).height())

print("\nHeight:", Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]).height())
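
As a quick sanity check (assuming deep_tree and flattened_tree from the evaluation above are still in scope), comparing the two heights shows how much the transformation flattens the tree:

print("Height of deep tree:", deep_tree.height())
print("Height of flattened tree:", flattened_tree.height())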

Output:

2. Create a shallow tree in NLP and print its height.
Code:
# Understanding shallow_tree()
from nltk.tree import Tree

def shallow_tree(tree):
    children = []
    for t in tree:
        if t.height() < 3:
            children.extend(t.pos())
        else:
            children.append(Tree(t.label(), t.pos()))
    return Tree(tree.label(), children)

# Evaluating
from nltk.corpus import treebank

print("Deep Tree:\n", treebank.parsed_sents()[0])

print("\nShallow Tree:\n", shallow_tree(treebank.parsed_sents()[0]))

print("Height of tree:", treebank.parsed_sents()[0].height())

print("\nHeight of shallow tree:", shallow_tree(treebank.parsed_sents()[0]).height())

Output:

3. Download the wine quality data set from the UCI Machine Learning Repository, which is available for free. Print the first five rows of the red and white wine data, check for NULL values in the red wine data, create a histogram showing the distribution of alcohol, and finally split the data for training and validation.
Code:
import pandas as pd

# Read red wine data


ds_red = pd.read_csv("winequality-red.csv", sep=';')

# Display five rows of red wine data


print("Red Wine Data:")

print(ds_red.head())

# Read white wine data


ds_wht = pd.read_csv("winequality-white.csv", sep=';')

# Display five rows of white wine data


print("\nWhite Wine Data:")

print(ds_wht.head())

# Check for NULL values in red wine data


print("\nNull Values in Red Wine Data:")
print(ds_red.isnull().sum())

# Check for NULL values in white wine data


print("\nNull Values in White Wine Data:")
print(ds_wht.isnull().sum())

import matplotlib.pyplot as plt

# Create a histogram for red wine's alcohol content


plt.hist(ds_red['alcohol'], bins=20, color="red", alpha=0.7, label='Red Wine')
plt.xlabel('Alcohol Content')
plt.ylabel('Frequency')
plt.title('Distribution of Alcohol Content in Red Wine')
plt.legend()
plt.show()

import matplotlib.pyplot as plt

# Create a histogram for White wine's alcohol content


plt.hist(ds_wht['alcohol'], bins=20, color="blue", alpha=0.7, label='White Wine')
plt.xlabel('Alcohol Content')
plt.ylabel('Frequency')
plt.title('Distribution of Alcohol Content in White Wine')
plt.legend()
plt.show()

from sklearn.model_selection import train_test_split

# Assuming 'quality' is the target variable for red wine


X_red = ds_red.drop('quality', axis=1)
y_red = ds_red['quality']

# Split red wine data into training and validation sets


X_train_red, X_valid_red, y_train_red, y_valid_red = train_test_split(X_red, y_red, test_size=0.2, random_state=42)

# Assuming 'quality' is the target variable for white wine

X_wht = ds_wht.drop('quality', axis=1)
y_wht = ds_wht['quality']

# Split white wine data into training and validation sets


X_train_wht, X_valid_wht, y_train_wht, y_valid_wht = train_test_split(X_wht, y_wht, test_size=0.2, random_state=42)
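
A quick check of the resulting shapes confirms the 80/20 split; this is an added sanity check, not part of the original listing.

# Confirm the 80/20 split for both wine datasets
print("Red wine train/valid shapes:", X_train_red.shape, X_valid_red.shape)
print("White wine train/valid shapes:", X_train_wht.shape, X_valid_wht.shape)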

Output:

4. Avengers Endgame and deep learning: write Python code to implement image caption generation using Avengers: Endgame characters.
Code:
import torch
import matplotlib.pyplot as plt
import numpy as np
import argparse
import pickle
import os
from torchvision import transforms
from PIL import Image

# build_vocab and model are part of the pytorch-tutorial image-captioning
# example, which we pull from GitHub
from build_vocab import Vocabulary
from model import EncoderCNN, DecoderRNN

# Model paths (make sure the paths are correct)
ENCODER_PATH = 'content/encoder-5-3000.pkl'
DECODER_PATH = 'content/decoder-5-3000.pkl'
VOCAB_PATH = 'content/vocab.pkl'

# Constants dictated by the architecture we are using
EMBED_SIZE = 256
HIDDEN_SIZE = 512
NUM_LAYERS = 1

# Device configuration (fall back to CPU when no GPU is available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Function to load and resize the image
def load_image(image_path, transform=None):
    image = Image.open(image_path)
    image = image.resize([224, 224], Image.LANCZOS)
    if transform is not None:
        image = transform(image).unsqueeze(0)
    return image

def PretrainedResNet(image_path, encoder_path=ENCODER_PATH,
                     decoder_path=DECODER_PATH,
                     vocab_path=VOCAB_PATH,
                     embed_size=EMBED_SIZE,
                     hidden_size=HIDDEN_SIZE,
                     num_layers=NUM_LAYERS):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(embed_size).eval()
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))

    # Prepare an image
    image = load_image(image_path, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)

    # (1, max_seq_length) -> (max_seq_length)
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    # Strip the <start>/<end> tokens from the caption
    sentence = ' '.join(sampled_caption)[8:-5].title()

    # Return the generated caption together with the image
    image = Image.open(image_path)
    return sentence, image

# Test image: Tony
plt.figure(figsize=(24, 24))
predicted_label, image = PretrainedResNet(image_path='tony.jpg')
plt.imshow(image)
print(predicted_label)

# Test image: Thor
plt.figure(figsize=(24, 24))
predicted_label, image = PretrainedResNet(image_path='thor.jpg')
plt.imshow(image)
print(predicted_label)

Output:

5. Create a Neural network using Python (you can use NumPy to implement this).
Code:
# Step 1: Import the required libraries
import string  # needed by clean_description below
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import cv2
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.layers import Flatten, Dense, LSTM, Dropout, Embedding, Activation
from keras.layers import concatenate, BatchNormalization, Input
from keras.layers.merge import add
from keras.utils import to_categorical, plot_model
from keras.applications.inception_v3 import InceptionV3, preprocess_input
import matplotlib.pyplot as plt
import glob

# Step 2: Load the descriptions
def load_description(text):
    mapping = dict()
    for line in text.split("\n"):
        token = line.split("\t")
        if len(line) < 2:  # skip short/empty lines
            continue
        img_id = token[0].split('.')[0]  # name of the image
        img_des = token[1]               # description of the image
        if img_id not in mapping:
            mapping[img_id] = list()
        mapping[img_id].append(img_des)
    return mapping

token_path = '/kaggle/input/flickr8k/flickr_data/Flickr_Data/Flickr_TextData/Flickr8k.token.txt'
text = open(token_path, 'r', encoding='utf-8').read()
descriptions = load_description(text)
print(descriptions['1000268201_693b08cb0e'])

# Step 3: Cleaning the text
def clean_description(desc):
    for key, des_list in desc.items():
        for i in range(len(des_list)):
            caption = des_list[i]
            caption = [ch for ch in caption if ch not in string.punctuation]
            caption = ''.join(caption)
            caption = caption.split(' ')
            caption = [word.lower() for word in caption if len(word) > 1 and word.isalpha()]
            caption = ' '.join(caption)
            des_list[i] = caption

clean_description(descriptions)
print(descriptions['1000268201_693b08cb0e'])

# Step 4: Generate the vocabulary
def to_vocab(desc):
    words = set()
    for key in desc.keys():
        for line in desc[key]:
            words.update(line.split())
    return words

vocab = to_vocab(descriptions)

# Step 5: Load the images
images = '/kaggle/input/flickr8k/flickr_data/Flickr_Data/Images/'
img = glob.glob(images + '*.jpg')
train_path = '/kaggle/input/flickr8k/flickr_data/Flickr_Data/Flickr_TextData/Flickr_8k.trainImages.txt'
train_images = open(train_path, 'r', encoding='utf-8').read().split("\n")
train_img = []  # list of all images in the training set
for im in img:
    if im[len(images):] in train_images:
        train_img.append(im)

# Step 6: Extract the feature vector from all images
from keras.preprocessing.image import load_img, img_to_array

def preprocess_img(img_path):
    img = load_img(img_path, target_size=(299, 299))
    x = img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return x

def encode(image):
    image = preprocess_img(image)
    vec = model.predict(image)
    vec = np.reshape(vec, (vec.shape[1]))
    return vec

base_model = InceptionV3(weights='imagenet')
model = Model(base_model.input, base_model.layers[-2].output)

encoding_train = {}
for img in train_img:
    encoding_train[img[len(images):]] = encode(img)

# Step 7: Tokenizing the vocabulary
# Assumed step (not shown in the original listing): keep only descriptions of training images
train_descriptions = {k: v for k, v in descriptions.items() if k + '.jpg' in train_images}

all_train_captions = []
for key, val in train_descriptions.items():
    for caption in val:
        all_train_captions.append(caption)

vocabulary = vocab
threshold = 10
word_counts = {}
for cap in all_train_captions:
    for word in cap.split(' '):
        word_counts[word] = word_counts.get(word, 0) + 1

vocab = [word for word in word_counts if word_counts[word] >= threshold]

ixtoword = {}
wordtoix = {}
ix = 1
for word in vocab:
    wordtoix[word] = ix
    ixtoword[ix] = word
    ix += 1

vocab_size = len(ixtoword) + 1  # one extra slot for the padding token

max_length = max(len(des.split()) for des in all_train_captions)
max_length

# Step 8: GloVe vector embeddings
train_features = encoding_train  # image-name -> feature-vector mapping built in Step 6

X1, X2, y = list(), list(), list()
for key, des_list in train_descriptions.items():
    pic = train_features[key + '.jpg']
    for cap in des_list:
        seq = [wordtoix[word] for word in cap.split(' ') if word in wordtoix]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            X1.append(pic)
            X2.append(in_seq)
            y.append(out_seq)

X2 = np.array(X2)
X1 = np.array(X1)
y = np.array(y)

embeddings_index = {}
glove_path = '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.200d.txt'
glove = open(glove_path, 'r', encoding='utf-8').read()
for line in glove.split("\n"):
    values = line.split(" ")
    word = values[0]
    indices = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = indices

emb_dim = 200

emb_matrix = np.zeros((vocab_size, emb_dim))
for word, i in wordtoix.items():
    emb_vec = embeddings_index.get(word)
    if emb_vec is not None:
        emb_matrix[i] = emb_vec

emb_matrix.shape

# Step 9: Define the model
ip1 = Input(shape=(2048,))
fe1 = Dropout(0.2)(ip1)
fe2 = Dense(256, activation='relu')(fe1)

ip2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, emb_dim, mask_zero=True)(ip2)
se2 = Dropout(0.2)(se1)
se3 = LSTM(256)(se2)

decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[ip1, ip2], outputs=outputs)

# Step 10: Training the model


model.layers[2].set_weights([emb_matrix])
model.layers[2].trainable = False
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit([X1, X2], y, epochs=50, batch_size=256)

# Step 11: Predicting the output
def greedy_search(pic):
    start = 'startseq'
    for i in range(max_length):
        seq = [wordtoix[word] for word in start.split() if word in wordtoix]
        seq = pad_sequences([seq], maxlen=max_length)
        yhat = model.predict([pic, seq])
        yhat = np.argmax(yhat)
        word = ixtoword[yhat]
        start += ' ' + word
        if word == 'endseq':
            break
    final = start.split()
    final = final[1:-1]
    final = ' '.join(final)
    return final
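
A quick way to try the trained model is to caption one of the encoded training images. The sketch below is illustrative only: it assumes encoding_train from Step 6 and the trained model are still in memory, and example_key/example_pic are hypothetical names introduced here.

# Hypothetical usage sketch: caption the first encoded training image
example_key = list(encoding_train.keys())[0]
example_pic = encoding_train[example_key].reshape((1, 2048))
print(example_key, '->', greedy_search(example_pic))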

Output:

6. Implement Word Embedding using Word2Vec.
Code:
import re

def clean_text(string,
               punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
               stop_words=['the', 'a', 'and', 'is', 'be', 'will']):
    # Cleaning the urls
    string = re.sub(r'https?://\S+|www\.\S+', '', string)
    # Cleaning the html elements
    string = re.sub(r'<.*?>', '', string)
    # Removing the punctuations
    for x in string.lower():
        if x in punctuations:
            string = string.replace(x, "")
    # Converting the text to lower case
    string = string.lower()
    # Removing stop words
    string = ' '.join([word for word in string.split() if word not in stop_words])
    # Cleaning the whitespaces
    string = re.sub(r'\s+', ' ', string).strip()
    return string
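
The steps below expect a list of raw strings called texts, as well as NumPy, tqdm and Keras. A minimal setup along the following lines (the sample sentences are placeholders, not part of the original) makes the rest of the pipeline runnable.

import numpy as np
from tqdm import tqdm
from keras.models import Model
from keras.layers import Input, Dense

# Placeholder corpus; replace with your own list of strings
texts = [
    "the future king is the prince",
    "daughter is the princess",
    "son is the prince",
    "the prince is a strong man",
    "the princess is a beautiful woman",
]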

# Step 2: The full pipeline to create the (X, Y) word pairs given a list of strings texts:
window = 2
word_lists = []
all_text = []

for text in texts:
    # Cleaning the text
    text = clean_text(text)
    # Tokenizing the text (splitting on whitespace)
    text = text.split()
    # Appending to the all text list
    all_text += text
    # Creating a context dictionary
    for i, word in enumerate(text):
        for w in range(window):
            # Getting the context that is ahead by *window* words
            if i + 1 + w < len(text):
                word_lists.append([word] + [text[(i + 1 + w)]])
            # Getting the context that is behind by *window* words
            if i - w - 1 >= 0:
                word_lists.append([word] + [text[(i - w - 1)]])

# Step 3: Creation of a unique word dictionary
def create_unique_word_dict(text):
    """
    A method that creates a dictionary where the keys are unique words
    and key values are indices
    """
    words = list(set(text))
    words.sort()
    unique_word_dict = {}
    for i, word in enumerate(words):
        unique_word_dict.update({word: i})
    return unique_word_dict

# Step 4: Creating the X and Y matrices
unique_word_dict = create_unique_word_dict(all_text)
n_words = len(unique_word_dict)
words = list(unique_word_dict.keys())
X = []
Y = []

for i, word_list in tqdm(enumerate(word_lists)):
    main_word_index = unique_word_dict.get(word_list[0])
    context_word_index = unique_word_dict.get(word_list[1])
    X_row = np.zeros(n_words)
    Y_row = np.zeros(n_words)
    X_row[main_word_index] = 1
    Y_row[context_word_index] = 1
    X.append(X_row)
    Y.append(Y_row)

X = np.asarray(X)
Y = np.asarray(Y)

# Step 5: Training and obtaining weights


embed_size = 2
inp = Input(shape=(X.shape[1],))
x = Dense(units=embed_size, activation='linear')(inp)
x = Dense(units=Y.shape[1], activation='softmax')(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(x=X, y=Y, batch_size=256, epochs=1000)

# Obtaining the weights from the neural network.


weights = model.get_weights()[0]
embedding_dict = {}
for word in words:
embedding_dict.update({word: weights[unique_word_dict.get(word)]})

# Step 6: Obtain the weights and plot the results
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
for word in list(unique_word_dict.keys()):
    coord = embedding_dict.get(word)
    plt.scatter(coord[0], coord[1])
    plt.annotate(word, (coord[0], coord[1]))

plt.show()

Output:

7. Collocations are two or more words that tend to appear frequently together, for
example – United States. Implement this using Python.
Code:
#Loading libraries
from nltk.corpus import webtext
# use to find bigrams, which are pairs of words
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# 2: Let's find the collocations

# Loading the data
file_path = 'C:/Users/harsh/OneDrive/Desktop/grail.txt'
words = [w.lower() for w in webtext.words(file_path)]
bigram_collocation = BigramCollocationFinder.from_words(words)
bigram_collocation.nbest(BigramAssocMeasures.likelihood_ratio, 15)

# 3: Filter out stopwords and short words
from nltk.corpus import stopwords

stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
bigram_collocation.apply_word_filter(filter_stops)
bigram_collocation.nbest(BigramAssocMeasures.likelihood_ratio, 15)

from nltk.corpus import webtext, stopwords
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

# Define the file path
file_path = 'C:/Users/harsh/OneDrive/Desktop/grail.txt'
words = [w.lower() for w in webtext.words(file_path)]
trigram_collocation = TrigramCollocationFinder.from_words(words)
trigram_collocation.apply_word_filter(filter_stops)
trigram_collocation.apply_freq_filter(3)
trigram_collocation.nbest(TrigramAssocMeasures.likelihood_ratio, 15)

Output:

8. WordNet is a lexical database (i.e. a dictionary) for the English language, specifically designed for natural language processing. Synset is a simple interface in NLTK for looking up words in WordNet. Synset instances are groupings of synonymous words that express the same concept. Show the working of these using Python.
Code:
# 1: Understanding Synset
from nltk.corpus import wordnet

syn = wordnet.synsets("hello")[0]
print("Synset name:", syn.name())
# Defining the word
print("\nSynset meaning:", syn.definition())
# List of phrases that use the word in context
print("\nSynset example:", syn.examples())

# 2: Understanding Hypernyms and Hyponyms
syn = wordnet.synsets('hello')[0]
print("Synset name:", syn.name())
print("\nSynset abstract term:", syn.hypernyms())
print("\nSynset specific term:", syn.hypernyms()[0].hyponyms())
print("\nSynset root hypernym:", syn.root_hypernyms())

# 3: Part of Speech (POS) in Synset
syn = wordnet.synsets('hello')[0]
print("Syn tag:", syn.pos())
syn = wordnet.synsets('doing')[0]
print("Syn tag:", syn.pos())
syn = wordnet.synsets('beautiful')[0]
print("Syn tag:", syn.pos())
syn = wordnet.synsets('quickly')[0]
print("Syn tag:", syn.pos())
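
Since synsets group synonymous words, it also helps to print the lemmas of a synset; this short addition (not in the original listing) shows the synonym grouping directly.

# Synonyms grouped under a synset, via its lemmas
syn = wordnet.synsets('hello')[0]
print("Lemmas:", [lemma.name() for lemma in syn.lemmas()])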

Output:

9. Implement a Naïve Bayes classifier using Python.
Code:
# Importing libraries
import math
import random
import csv

# Encode class names to numeric values (e.g., yes and no encoded to 1 and 0)
def encode_class(mydata):
    classes = []
    for i in range(len(mydata)):
        if mydata[i][-1] not in classes:
            classes.append(mydata[i][-1])
    for i in range(len(classes)):
        for j in range(len(mydata)):
            if mydata[j][-1] == classes[i]:
                mydata[j][-1] = i
    return mydata

# Splitting the data
def splitting(mydata, ratio):
    train_num = int(len(mydata) * ratio)
    train = []
    test = list(mydata)  # initially the test set holds the whole dataset
    while len(train) < train_num:
        index = random.randrange(len(test))  # index chosen randomly from 0 to len(test)
        train.append(test.pop(index))
    return train, test

# Group the data rows under each class (e.g., yes or no) in a dictionary (dict[yes] and dict[no])
def groupUnderClass(mydata):
    dict = {}
    for i in range(len(mydata)):
        if mydata[i][-1] not in dict:
            dict[mydata[i][-1]] = []
        dict[mydata[i][-1]].append(mydata[i])
    return dict

# Calculating Mean
def mean(numbers):
    return sum(numbers) / float(len(numbers))

# Calculating Standard Deviation
def std_dev(numbers):
    if len(numbers) < 2:
        return 0
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)

# Calculate Mean and Standard Deviation for each attribute
def MeanAndStdDev(mydata):
    info = [(mean(attribute), std_dev(attribute)) for attribute in zip(*mydata)]
    del info[-1]  # drop the summary of the class column
    return info

# Find Mean and Standard Deviation under each class
def MeanAndStdDevForClass(mydata):
    info = {}
    dict = groupUnderClass(mydata)
    for classValue, instances in dict.items():
        info[classValue] = MeanAndStdDev(instances)
    return info

# Calculate Gaussian Probability Density Function
def calculateGaussianProbability(x, mean, stdev):
    expo = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * expo

# Calculate Class Probabilities
def calculateClassProbabilities(info, test):
    probabilities = {}
    for classValue, classSummaries in info.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, std_dev = classSummaries[i]
            x = test[i]
            probabilities[classValue] *= calculateGaussianProbability(x, mean, std_dev)
    return probabilities

# Make prediction - the class with the highest probability is the prediction
def predict(info, test):
    probabilities = calculateClassProbabilities(info, test)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

# Returns predictions for a set of examples
def getPredictions(info, test):
    predictions = []
    for i in range(len(test)):
        result = predict(info, test[i])
        predictions.append(result)
    return predictions

# Accuracy score
def accuracy_rate(test, predictions):
    correct = 0
    for i in range(len(test)):
        if test[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(test))) * 100.0

# Driver code
filename = 'nb.csv'
# Load the file and store it in the mydata list
mydata = csv.reader(open(filename, "rt"))
mydata = list(mydata)
mydata = encode_class(mydata)
for i in range(len(mydata)):
    mydata[i] = [float(x) for x in mydata[i]]

# Split ratio = 0.7 (70% of the data is used for training and 30% for testing)
ratio = 0.7
train_data, test_data = splitting(mydata, ratio)
print('Total number of examples are: ', len(mydata))
print('Out of these, training examples are: ', len(train_data))

print("Test examples are: ", len(test_data))

# Prepare model
info = MeanAndStdDevForClass(train_data)

# Test model
predictions = getPredictions(info, test_data)
accuracy = accuracy_rate(test_data, predictions)
print("Accuracy of your model is: ", accuracy)

Output:

10. Twitter sentiment analysis using Python: fetch tweets from Twitter using Python and classify their sentiment.
Code:
import re
import tweepy
from tweepy import OAuthHandler
from textblob import TextBlob

class TwitterClient(object):
    '''Generic Twitter class for sentiment analysis.'''

    def __init__(self):
        '''
        Class constructor or initialization method.
        '''
        # keys and tokens from the Twitter Dev Console
        consumer_key = '1697506000599678976-S1MLs2RXogBYypr3hlXrOPpeRelRnx'
        consumer_secret = 'rlQ2ZyrsiIrF1cwEnKZshNQAdCfZX5gIiKsDJajppgNfb'
        access_token = '1697506000599678976-S1MLs2RXogBYypr3hlXrOPpeRelRnx'
        access_token_secret = 'rlQ2ZyrsiIrF1cwEnKZshNQAdCfZX5gIiKsDJajppgNfb'

        # attempt authentication
        try:
            # create OAuthHandler object
            self.auth = OAuthHandler(consumer_key, consumer_secret)
            # set access token and secret
            self.auth.set_access_token(access_token, access_token_secret)
            # create tweepy API object to fetch tweets
            self.api = tweepy.API(self.auth)
        except:
            print("Error: Authentication Failed")

    def clean_tweet(self, tweet):
        '''
        Utility function to clean tweet text by removing links and special
        characters using simple regex statements.
        '''
        return ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

    def get_tweet_sentiment(self, tweet):
        '''
        Utility function to classify the sentiment of a passed tweet
        using textblob's sentiment method.
        '''
        # create TextBlob object of passed tweet text
        analysis = TextBlob(self.clean_tweet(tweet))
        # set sentiment
        if analysis.sentiment.polarity > 0:
            return 'positive'
        elif analysis.sentiment.polarity == 0:
            return 'neutral'
        else:
            return 'negative'

    def get_tweets(self, query, count=10):
        '''
        Main function to fetch tweets and parse them.
        '''
        # empty list to store parsed tweets
        tweets = []
        try:
            # call twitter api to fetch tweets
            fetched_tweets = self.api.search(q=query, count=count)
            # parsing tweets one by one
            for tweet in fetched_tweets:
                # empty dictionary to store required params of a tweet
                parsed_tweet = {}
                # saving text of tweet
                parsed_tweet['text'] = tweet.text
                # saving sentiment of tweet
                parsed_tweet['sentiment'] = self.get_tweet_sentiment(tweet.text)
                # appending parsed tweet to tweets list
                if tweet.retweet_count > 0:
                    # if tweet has retweets, ensure that it is appended only once
                    if parsed_tweet not in tweets:
                        tweets.append(parsed_tweet)
                else:
                    tweets.append(parsed_tweet)
            # return parsed tweets
            return tweets
        except tweepy.TweepError as e:
            # print error (if any)
            print("Error: " + str(e))

def main():
    # creating object of TwitterClient Class
    api = TwitterClient()
    # calling function to get tweets
    tweets = api.get_tweets(query='Donald Trump', count=200)

    # picking positive tweets from tweets
    ptweets = [tweet for tweet in tweets if tweet['sentiment'] == 'positive']
    # percentage of positive tweets
    print("Positive tweets percentage: {} %".format(100 * len(ptweets) / len(tweets)))

    # picking negative tweets from tweets
    ntweets = [tweet for tweet in tweets if tweet['sentiment'] == 'negative']
    # percentage of negative tweets
    print("Negative tweets percentage: {} %".format(100 * len(ntweets) / len(tweets)))
    # percentage of neutral tweets
    print("Neutral tweets percentage: {} %".format(
        100 * (len(tweets) - (len(ntweets) + len(ptweets))) / len(tweets)))

    # printing first 10 positive tweets
    print("\n\nPositive tweets:")
    for tweet in ptweets[:10]:
        print(tweet['text'])

    # printing first 10 negative tweets
    print("\n\nNegative tweets:")
    for tweet in ntweets[:10]:
        print(tweet['text'])

if __name__ == "__main__":
    # calling main function
    main()

Output:

11. Build a predictive model to forecast sales based on historical data?

Code:

# Import necessary libraries

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt

# Generate sample data (you would replace this with your actual dataset)

np.random.seed(42)

data = {'Month': pd.date_range(start='2022-01-01', periods=12, freq='M'),

'Sales': np.random.randint(50, 200, size=12)}

df = pd.DataFrame(data)

# Feature engineering (you might have more features in a real dataset)

df['Month_Num'] = df['Month'].dt.month

df['Month_Num^2'] = df['Month_Num'] ** 2

# Split the data into training and testing sets

X = df[['Month_Num', 'Month_Num^2']]

y = df['Sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the linear regression model

model = LinearRegression()

model.fit(X_train, y_train)

# Make predictions on the test set

y_pred = model.predict(X_test)

# Evaluate the model

mse = mean_squared_error(y_test, y_pred)

r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')

print(f'R-squared: {r2}')

# Visualize the predictions

plt.scatter(X_test['Month_Num'], y_test, color='black', label='Actual')

plt.plot(X_test['Month_Num'], y_pred, color='blue', linewidth=3, label='Predicted')

plt.title('Sales Prediction')

plt.xlabel('Month')

plt.ylabel('Sales')

plt.legend()

plt.show()
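
To actually forecast a future month, which is the point of the exercise, the fitted model can be fed the engineered features for an unseen month. A minimal sketch, assuming the model trained above is in memory (month 13 is a hypothetical next period):

# Forecast sales for a hypothetical 13th month using the same feature engineering
next_month = pd.DataFrame({'Month_Num': [13], 'Month_Num^2': [13 ** 2]})
forecast = model.predict(next_month)
print(f'Forecasted sales for month 13: {forecast[0]:.2f}')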

Output:

12. Predict the outcome of a binary classification problem using a
machine learning algorithm?

Code:

# Import necessary libraries

from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the Iris dataset

iris = load_iris()

X = iris.data

y = (iris.target == 0).astype(int) # 1 if the species is setosa, 0 otherwise

# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the logistic regression model

model = LogisticRegression()

model.fit(X_train, y_train)

# Make predictions on the test set

y_pred = model.predict(X_test)

# Evaluate the model

accuracy = accuracy_score(y_test, y_pred)

conf_matrix = confusion_matrix(y_test, y_pred)

classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')

print(f'Confusion Matrix:\n{conf_matrix}')

print(f'Classification Report:\n{classification_rep}')
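
To see the model predict the outcome for one new observation, a single feature vector can be passed to predict. The measurements below are made-up values in the same format as the Iris features.

# Predict the class of a single hypothetical flower (sepal/petal measurements in cm)
sample = [[5.1, 3.5, 1.4, 0.2]]
print('Predicted class (1 = setosa, 0 = other):', model.predict(sample)[0])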

13. Perform sentiment analysis on a set of text data to understand the overall sentiment.

Code:

# pip install nltk  (run this in your shell or notebook first)

import nltk

from nltk.sentiment import SentimentIntensityAnalyzer

# Download the VADER lexicon for sentiment analysis

nltk.download('vader_lexicon')

# Sample text data

text_data = [
    "I love this product! It's amazing.",
    "The customer service was terrible.",
    "Neutral comment without much emotion.",
    "This is not bad, but could be better."
]

# Initialize the VADER sentiment intensity analyzer

sid = SentimentIntensityAnalyzer()

# Analyze sentiment for each text

for text in text_data:

    sentiment_scores = sid.polarity_scores(text)

    print(f"Text: {text}")

    print(f"Sentiment Scores: {sentiment_scores}")

    # Determine the overall sentiment

    if sentiment_scores['compound'] >= 0.05:

        sentiment = 'Positive'

    elif sentiment_scores['compound'] <= -0.05:

        sentiment = 'Negative'

    else:

        sentiment = 'Neutral'

    print(f"Overall Sentiment: {sentiment}\n")
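
The exercise asks about the overall sentiment of the whole set, so it can also help to average the compound scores across all texts; this short addition is not in the original listing.

# Average compound score across the whole set as a rough overall sentiment
avg_compound = sum(sid.polarity_scores(t)['compound'] for t in text_data) / len(text_data)
print(f"Average compound score for the set: {avg_compound:.3f}")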

14. Identify distinct customer segments based on their behaviour or
characteristics?

Code:

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

# Generate sample data (replace this with your actual customer data)

np.random.seed(42)

data = pd.DataFrame({

'Feature1': np.random.randint(1, 100, 100),

'Feature2': np.random.randint(1, 100, 100),

'Feature3': np.random.randint(1, 100, 100)

})

# Standardize the data

scaler = StandardScaler()

scaled_data = scaler.fit_transform(data)

# Apply k-means clustering

kmeans = KMeans(n_clusters=3, random_state=42)

data['Cluster'] = kmeans.fit_predict(scaled_data)

# Reduce dimensions for visualization

pca = PCA(n_components=2)

reduced_data = pca.fit_transform(scaled_data)

# Visualize the clusters

plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=data['Cluster'], cmap='viridis')

plt.title('Customer Segmentation')

plt.xlabel('Principal Component 1')

plt.ylabel('Principal Component 2')

plt.show()
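
To understand what distinguishes the segments, it is useful to inspect the average feature values and size of each cluster; a brief sketch, assuming the data DataFrame built above:

# Average feature values and size of each customer segment
print(data.groupby('Cluster').mean())
print(data['Cluster'].value_counts())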

15. Optimize hyperparameters of a model to achieve better performance.

Code:

# Import necessary libraries

import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

# Generate a hypothetical dataset (replace this with your actual dataset)

np.random.seed(42)

X = np.random.rand(100, 5) # Features

y = np.random.randint(0, 2, 100) # Binary labels

# Split the dataset into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the SVM model

svm_model = SVC()

# Define hyperparameters and their possible values for grid search

param_grid = {

'C': [0.1, 1, 10],

'kernel': ['linear', 'rbf', 'poly'],

'gamma': ['scale', 'auto'],
}

# Perform grid search with 5-fold cross-validation

grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search

best_params = grid_search.best_params_

# Use the best model to make predictions on the test set

y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate the model performance

accuracy = accuracy_score(y_test, y_pred)

print(f'Best Hyperparameters: {best_params}')

print(f'Test Set Accuracy: {accuracy}')
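
Besides the test-set accuracy, GridSearchCV also stores the mean cross-validated score of the best parameter combination, which is worth printing when judging the tuning.

# Mean cross-validated accuracy achieved by the best hyperparameter combination
print(f'Best Cross-Validation Accuracy: {grid_search.best_score_}')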

Output:

16. Create informative and visually appealing plots to represent patterns and
relationships in the data.

Code:

import numpy as np

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

# Generate a sample dataset

np.random.seed(42)

data_size = 100

feature1 = np.random.rand(data_size)

feature2 = 2 * feature1 + np.random.normal(scale=0.2, size=data_size)

category = np.random.choice(['A', 'B'], size=data_size)

df = pd.DataFrame({'Feature1': feature1, 'Feature2': feature2, 'Category': category})

# Set the style for the plots

sns.set(style="whitegrid")

# Pairplot to visualize relationships between numerical features

sns.pairplot(df, hue='Category', palette='Set1')

plt.suptitle('Pairplot of Numerical Features', y=1.02)

plt.show()

# Scatter plot with regression line

plt.figure(figsize=(8, 6))

sns.regplot(x='Feature1', y='Feature2', data=df, scatter_kws={'s': 50, 'alpha': 0.7}, line_kws={'color': 'red'})

plt.title('Scatter Plot with Regression Line')

plt.show()

# Box plot to visualize distributions

plt.figure(figsize=(8, 6))

sns.boxplot(x='Category', y='Feature1', data=df, palette='Set2')

plt.title('Box Plot of Feature1 by Category')

plt.show()

# Violin plot to show the distribution of a numerical variable across different categories

plt.figure(figsize=(10, 8))

sns.violinplot(x='Category', y='Feature2', data=df, inner='quartile', palette='Pastel1')

plt.title('Violin Plot of Feature2 by Category')

plt.show()

Output:

17. Explore techniques to handle missing data and outliers in a dataset.

Code:

import pandas as pd

import numpy as np

import seaborn as sns

from scipy import stats

from scipy.stats.mstats import winsorize

import matplotlib.pyplot as plt

# Generate a sample dataset with missing data and outliers

np.random.seed(42)

data = {

'Feature1': np.random.randint(1, 100, 50),

'Feature2': np.concatenate([np.random.normal(50, 10, 25), [np.nan] * 25]),

'Feature3': np.concatenate([np.random.normal(100, 20, 25), np.random.normal(20, 5, 25)])
}

df = pd.DataFrame(data)

# Handling Missing Data

df.fillna(df.mean(), inplace=True) # Impute missing values with mean

# Handling Outliers

# Using Winsorizing to clip extreme values

df['Feature2'] = winsorize(df['Feature2'], limits=[0.05, 0.05])

# Using Z-scores to identify and remove outliers

z_scores = np.abs(stats.zscore(df['Feature3']))

df_no_outliers = df[(z_scores < 3)]

# Visualizing the original and cleaned data

plt.figure(figsize=(12, 6))

plt.subplot(2, 2, 1)

sns.boxplot(x=df['Feature2'])

plt.title('Original Data - Feature2')

plt.subplot(2, 2, 2)

sns.boxplot(x=df_no_outliers['Feature2'])

plt.title('Data without Outliers - Feature2')

plt.subplot(2, 2, 3)

sns.histplot(df['Feature3'], kde=True)

plt.title('Original Data - Feature3')

plt.subplot(2, 2, 4)

sns.histplot(df_no_outliers['Feature3'], kde=True)

plt.title('Data without Outliers - Feature3')

plt.tight_layout()

plt.show()

Output:

18. Extract named entities from a piece of text using NLTK and spaCy.

Code:

import nltk

from nltk import word_tokenize, pos_tag, ne_chunk

import spacy

nltk.download('punkt')

nltk.download('maxent_ne_chunker')

nltk.download('words')

nltk.download('averaged_perceptron_tagger')  # needed by pos_tag

# Define a function to extract entities using NLTK

def extract_entities_nltk(text):

    words = word_tokenize(text)

    tagged_words = pos_tag(words)

    named_entities = ne_chunk(tagged_words)

    entities = []

    for entity in named_entities:

        if isinstance(entity, nltk.Tree):

            entities.append(" ".join([word for word, tag in entity.leaves()]))

    return entities

# Define a function to extract entities using spaCy

def extract_entities_spacy(text):

    nlp = spacy.load("en_core_web_sm")

    doc = nlp(text)

    entities = [ent.text for ent in doc.ents]

    return entities

# Sample text

text = "Apple Inc. is planning to open a new store in New York City."

# Extract entities using NLTK

entities_nltk = extract_entities_nltk(text)

print("Named Entities (NLTK):", entities_nltk)

# Extract entities using spaCy

entities_spacy = extract_entities_spacy(text)

print("Named Entities (spaCy):", entities_spacy)

Output:

19. Build a forecasting model for a time series dataset, and evaluate its
accuracy.

Code:

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from statsmodels.tsa.arima.model import ARIMA

from sklearn.metrics import mean_squared_error

from math import sqrt

# Generate a hypothetical time series dataset

np.random.seed(42)

date_rng = pd.date_range(start='2022-01-01', end='2022-12-31', freq='D')

time_series_data = np.random.randn(len(date_rng)) + np.arange(len(date_rng)) * 0.2

# Create a DataFrame

df = pd.DataFrame(index=date_rng, data={'Value': time_series_data})

# Split the data into training and test sets

train_size = int(len(df) * 0.8)

train, test = df.iloc[:train_size], df.iloc[train_size:]

# Plot the training and test sets

plt.figure(figsize=(12, 6))

plt.plot(train.index, train['Value'], label='Training Set')

plt.plot(test.index, test['Value'], label='Test Set')

plt.title('Hypothetical Time Series Data - Training and Test Sets')

plt.xlabel('Date')

plt.ylabel('Value')

plt.legend()

plt.show()

# Train the ARIMA model on the training set

order = (1, 1, 1) # ARIMA(p, d, q) order

model = ARIMA(train['Value'], order=order)

fit_model = model.fit()

# Make predictions on the test set

predictions = fit_model.forecast(steps=len(test))

# Evaluate the accuracy of the model using Root Mean Squared Error (RMSE)

rmse = sqrt(mean_squared_error(test['Value'], predictions))

# Plot the predicted values against the actual values

plt.figure(figsize=(12, 6))

plt.plot(test.index, test['Value'], label='Actual Values')

plt.plot(test.index, predictions, label='Predicted Values', linestyle='dashed')

plt.title(f'ARIMA Forecasting - RMSE: {rmse:.4f}')

plt.xlabel('Date')

plt.ylabel('Value')

plt.legend()

plt.show()

print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')

Output:

20. Design and implement a time series forecasting model to predict future
temperatures. Evaluate the model's performance and visualize the predictions
against the actual temperatures.

Code:

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from statsmodels.tsa.arima.model import ARIMA

from sklearn.metrics import mean_squared_error

from math import sqrt

# Generate a hypothetical time series dataset of daily temperatures

np.random.seed(42)

date_rng = pd.date_range(start='2010-01-01', end='2022-12-31', freq='D')

temperature_data = (25 + 0.5 * np.sin(np.arange(len(date_rng)) * (2 * np.pi / 365))
                    + np.random.normal(scale=2, size=len(date_rng)))

# Create a DataFrame

temperature_df = pd.DataFrame(index=date_rng, data={'Temperature': temperature_data})

# Split the data into training and test sets

train_size = int(len(temperature_df) * 0.8)

train, test = temperature_df.iloc[:train_size], temperature_df.iloc[train_size:]

# Plot the training and test sets

plt.figure(figsize=(12, 6))

plt.plot(train.index, train['Temperature'], label='Training Set')

plt.plot(test.index, test['Temperature'], label='Test Set')

plt.title('Hypothetical Daily Temperature Time Series Data - Training and Test Sets')

plt.xlabel('Date')

plt.ylabel('Temperature')

plt.legend()

plt.show()

# Train the ARIMA model on the training set

order = (1, 1, 1) # ARIMA(p, d, q) order

model = ARIMA(train['Temperature'], order=order)

fit_model = model.fit()

# Make predictions on the test set

predictions = fit_model.forecast(steps=len(test))

# Evaluate the accuracy of the model using Root Mean Squared Error (RMSE)

rmse = sqrt(mean_squared_error(test['Temperature'], predictions))

# Plot the predicted temperatures against the actual temperatures

plt.figure(figsize=(12, 6))

plt.plot(test.index, test['Temperature'], label='Actual Temperatures')

plt.plot(test.index, predictions, label='Predicted Temperatures', linestyle='dashed')

plt.title(f'ARIMA Forecasting for Daily Temperatures - RMSE: {rmse:.4f}')

plt.xlabel('Date')

plt.ylabel('Temperature')

plt.legend()

plt.show()

print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')

Output:

