Ds File
1. Flatten a deep tree in NLP using NLTK and print the heights of sample trees.
Code:
# Functions for flattening a deep tree
from nltk.tree import Tree

def flatten_childtrees(trees):
    children = []
    for t in trees:
        if t.height() < 3:
            children.extend(t.pos())
        elif t.height() == 3:
            children.append(Tree(t.label(), t.pos()))
        else:
            children.extend(flatten_childtrees(t))
    return children

def flatten_deeptree(tree):
    return Tree(tree.label(), flatten_childtrees(tree))
# Evaluating flatten_deeptree()
from nltk.corpus import treebank
deep_tree = treebank.parsed_sents()[0]
flattened_tree = flatten_deeptree(deep_tree)
print("DeepTree:\n", deep_tree)
print("\nFlattened Tree:\n", flattened_tree)
#height
from nltk.corpus import treebank
from transforms import flatten_deeptree
from nltk.tree import Tree
print("Height:", Tree('NNP', ['Pierre']).height())
print("\nHeight:", Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]).height())
Output:
2. Create a shallow tree in NLP and print its height.
Code:
# Understanding shallow_tree()
from nltk.tree import Tree

def shallow_tree(tree):
    children = []
    for t in tree:
        if t.height() < 3:
            children.extend(t.pos())
        else:
            children.append(Tree(t.label(), t.pos()))
    return Tree(tree.label(), children)
#Evaluating
from transforms import shallow_tree
from nltk.corpus import treebank
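# The evaluation lines were lost in extraction; a minimal sketch of how
# shallow_tree() is typically exercised on the first treebank parse:
shallow = shallow_tree(treebank.parsed_sents()[0])
print("Shallow Tree:\n", shallow)
print("\nHeight:", shallow.height())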
Output:
3. Download the wine quality dataset from the UCI Machine Learning Repository, which is
available for free. Print the first five rows of the red and white wine data, check for NULL
values in the red wine data, create a histogram to show the distribution of alcohol, and
finally split the data for training and validation.
Code:
import pandas as pd
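# The lines that read the two CSV files appear to be missing; a sketch assuming the
# standard UCI file locations and semicolon-separated format
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/'
ds_red = pd.read_csv(url + 'winequality-red.csv', sep=';')    # red wine data
ds_wht = pd.read_csv(url + 'winequality-white.csv', sep=';')  # white wine data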
print(ds_red.head())
print(ds_wht.head())
import matplotlib.pyplot as plt
X_wht = ds_wht.drop('quality', axis=1)
y_wht = ds_wht['quality']
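# The NULL check, the histogram, and the train/validation split also appear to have
# been dropped during extraction; a minimal sketch of those steps
print(ds_red.isnull().sum())   # check for NULL values in the red wine data

plt.hist(ds_red['alcohol'], bins=20, edgecolor='black')   # distribution of alcohol
plt.xlabel('Alcohol')
plt.ylabel('Frequency')
plt.show()

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_wht, y_wht, test_size=0.2, random_state=42)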
Output:
4. Avengers: Endgame and deep learning. Write Python code to implement image caption
generation using the Avengers: Endgame characters.
Code:
import torch
import matplotlib.pyplot as plt
import numpy as np
import argparse
import pickle
import os
from torchvision import transforms
from PIL import Image
# Model path
# Function to Load and Resize the image
def load_image(image_path, transform=None):
    image = Image.open(image_path)
    image = image.resize([224, 224], Image.LANCZOS)
    if transform is not None:
        image = transform(image).unsqueeze(0)
    return image
# Image preprocessing
transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406),
(0.229, 0.224, 0.225))])
# Build models
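# The device, hyper-parameters, vocabulary loading, and model imports were lost in
# extraction. A minimal sketch, assuming the usual PyTorch image-captioning tutorial
# setup (the module name, values, and pickle path are assumptions):
from model import EncoderCNN, DecoderRNN
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embed_size, hidden_size, num_layers = 256, 512, 1
with open('data/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)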
encoder = EncoderCNN(embed_size).eval()
decoder = DecoderRNN(embed_size, hidden_size,
len(vocab), num_layers)
encoder = encoder.to(device)
decoder = decoder.to(device)
# Prepare an image
image = load_image(image_path, transform)
image_tensor = image.to(device)
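# The caption-sampling step was lost in extraction; a sketch assuming the encoder and
# decoder expose the feature/sample interface of the tutorial this listing appears to follow:
feature = encoder(image_tensor)
sampled_ids = decoder.sample(feature)
sampled_ids = sampled_ids[0].cpu().numpy()
# Convert word ids to words, stopping at the <end> token
sampled_caption = []
for word_id in sampled_ids:
    word = vocab.idx2word[word_id]   # assumed Vocabulary interface
    sampled_caption.append(word)
    if word == '<end>':
        break
sentence = ' '.join(sampled_caption)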
# Print out the image and the generated caption
print(sentence)
image = Image.open(image_path)
plt.imshow(np.asarray(image))
plt.show()
Output:
5. Create a Neural network using Python (you can use NumPy to implement this).
Code:
# Step 1: Import the required libraries
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import cv2
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.layers import Flatten, Dense, LSTM, Dropout, Embedding, Activation
from keras.layers import concatenate, BatchNormalization, Input
from keras.layers.merge import add
from keras.utils import to_categorical, plot_model
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
import matplotlib.pyplot as plt
import glob
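# The top of load_description() was lost in extraction; this is a sketch of the
# usual Flickr8k token-file parser that the surviving two lines below belong to:
def load_description(text):
    mapping = dict()
    for line in text.split("\n"):
        token = line.split("\t")
        if len(line) < 2:
            continue
        img_id = token[0].split('.')[0]   # image id without the .jpg#n suffix
        img_des = token[1]                # caption text
        if img_id not in mapping:
            mapping[img_id] = list()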
        mapping[img_id].append(img_des)
    return mapping
token_path = '/kaggle/input/flickr8k/flickr_data/Flickr_Data/Flickr_TextData/Flickr8k.token.txt'
text = open(token_path, 'r', encoding='utf-8').read()
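# clean_description() and to_vocab() are called below, but their definitions were lost
# in extraction. Minimal sketches (the exact cleaning rules are assumed; the start/end
# tokens are added here so that the later vocabulary and greedy_search() steps stay consistent):
import string

def clean_description(desc):
    table = str.maketrans('', '', string.punctuation)
    for key, des_list in desc.items():
        for i in range(len(des_list)):
            caption = des_list[i].split()
            caption = [w.lower().translate(table) for w in caption]
            caption = [w for w in caption if len(w) > 1 and w.isalpha()]
            des_list[i] = 'startseq ' + ' '.join(caption) + ' endseq'

def to_vocab(desc):
    # set of all words used in the cleaned descriptions
    words = set()
    for key in desc.keys():
        for line in desc[key]:
            words.update(line.split())
    return words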
descriptions = load_description(text)
print(descriptions['1000268201_693b08cb0e'])
clean_description(descriptions)
print(descriptions['1000268201_693b08cb0e'])
vocab = to_vocab(descriptions)
def preprocess_img(img_path):
    img = load_img(img_path, target_size=(299, 299))
    x = img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return x

def encode(image):
    image = preprocess_img(image)
    vec = model.predict(image)
    vec = np.reshape(vec, (vec.shape[1]))
    return vec
base_model = InceptionV3(weights='imagenet')
model = Model(base_model.input, base_model.layers[-2].output)
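# `images` (the image folder) and `train_img` (the list of image paths) are used below
# but their setup was lost in extraction; a sketch with an assumed Kaggle path:
images = '/kaggle/input/flickr8k/flickr_data/Flickr_Data/Images/'
train_img = glob.glob(images + '*.jpg')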
encoding_train = {}
for img in train_img:
    encoding_train[img[len(images):]] = encode(img)
vocabulary = vocab
threshold = 10
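# all_train_captions is used below but was never built in the surviving listing;
# a simplified reconstruction that takes every cleaned caption:
all_train_captions = [cap for caps in descriptions.values() for cap in caps]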
word_counts = {}
for cap in all_train_captions:
    for word in cap.split(' '):
        word_counts[word] = word_counts.get(word, 0) + 1
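# keep only words that occur at least `threshold` times (this line was lost; assumed)
vocab = [word for word in word_counts if word_counts[word] >= threshold]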
ixtoword = {}
wordtoix = {}
ix = 1
for word in vocab:
    wordtoix[word] = ix
    ixtoword[ix] = word
    ix += 1
max_length = max(len(cap.split()) for cap in all_train_captions)  # assumed reconstruction of this truncated line
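# The loop that builds the training arrays (image feature X1, input word sequence X2,
# next-word target y) was lost in extraction; a minimal sketch (details assumed):
vocab_size = len(wordtoix) + 1
X1, X2, y = [], [], []
for key, caps in descriptions.items():
    if key + '.jpg' not in encoding_train:
        continue
    pic = encoding_train[key + '.jpg']
    for cap in caps:
        seq = [wordtoix[w] for w in cap.split() if w in wordtoix]
        for i in range(1, len(seq)):
            in_seq, out_word = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            X1.append(pic)
            X2.append(in_seq)
            y.append(to_categorical([out_word], num_classes=vocab_size)[0])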
X2 = np.array(X2)
X1 = np.array(X1)
y = np.array(y)
embeddings_index = {}
glove_path = '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.200d.txt'
glove = open(glove_path, 'r', encoding='utf-8').read()
for line in glove.split("\n"):
    values = line.split(" ")
    word = values[0]
    indices = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = indices
emb_dim = 200
emb_matrix = np.zeros((vocab_size, emb_dim))
for word, i in wordtoix.items():
    emb_vec = embeddings_index.get(word)
    if emb_vec is not None:
        emb_matrix[i] = emb_vec
emb_matrix.shape
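# The image-feature branch of the model was lost in extraction; a sketch assuming
# 2048-dimensional InceptionV3 features:
ip1 = Input(shape=(2048,))
fe1 = Dropout(0.2)(ip1)
fe2 = Dense(256, activation='relu')(fe1)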
ip2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, emb_dim, mask_zero=True)(ip2)
se2 = Dropout(0.2)(se1)
se3 = LSTM(256)(se2)
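# The decoder head, GloVe weight injection, compilation, and training step were lost
# in extraction; a sketch (epochs and batch size are assumptions):
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
model = Model(inputs=[ip1, ip2], outputs=outputs)
emb_layer = [l for l in model.layers if isinstance(l, Embedding)][0]
emb_layer.set_weights([emb_matrix])   # plug in the pre-trained GloVe vectors
emb_layer.trainable = False
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit([X1, X2], y, epochs=20, batch_size=64)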
# Step 11: Predicting the output
def greedy_search(pic):
    start = 'startseq'
    for i in range(max_length):
        seq = [wordtoix[word] for word in start.split() if word in wordtoix]
        seq = pad_sequences([seq], maxlen=max_length)
        yhat = model.predict([pic, seq])
        yhat = np.argmax(yhat)
        word = ixtoword[yhat]
        start += ' ' + word
        if word == 'endseq':
            break
    final = start.split()
    final = final[1:-1]
    final = ' '.join(final)
    return final
Output:
6. Implement Word Embedding using Word2Vec.
Code:
import re
def clean_text(string, punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
               stop_words=['the', 'a', 'and', 'is', 'be', 'will']):
    # Cleaning the urls
    string = re.sub(r'https?://\S+|www\.\S+', '', string)
    # Cleaning the html elements
    string = re.sub(r'<.*?>', '', string)
    # Removing the punctuations
    for x in string.lower():
        if x in punctuations:
            string = string.replace(x, "")
    # Converting the text to lower
    string = string.lower()
    # Removing stop words
    string = ' '.join([word for word in string.split() if word not in stop_words])
    # Cleaning the whitespaces
    string = re.sub(r'\s+', ' ', string).strip()
    return string
# Step 2: The full pipeline to create the (X, Y) word pairs given a list of strings texts:
window = 2
word_lists = []
all_text = []
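# The loop that cleans and tokenizes each input text was lost in extraction; a sketch
# with a small assumed sample corpus (the context-dictionary loop below sits inside it):
texts = ['The prince is the future king.',
         'Daughter is the princess.',
         'Son is the prince.']   # assumed sample sentences
for text in texts:
    text = clean_text(text)
    text = text.split()
    all_text += text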
    # Creating a context dictionary
    for i, word in enumerate(text):
        for w in range(window):
            # Getting the context that is ahead by *window* words
            if i + 1 + w < len(text):
                word_lists.append([word] + [text[(i + 1 + w)]])
            # Getting the context that is behind by *window* words
            if i - w - 1 >= 0:
                word_lists.append([word] + [text[(i - w - 1)]])
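# The lines that build the unique-word dictionary and initialise the one-hot (X, Y)
# pairs were lost in extraction; an assumed reconstruction:
import numpy as np
unique_word_dict = {word: i for i, word in enumerate(sorted(set(all_text)))}
n_words = len(unique_word_dict)
X, Y = [], []
for word_list in word_lists: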
    main_word_index = unique_word_dict.get(word_list[0])
    context_word_index = unique_word_dict.get(word_list[1])
    X_row = np.zeros(n_words)
    Y_row = np.zeros(n_words)
    X_row[main_word_index] = 1
    Y_row[context_word_index] = 1
    X.append(X_row)
    Y.append(Y_row)
X = np.asarray(X)
Y = np.asarray(Y)
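# Training a small network whose 2-unit hidden layer provides the word embeddings, and
# collecting them into embedding_dict for the plot below (the original training step was
# lost in extraction; this Keras sketch is an assumption):
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
embed_size = 2
w2v_model = Sequential()
w2v_model.add(Dense(embed_size, activation='linear', input_dim=n_words))
w2v_model.add(Dense(n_words, activation='softmax'))
w2v_model.compile(loss='categorical_crossentropy', optimizer='adam')
w2v_model.fit(X, Y, epochs=500, verbose=0)
weights = w2v_model.get_weights()[0]   # input-to-hidden weights are the embeddings
embedding_dict = {word: weights[idx] for word, idx in unique_word_dict.items()}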
plt.figure(figsize=(10, 10))
for word in list(unique_word_dict.keys()):
    coord = embedding_dict.get(word)
    plt.scatter(coord[0], coord[1])
    plt.annotate(word, (coord[0], coord[1]))
plt.show()
Output:
7. Collocations are two or more words that tend to appear frequently together, for
example – United States. Implement this using Python.
Code:
# Loading libraries
from nltk.corpus import webtext
from nltk.corpus import stopwords
# use to find bigrams, which are pairs of words
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
# file_path should name a file from the webtext corpus; 'grail.txt' is used here as an example
file_path = 'grail.txt'
words = [w.lower() for w in webtext.words(file_path)]
bigram_collocation = BigramCollocationFinder.from_words(words)
stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
bigram_collocation.apply_word_filter(filter_stops)
bigram_collocation.nbest(BigramAssocMeasures.likelihood_ratio, 15)
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
words = [w.lower() for w in webtext.words(file_path)]
trigram_collocation = TrigramCollocationFinder.from_words(words)
trigram_collocation.apply_word_filter(filter_stops)
trigram_collocation.apply_freq_filter(3)
trigram_collocation.nbest(TrigramAssocMeasures.likelihood_ratio, 15)
Output:
8. WordNet is a lexical database (i.e. a dictionary) for the English language, specifically
designed for natural language processing. A Synset is a simple interface in NLTK for looking
up words in WordNet; Synset instances are groupings of synonymous words that express the
same concept. Show the working of these using Python.
Code:
#1 : Understanding Synset
from nltk.corpus import wordnet
syn = wordnet.synsets("hello")[0]
print("Synset name:", syn.name())
# Defining the word
print("\nSynset meaning:", syn.definition())
# list of phrases that use the word in context
print("\nSynset example:", syn.examples())
print("Synset tag:", syn.pos())
Output:
9. Implement a Naïve Bayes classifier using Python.
Code:
# Importing libraries
import math
import random
import csv
# Encode class names to numeric values (e.g., yes and no encoded to 1 and 0)
def encode_class(mydata):
    classes = []
    for i in range(len(mydata)):
        if mydata[i][-1] not in classes:
            classes.append(mydata[i][-1])
    for i in range(len(classes)):
        for j in range(len(mydata)):
            if mydata[j][-1] == classes[i]:
                mydata[j][-1] = i
    return mydata
# Group the data rows under each class (e.g., yes or no) in dictionary (dict[yes] and dict[no])
def groupUnderClass(mydata):
    dict = {}
    for i in range(len(mydata)):
        if mydata[i][-1] not in dict:
            dict[mydata[i][-1]] = []
        dict[mydata[i][-1]].append(mydata[i])
    return dict
# Calculating Mean
def mean(numbers):
    return sum(numbers) / float(len(numbers))
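# The helper functions between here and the per-class summary loop (standard deviation,
# per-attribute summaries, the train/test split, and the head of MeanAndStdDevForClass)
# were lost in extraction; sketches of the usual versions:
def std_dev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)

# Mean and standard deviation of every attribute (column) for a list of rows
def MeanAndStdDev(mydata):
    info = [(mean(attribute), std_dev(attribute)) for attribute in zip(*mydata)]
    del info[-1]   # drop the class column
    return info

# Random train/test split with the given ratio
def splitting(mydata, ratio):
    train_num = int(len(mydata) * ratio)
    train = []
    test = list(mydata)
    while len(train) < train_num:
        index = random.randrange(len(test))
        train.append(test.pop(index))
    return train, test

# Mean and standard deviation of each attribute, grouped by class
def MeanAndStdDevForClass(mydata):
    info = {}
    dict = groupUnderClass(mydata)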
    for classValue, instances in dict.items():
        info[classValue] = MeanAndStdDev(instances)
    return info
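# The Gaussian probability and prediction helpers were also lost; sketches:
def calculateGaussianProbability(x, mean_, stdev):
    expo = math.exp(-(math.pow(x - mean_, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * expo

def calculateClassProbabilities(info, test_row):
    probabilities = {}
    for classValue, classSummaries in info.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean_, std_ = classSummaries[i]
            probabilities[classValue] *= calculateGaussianProbability(test_row[i], mean_, std_)
    return probabilities

# Predict the class with the highest probability for a single example
def predict(info, test_row):
    probabilities = calculateClassProbabilities(info, test_row)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel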
# Returns predictions for a set of examples
def getPredictions(info, test):
    predictions = []
    for i in range(len(test)):
        result = predict(info, test[i])
        predictions.append(result)
    return predictions
# Accuracy score
def accuracy_rate(test, predictions):
    correct = 0
    for i in range(len(test)):
        if test[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(test))) * 100.0
# Driver code
filename = 'nb.csv'
# Load the file and store it in mydata list
mydata = csv.reader(open(filename, "rt"))
mydata = list(mydata)
mydata = encode_class(mydata)
for i in range(len(mydata)):
    mydata[i] = [float(x) for x in mydata[i]]
# Split ratio = 0.7 (70% of data is training data and 30% is test data used for testing)
ratio = 0.7
train_data, test_data = splitting(mydata, ratio)
print('Total number of examples are: ', len(mydata))
print('Out of these, training examples are: ', len(train_data))
print("Test examples are: ", len(test_data))
# Prepare model
info = MeanAndStdDevForClass(train_data)
# Test model
predictions = getPredictions(info, test_data)
accuracy = accuracy_rate(test_data, predictions)
print("Accuracy of your model is: ", accuracy)
Output:
10. Twitter sentiment analysis using Python. Fetch tweets from Twitter using Python and
classify their sentiment.
Code:
import re
import tweepy
from tweepy import OAuthHandler
from textblob import TextBlob
class TwitterClient(object):
    '''Generic Twitter class for sentiment analysis.'''

    def __init__(self):
        '''Class constructor or initialization method.'''
        # keys and tokens from the Twitter Dev Console
        consumer_key = '1697506000599678976-S1MLs2RXogBYypr3hlXrOPpeRelRnx'
        consumer_secret = 'rlQ2ZyrsiIrF1cwEnKZshNQAdCfZX5gIiKsDJajppgNfb'
        access_token = '1697506000599678976-S1MLs2RXogBYypr3hlXrOPpeRelRnx'
        access_token_secret = 'rlQ2ZyrsiIrF1cwEnKZshNQAdCfZX5gIiKsDJajppgNfb'
        # attempt authentication
        try:
            # create OAuthHandler object
            self.auth = OAuthHandler(consumer_key, consumer_secret)
            # set access token and secret
            self.auth.set_access_token(access_token, access_token_secret)
            # create tweepy API object to fetch tweets
            self.api = tweepy.API(self.auth)
        except:
            print("Error: Authentication Failed")
    def clean_tweet(self, tweet):
        '''Utility function to clean tweet text by removing links and special
        characters using simple regex statements.'''
        return ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
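    # get_tweet_sentiment() and the head of get_tweets() were lost in extraction;
    # sketches of the usual TextBlob-based versions (query handling assumed):
    def get_tweet_sentiment(self, tweet):
        '''Classify the sentiment of a tweet using TextBlob polarity.'''
        analysis = TextBlob(self.clean_tweet(tweet))
        if analysis.sentiment.polarity > 0:
            return 'positive'
        elif analysis.sentiment.polarity == 0:
            return 'neutral'
        else:
            return 'negative'

    def get_tweets(self, query, count=10):
        '''Fetch and parse tweets for the given query.'''
        tweets = []
        try:
            # call the Twitter API to fetch tweets
            fetched_tweets = self.api.search(q=query, count=count)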
            for tweet in fetched_tweets:
                # empty dictionary to store required params of a tweet
                parsed_tweet = {}
                # saving text of tweet
                parsed_tweet['text'] = tweet.text
                # saving sentiment of tweet
                parsed_tweet['sentiment'] = self.get_tweet_sentiment(tweet.text)
                # appending parsed tweet to tweets list
                if tweet.retweet_count > 0:
                    # if tweet has retweets, ensure that it is appended only once
                    if parsed_tweet not in tweets:
                        tweets.append(parsed_tweet)
                else:
                    tweets.append(parsed_tweet)
            # return parsed tweets
            return tweets
        except tweepy.TweepError as e:
            # print error (if any)
            print("Error: " + str(e))
def main():
    # creating object of TwitterClient Class
    api = TwitterClient()
    # calling function to get tweets
    tweets = api.get_tweets(query='Donald Trump', count=200)
    # picking positive tweets from tweets
    ptweets = [tweet for tweet in tweets if tweet['sentiment'] == 'positive']
    # percentage of positive tweets
    print("Positive tweets percentage: {} %".format(100 * len(ptweets) / len(tweets)))
    # picking negative tweets from tweets
    ntweets = [tweet for tweet in tweets if tweet['sentiment'] == 'negative']
    # percentage of negative tweets
    print("Negative tweets percentage: {} %".format(100 * len(ntweets) / len(tweets)))
    # percentage of neutral tweets
    print("Neutral tweets percentage: {} %".format(100 * (len(tweets) - (len(ntweets) + len(ptweets))) / len(tweets)))

if __name__ == "__main__":
    # calling main function
    main()
Output:
11. Build a predictive model to forecast sales based on historical data.
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
# Generate sample data (you would replace this with your actual dataset)
np.random.seed(42)
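# The sample-data dictionary was lost in extraction; a sketch of a plausible monthly
# sales table (values assumed)
months = pd.date_range('2022-01-01', periods=24, freq='MS')
data = {'Month': months,
        'Sales': 200 + 10 * np.arange(24) + np.random.normal(0, 15, 24)}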
df = pd.DataFrame(data)
df['Month_Num'] = df['Month'].dt.month
df['Month_Num^2'] = df['Month_Num'] ** 2
X = df[['Month_Num', 'Month_Num^2']]
y = df['Sales']
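# Train/test split (the original split line was lost in extraction; an 80/20 split is assumed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)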
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')
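# Plot actual vs. predicted sales (the plotting calls before the labels below were lost; assumed)
plt.scatter(X_test['Month_Num'], y_test, label='Actual')
plt.scatter(X_test['Month_Num'], y_pred, label='Predicted')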
plt.title('Sales Prediction')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.legend()
plt.show()
Output:
12. Predict the outcome of a binary classification problem using a machine learning algorithm.
Code:
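# The import lines appear to have been lost in extraction; these are the imports the code below needs
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report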
iris = load_iris()
X = iris.data
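# Target vector and train/test split (lost in extraction; an 80/20 split is assumed)
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)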
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
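# Compute the evaluation metrics printed below (reconstruction)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)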
print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{classification_rep}')
13. Perform sentiment analysis on a set of text data to understand the overall sentiment.
Code:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
# sample texts (the original list entries were lost in extraction; these are assumed examples)
text_data = [
    "I absolutely love this product, it works great!",
    "This is the worst experience I have ever had.",
    "The package arrived on Tuesday."
]
sid = SentimentIntensityAnalyzer()
for text in text_data:
    sentiment_scores = sid.polarity_scores(text)
    print(f"Text: {text}")
    if sentiment_scores['compound'] >= 0.05:
        sentiment = 'Positive'
    elif sentiment_scores['compound'] <= -0.05:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'
    print(f"Sentiment: {sentiment}\n")
14. Identify distinct customer segments based on their behaviour or characteristics.
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# Generate sample data (replace this with your actual customer data)
np.random.seed(42)
data = pd.DataFrame({
    # illustrative customer features (the original columns were lost in extraction; assumed)
    'Age': np.random.randint(18, 70, 200),
    'AnnualIncome': np.random.randint(20000, 120000, 200),
    'SpendingScore': np.random.randint(1, 100, 200)
})
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)
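# K-Means model (the original line was lost in extraction; 3 clusters assumed)
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)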
data['Cluster'] = kmeans.fit_predict(scaled_data)
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(scaled_data)
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=data['Cluster'], cmap='viridis')
plt.title('Customer Segmentation')
plt.show()
15. Optimize hyperparameters of a model to achieve better performance.
Code:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
np.random.seed(42)
X = np.random.rand(100, 5) # Features
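# Target labels and train/test split (lost in extraction; a binary target is assumed)
y = np.random.randint(0, 2, 100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)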
svm_model = SVC()
# hyperparameter grid (the original entries were lost in extraction; typical values assumed)
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}
grid_search = GridSearchCV(svm_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_
y_pred = grid_search.best_estimator_.predict(X_test)
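# Report the tuning results (reconstruction of the lost output lines)
print("Best hyperparameters:", best_params)
print("Test accuracy:", accuracy_score(y_test, y_pred))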
Output:
16. Create informative and visually appealing plots to represent patterns and
relationships in the data.
Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(42)
data_size = 100
feature1 = np.random.rand(data_size)
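# Additional illustrative columns and the DataFrame used by the plots below (the
# original feature definitions were lost in extraction; these are assumed)
feature2 = feature1 * 2 + np.random.normal(0, 0.1, data_size)
category = np.random.choice(['A', 'B', 'C'], data_size)
df = pd.DataFrame({'feature1': feature1, 'feature2': feature2, 'category': category})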
sns.set(style="whitegrid")
plt.show()
plt.figure(figsize=(8, 6))
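# Scatter plot of the two numerical features (the original plot call was lost; assumed)
sns.scatterplot(x='feature1', y='feature2', hue='category', data=df)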
plt.show()
# Box plot to visualize distributions
plt.figure(figsize=(8, 6))
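# Box plot call (lost in extraction; assumed to show the numeric feature by category)
sns.boxplot(x='category', y='feature2', data=df)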
plt.show()
# Violin plot to show the distribution of a numerical variable across different categories
plt.figure(figsize=(10, 8))
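# Violin plot call (lost in extraction; assumed)
sns.violinplot(x='category', y='feature1', data=df)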
plt.show()
Output:
17. Explore techniques to handle missing data and outliers in a dataset.
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
np.random.seed(42)
# sample data with missing values and outliers (the original dictionary was lost in extraction; assumed)
data = {
    'Feature1': np.concatenate([np.random.normal(50, 10, 95), [np.nan] * 5]),              # missing values
    'Feature2': np.concatenate([np.random.normal(30, 5, 95), [120, 130, 140, 150, 160]]),  # outliers
    'Feature3': np.concatenate([np.random.normal(10, 2, 95), [60, 65, 70, 75, 80]])        # outliers
}
df = pd.DataFrame(data)
# Handling Outliers
z_scores = np.abs(stats.zscore(df['Feature3']))
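# Missing-value handling and outlier removal (these steps were lost in extraction;
# mean imputation and a |z| < 3 rule are assumed)
df['Feature1'] = df['Feature1'].fillna(df['Feature1'].mean())
df_no_outliers = df[z_scores < 3]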
# Visualizing the original and cleaned data
plt.figure(figsize=(12, 6))
plt.subplot(2, 2, 1)
sns.boxplot(x=df['Feature2'])
plt.subplot(2, 2, 2)
sns.boxplot(x=df_no_outliers['Feature2'])
plt.subplot(2, 2, 3)
sns.histplot(df['Feature3'], kde=True)
plt.subplot(2, 2, 4)
sns.histplot(df_no_outliers['Feature3'], kde=True)
plt.tight_layout()
plt.show()
Output:
18. Perform Named Entity Recognition (NER) on text using NLTK and spaCy.
Code:
import nltk
import spacy
from nltk import word_tokenize, pos_tag, ne_chunk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
def extract_entities_nltk(text):
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    named_entities = ne_chunk(tagged_words)
    entities = []
    # walk the chunk tree; subtrees are the named entities (loop reconstructed)
    for entity in named_entities:
        if isinstance(entity, nltk.Tree):
            entities.append((' '.join(word for word, tag in entity.leaves()), entity.label()))
    return entities
def extract_entities_spacy(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities
# Sample text
text = "Apple Inc. is planning to open a new store in New York City."
entities_nltk = extract_entities_nltk(text)
print("Named Entities (NLTK):", entities_nltk)
entities_spacy = extract_entities_spacy(text)
print("Named Entities (spaCy):", entities_spacy)
Output:
19. Build a forecasting model for a time series dataset, and evaluate its
accuracy.
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(42)
# Create a DataFrame
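# The synthetic series and the split size were lost in extraction; a sketch assuming a
# daily series with a mild trend plus noise
dates = pd.date_range('2022-01-01', periods=200, freq='D')
values = 50 + 0.1 * np.arange(200) + np.random.normal(0, 2, 200)
df = pd.DataFrame({'Value': values}, index=dates)
train_size = int(len(df) * 0.8)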
train, test = df.iloc[:train_size], df.iloc[train_size:]
plt.figure(figsize=(12, 6))
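# Plot the training and test portions (the plot calls were lost in extraction; assumed)
plt.plot(train.index, train['Value'], label='Train')
plt.plot(test.index, test['Value'], label='Test')
plt.title('Time Series Data - Training and Test Sets')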
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.show()
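# Fit a forecasting model (the original model definition was lost in extraction;
# an ARIMA(1, 1, 1) from statsmodels is assumed here)
from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(train['Value'], order=(1, 1, 1))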
fit_model = model.fit()
predictions = fit_model.forecast(steps=len(test))
# Evaluate the accuracy of the model using Root Mean Squared Error (RMSE)
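# (reconstruction of the lost RMSE computation)
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(test['Value'], predictions))
print(f'RMSE: {rmse:.3f}')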
plt.figure(figsize=(12, 6))
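# Plot actual vs. forecast values (the plot calls were lost in extraction; assumed)
plt.plot(test.index, test['Value'], label='Actual')
plt.plot(test.index, predictions, label='Forecast')
plt.title('Forecast vs. Actual')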
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.show()
Output:
20. Design and implement a time series forecasting model to predict future
temperatures. Evaluate the model's performance and visualize the predictions
against the actual temperatures.
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(42)
# Create a DataFrame
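# The synthetic temperature series and the train/test split were lost in extraction;
# a sketch assuming a daily series with yearly seasonality plus noise
dates = pd.date_range('2022-01-01', periods=365, freq='D')
temps = 20 + 10 * np.sin(2 * np.pi * np.arange(365) / 365) + np.random.normal(0, 2, 365)
df = pd.DataFrame({'Temperature': temps}, index=dates)
train_size = int(len(df) * 0.8)
train, test = df.iloc[:train_size], df.iloc[train_size:]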
plt.figure(figsize=(12, 6))
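# Plot the training and test portions (the plot calls were lost in extraction; assumed)
plt.plot(train.index, train['Temperature'], label='Train')
plt.plot(test.index, test['Temperature'], label='Test')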
plt.title('Hypothetical Daily Temperature Time Series Data - Training and Test Sets')
plt.xlabel('Date')
plt.ylabel('Temperature')
plt.legend()
plt.show()
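# Fit a forecasting model (the original model definition was lost in extraction;
# an ARIMA(1, 1, 1) from statsmodels is assumed here)
from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(train['Temperature'], order=(1, 1, 1))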
fit_model = model.fit()
predictions = fit_model.forecast(steps=len(test))
# Evaluate the accuracy of the model using Root Mean Squared Error (RMSE)
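# (reconstruction of the lost RMSE computation)
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(test['Temperature'], predictions))
print(f'RMSE: {rmse:.3f}')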
plt.figure(figsize=(12, 6))
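# Plot actual vs. forecast temperatures (the plot calls were lost in extraction; assumed)
plt.plot(test.index, test['Temperature'], label='Actual')
plt.plot(test.index, predictions, label='Forecast')
plt.title('Temperature Forecast vs. Actual')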
plt.xlabel('Date')
plt.ylabel('Temperature')
plt.legend()
plt.show()
Output: