In [68]: #TASK 1
#import all required libraray import pandas as pd import math import numpy
as np from scipy import sparse from scipy.stats import uniform from
sklearn.feature_extraction.text import TfidfVectorizer
# input data string corpus = ['this is the first document',
'this document is the second document', 'and this is the third one', 'is
this the first document'] # use fit method to compute Bag of words
vectorizer = TfidfVectorizer() vectorizer.fit(corpus) skl_output =
vectorizer.transform(corpus) bow=vectorizer.get_feature_names() print(bow)
IDF_reference=vectorizer.idf_ print(IDF_reference) #compute IDF using
custom method
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer() vectors = vectorizer.fit_transform(corpus)
matrix = CountVectorizer() matrix.fit(corpus) # after this statement the
matrix will build the vocabulary with all th e unique words
Create PDF in your applications with the Pdfcrowd HTML to PDF API PDFCROWD
# you should call this function only after fit()
# to convert the sentance into numerical vectors, we will call transfor
m() # the first feature name will corresponds to first column in
transforme d matrix # the 2nd feature name will corresponds to 2nd column
in transformed ma trix
print(matrix.transform(corpus).toarray())
# Here we will print the sklearn tfidf vectorizer idf values after appl
ying the fit method # After using the fit function on the corpus the vocab
has 9 words in i t, and each has its idf value.
#compute IDF using custom method
for i in range(len(bow)):
Y=0 word=bow[i] for j in range(len(corpus)):
list[j]=corpus[j].split()
if(word in list[j]): #print(word) #print(list[j]) Y=Y+1 X=len(corpus)
XY=math.log((1+X)/(1+Y)) IDF_custom=XY+1 print(IDF_custom)
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'th
is'] [1 91629073 1 22314355 1 51082562 1 1 91629073 1 91629073 Create PDF
in your applications with the Pdfcrowd HTML to PDF API PDFCROWD
[1.91629073 1.22314355 1.51082562 1. 1.91629073 1.91629073
1. 1.91629073 1. ] [[0 1 1 1 0 0 1 0 1] [0 2 0 1 0 1 1 0 1] [1 0 0 1 1 0 1
1 1] [0 1 1 1 0 0 1 0 1]] 1.916290731874155 1.2231435513142097
1.5108256237659907 1.0 1.916290731874155 1.916290731874155 1.0
1.916290731874155 1.0
In [15]: #TASK2
import pickle import numpy as np with
open("E:\Applied_AI\Assignments\cleaned_strings","rb") as f:
data = pickle.load(f) # printing the length of the corpus loaded
print("Number of documents in data = ",len(data))
#call all usique words using fit and tranform function from
sklearn.feature_extraction.text import TfidfVectorizer vectorizer =
TfidfVectorizer() vectorizer.fit(data) skl_output =
vectorizer.transform(data) bow=vectorizer.get_feature_names()
#compute IDF IDF=vectorizer.idf_
#sort IDF in descending order sorted_IDF=np.sort(IDF)
required_IDF=sorted_IDF[::-1]
#print top 50 IDF values
Create PDF in your applications with the Pdfcrowd HTML to PDF API PDFCROWD
print(required_IDF[0:49])
Number of documents in data = 746 [6.922918 6.922918 6.922918 6.922918
6.922918 6.922918 6.922918 6.92291 8 6.922918 6.922918 6.922918 6.922918
6.922918 6.922918 6.922918 6.92291 8 6.922918 6.922918 6.922918 6.922918
6.922918 6.922918 6.922918 6.92291 8 6.922918 6.922918 6.922918 6.922918
6.922918 6.922918 6.922918 6.92291 8 6.922918 6.922918 6.922918 6.922918
6.922918 6.922918 6.922918 6.92291 8 6.922918 6.922918 6.922918 6.922918
6.922918 6.922918 6.922918 6.92291 8 6.922918]
Create PDF in your applications with the Pdfcrowd HTML to PDF API PDFCROWD