Exp9 - Apriori - Ipynb - Colaboratory
Exp9 - Apriori - Ipynb - Colaboratory
Roll No:312027
Class: TE Experiment no: 9 Aim: To implement the Apriori Algorithm
import io

import pandas as pd
# Colab-only helper: `files.upload()` opens the browser upload widget.
from google.colab import files

# `uploaded` is indexed by the uploaded file's name and yields its raw bytes,
# which are wrapped in a BytesIO so pandas can parse them as a CSV file.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['dataset_apriori.csv']))
print(df)
Choose Files No file chosen Upload widget is only available when the cell has been executed in the
current browser session. Please rerun this cell to enable.
Saving dataset_apriori.csv to dataset_apriori.csv
tid items
0 1 Handphone,Laptop
1 2 Handphone,Charger,Laptop
2 3 Powerbank,Laptop,Charger,Handphone
3 4 Tablet,Laptop,Handphone
4 5 Handphone,Charger,Tablet
5 6 Tablet,Powerbank
6 7 Handphone,Laptop,Tablet,Charger
7 8 Charger,Handphone
8 9 Handphone,Powerbank
9 10 Laptop,Charger,Powerbank
dataset = pd.read_csv('dataset_apriori.csv')
dataset
tid items
0 1 Handphone,Laptop
1 2 Handphone,Charger,Laptop
2 3 Powerbank,Laptop,Charger,Handphone
3 4 Tablet,Laptop,Handphone
4 5 Handphone,Charger,Tablet
5 6 Tablet,Powerbank
6 7 Handphone,Laptop,Tablet,Charger
7 8 Charger,Handphone
8 9 Handphone,Powerbank
9 10 Laptop,Charger,Powerbank
df_items = dataset['items']
df_tid = dataset['tid']
df_items
0 Handphone,Laptop
1 Handphone,Charger,Laptop
2 Powerbank,Laptop,Charger,Handphone
3 Tablet,Laptop,Handphone
4 Handphone,Charger,Tablet
5 Tablet,Powerbank
6 Handphone,Laptop,Tablet,Charger
7 Charger,Handphone
8 Handphone,Powerbank
9 Laptop,Charger,Powerbank
Name: items, dtype: object
df_tid
0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
8 9
9 10
Name: tid, dtype: int64
type(df_items)
pandas.core.series.Series
comma_splitted_df
0 [Handphone, Laptop]
1 [Handphone, Charger, Laptop]
2 [Powerbank, Laptop, Charger, Handphone]
3 [Tablet, Laptop, Handphone]
4 [Handphone, Charger, Tablet]
5 [Tablet, Powerbank]
6 [Handphone, Laptop, Tablet, Charger]
7 [Charger, Handphone]
8 [Handphone, Powerbank]
9 [Laptop, Charger, Powerbank]
Name: items, dtype: object
# Encode every transaction: look each item name up in `dictionaries`
# (defined in an earlier cell) and keep the resulting codes sorted.
numbered_col = [
    sorted(dictionaries[item] for item in transaction)
    for transaction in comma_splitted_df
]
numbered_col
[[1, 2],
[1, 2, 3],
[1, 2, 3, 4],
[1, 2, 5],
[1, 3, 5],
[4, 5],
[1, 2, 3, 5],
[1, 3],
[1, 4],
[2, 3, 4]]
df
items
0 [1, 2]
1 [1, 2, 3]
2 [1, 2, 3, 4]
3 [1, 2, 5]
4 [1, 3, 5]
5 [4, 5]
6 [1, 2, 3, 5]
7 [1, 3]
8 [1, 4]
9 [2, 3, 4]
0 1 [1, 2]
1 2 [1, 2, 3]
2 3 [1, 2, 3, 4]
3 4 [1, 2, 5]
4 5 [1, 3, 5]
5 6 [4, 5]
6 7 [1, 2, 3, 5]
7 8 [1, 3]
8 9 [1, 4]
9 10 [2, 3, 4]
items = []
for i in range(len(df)):
    for j in range(len(df['items'][i])):
        items.append(df['items'][i][j])
items
[1,
2,
1,
2,
3,
1,
2,
3,
4,
1,
2,
5,
1,
3,
5,
4,
5,
1,
2,
3,
5,
1,
3,
1,
4,
2,
3,
4]
{1, 2, 3, 4, 5}
# Convert the unique items to a list. Sorting makes the order deterministic
# instead of relying on set iteration order.
list_unique_item = sorted(unique_item)
list_unique_item
[1, 2, 3, 4, 5]
from collections import Counter

# Tally every item occurrence in one pass instead of rescanning `items`
# once per unique value, then pair each unique item with its count.
item_counts = Counter(items)
count_unique = [(value, item_counts[value]) for value in list_unique_item]
count_unique
[(1, 8), (2, 6), (3, 6), (4, 4), (5, 4)]
candidate1_df
itemset sup
0 1 8
1 2 6
2 3 6
3 4 4
4 5 4
def filter_sup(candidate, minimum_sup=2):
    """Keep only the candidate rows whose support exceeds the threshold.

    Args:
        candidate: DataFrame with a ``sup`` column holding each itemset's
            support count.
        minimum_sup: Support threshold. The comparison is strict, so rows
            whose ``sup`` equals the threshold are dropped. Defaults to 2.

    Returns:
        The rows of ``candidate`` with ``sup > minimum_sup``. The original
        index labels are kept, so the result may have gaps in its index.
    """
    return candidate[candidate['sup'] > minimum_sup]
freq_itemset1
itemset sup
0 1 8
1 2 6
2 3 6
3 4 4
4 5 4
import numpy
def self_join(prev_freq_itemset):
self_join_candidate = []
for i in range(len(prev_freq_itemset['itemset'])):
for j in range((i+1), len(prev_freq_itemset['itemset'])):
itemset_i = prev_freq_itemset['itemset'][i]
itemset_j = prev_freq_itemset['itemset'][j]
if(type(itemset_i) == numpy.int64 and type(itemset_j) == numpy.int64):
itemset_i = {itemset_i}
itemset_j = {itemset_j}
union_candidate = itemset_i.union(itemset_j)
candidate2_list = self_join(freq_itemset1)
candidate2_list
[{1, 2},
{1, 3},
{1, 4},
{1, 5},
{2, 3},
{2, 4},
{2, 5},
{3, 4},
{3, 5},
{4, 5}]
count_candidate2 = []
count_candidate2
initial_df_candidate
itemset sup
0 {1, 2} 0
1 {1, 3} 0
2 {1, 4} 0
3 {1, 5} 0
4 {2, 3} 0
5 {2, 4} 0
6 {2, 5} 0
7 {3, 4} 0
8 {3, 5} 0
9 {4, 5} 0
items
0 [1, 2]
1 [1, 2, 3]
2 [1, 2, 3, 4]
3 [1, 2, 5]
4 [1, 3, 5]
5 [4, 5]
6 [1, 2, 3, 5]
7 [1, 3]
8 [1, 4]
9 [2, 3, 4]
#Increment the support by 1 each time a candidate is a subset of a transaction in Database D
return df_candidate
Database D dataframe
items
0 [1, 2]
1 [1, 2, 3]
2 [1, 2, 3, 4]
3 [1, 2, 5]
4 [1, 3, 5]
5 [4, 5]
6 [1, 2, 3, 5]
7 [1, 3]
8 [1, 4]
9 [2, 3, 4]
(Initial) Dataframe from Candidate with All zeros sup
itemset sup
0 {1, 2} 0
1 {1, 3} 0
2 {1, 4} 0
3 {1, 5} 0
4 {2, 3} 0
5 {2, 4} 0
6 {2, 5} 0
7 {3, 4} 0
8 {3, 5} 0
9 {4, 5} 0
count_candidate2_df
itemset sup
0 {1, 2} 5
1 {1, 3} 5
2 {1, 4} 2
3 {1, 5} 3
4 {2, 3} 4
5 {2, 4} 2
6 {2, 5} 2
7 {3, 4} 2
8 {3, 5} 2
9 {4, 5} 1
freq_itemset2
itemset sup
0 {1, 2} 5
1 {1, 3} 5
3 {1, 5} 3
4 {2, 3} 4
# Reset to a gap-free 0..n-1 index: filtering left freq_itemset2 with labels
# 0, 1, 3, 4, and self_join looks rows up as ['itemset'][i] for i in range(len).
freq_itemset2_reset = freq_itemset2.reset_index(drop=True)
freq_itemset2_reset
itemset sup
0 {1, 2} 5
1 {1, 3} 5
2 {1, 5} 3
3 {2, 3} 4
Self Join
print(freq_itemset2_reset)
self_join_result = self_join(freq_itemset2_reset)
print('self join result')
print(self_join_result)
itemset sup
0 {1, 2} 5
1 {1, 3} 5
2 {1, 5} 3
3 {2, 3} 4
self join result
[{1, 2, 3}, {1, 2, 5}, {1, 3, 5}, {1, 2, 3, 5}]
Pruning
def get_subset(candidate):
    """Return every subset of ``candidate`` obtained by dropping one element.

    Args:
        candidate: Indexable sequence of hashable items (e.g. a list built
            from a candidate itemset).

    Returns:
        A list of sets. The i-th set holds every element of ``candidate``
        except the one at position i. An empty ``candidate`` yields ``[]``.

    The subsets are also printed, as the notebook relies on that trace.
    """
    final = [
        {item for position, item in enumerate(candidate) if position != skipped}
        for skipped in range(len(candidate))
    ]
    print('Subset from {} : {}'.format(candidate, final))
    return final
if any(check) == False:
print(any(check))
print('Val', value)
else:
print('\nAll of {} subset contained in \n{}'.format(candidate_set, prev_freq_itemset
if value not in temp:
temp.append(value)
return temp
freq_itemset2_reset
itemset sup
0 {1, 2} 5
1 {1, 3} 5
2 {1, 5} 3
3 {2, 3} 4
# Print the one-element-smaller subsets of every self-join candidate.
for candidate_set in self_join_result:
    get_subset(list(candidate_set))
freq_itemset2_reset
itemset sup
0 {1, 2} 5
1 {1, 3} 5
2 {1, 5} 3
3 {2, 3} 4
{2, 3}
Check True
{1, 3}
Check True
{1, 2}
Check True
self_join_result
All of [{1, 2, 3}, {1, 2, 5}, {1, 3, 5}, {1, 2, 3, 5}] subset contained in
itemset sup
0 {1, 2} 5
1 {1, 3} 5
2 {1, 5} 3
3 {2, 3} 4
Subset from [1, 2, 3, 5] : [{2, 3, 5}, {1, 3, 5}, {1, 2, 5}, {1, 2, 3}]
Temp item {2, 3, 5}
candidate3_list
0 [1, 2]
1 [1, 2, 3]
2 [1, 2, 3, 4]
3 [1, 2, 5]
4 [1, 3, 5]
5 [4, 5]
6 [1, 2, 3, 5]
7 [1, 3]
8 [1, 4]
9 [2, 3, 4]
#Then check the newest candidate value
candidate3_list
[{1, 2, 3}, {1, 2, 5}, {1, 3, 5}]
count_candidate3_df = count_support(df, candidate3_list)
Database D dataframe
items
0 [1, 2]
1 [1, 2, 3]
2 [1, 2, 3, 4]
3 [1, 2, 5]
4 [1, 3, 5]
5 [4, 5]
6 [1, 2, 3, 5]
7 [1, 3]
8 [1, 4]
9 [2, 3, 4]
(Initial) Dataframe from Candidate with All zeros sup
itemset sup
0 {1, 2, 3} 0
1 {1, 2, 5} 0
2 {1, 3, 5} 0
count_candidate3_df
itemset sup
0 {1, 2, 3} 3
1 {1, 2, 5} 2
2 {1, 3, 5} 2
freq_itemset3 = filter_sup(count_candidate3_df)
freq_itemset3
itemset sup
0 {1, 2, 3} 3
All Frequent Itemset
#Let's see each frequent itemset (L)
freq_itemset1
itemset sup
0 1 8
1 2 6
2 3 6
3 4 4
4 5 4
freq_itemset2
itemset sup
0 {1, 2} 5
1 {1, 3} 5
3 {1, 5} 3
4 {2, 3} 4
freq_itemset3
itemset sup
0 {1, 2, 3} 3
frequent_itemset
itemset sup
0 1 8
1 2 6
2 3 6
3 4 4
4 5 4
0 {1, 2} 5
1 {1, 3} 5
3 {1, 5} 3
4 {2, 3} 4
0 {1, 2, 3} 3
#Reset the index
frequent_itemset_final = frequent_itemset.reset_index(drop=True)
frequent_itemset_final
itemset sup
0 1 8
1 2 6
2 3 6
3 4 4
4 5 4
5 {1, 2} 5
6 {1, 3} 5
7 {1, 5} 3
8 {2, 3} 4
9 {1, 2, 3} 3
Conclusion: Thus, we have successfully learned and implemented the Apriori algorithm from
scratch using Python.