Skip to content

Commit ad88d8e

Browse files
Final Code
1 parent 5ef04ae commit ad88d8e

File tree

11 files changed

+701
-0
lines changed

11 files changed

+701
-0
lines changed

.idea/.gitignore

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/CMPE257_finalProject.iml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/inspectionProfiles/profiles_settings.xml

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/misc.xml

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/modules.xml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/vcs.xml

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import os
2+
import pandas as pd
3+
import joblib
4+
from datetime import datetime
5+
from tqdm import tqdm
6+
from collections import defaultdict, Counter
7+
import math
8+
import numpy as np
9+
import random
10+
import copy
11+
import gc
12+
from gensim.models import Word2Vec
13+
from sklearn.neighbors import NearestNeighbors
14+
15+
def load_datasets_and_mappings():
    """Read the training events and the two event-type lookup objects from disk.

    Returns:
        A 3-tuple ``(training_data, id_to_type_mapping, type_to_id_mapping)``:
        the DataFrame read from ``train.parquet`` followed by the objects
        loaded from ``id2type.pkl`` and ``type2id.pkl``, in that order.
    """
    events = pd.read_parquet('../input/otto-full-optimized-memory-footprint/train.parquet')

    # NOTE: joblib.load unpickles its input, so these files must be trusted.
    id_lookup, type_lookup = (
        joblib.load(pickle_path)
        for pickle_path in (
            '../input/otto-full-optimized-memory-footprint/id2type.pkl',
            '../input/otto-full-optimized-memory-footprint/type2id.pkl',
        )
    )

    return events, id_lookup, type_lookup
22+
23+
24+
def preprocess_training_data(training_data, config):
    """Stringify item ids, subsample sessions and add a calendar-day column.

    Args:
        training_data: DataFrame with at least ``session``, ``aid`` and ``ts``
            columns. Its ``aid`` column is converted to ``str`` in place on
            the frame that is passed in (no defensive copy is made).
        config: Mapping whose ``'train_session_num'`` entry is the number of
            distinct sessions to keep. Values larger than the number of
            sessions present keep every session.

    Returns:
        A new DataFrame with a fresh 0..n-1 index that holds only the rows of
        the sampled sessions, plus a ``time_stamp`` column formatted as
        ``YYYY-MM-DD`` (``ts`` is interpreted as seconds since the epoch).
    """
    training_data['aid'] = training_data['aid'].astype('int32').astype('str')

    # Randomly sample sessions for training.
    all_sessions = training_data['session'].unique().tolist()
    # random.sample raises ValueError when asked for more elements than the
    # population holds, so cap the request at what is actually available.
    sample_size = min(config['train_session_num'], len(all_sessions))
    sampled_sessions = random.sample(all_sessions, sample_size)

    # isin builds one hash lookup instead of parsing a query expression that
    # references a (potentially multi-million element) Python list.
    keep_mask = training_data['session'].isin(sampled_sessions)
    training_data = training_data[keep_mask].reset_index(drop=True)

    training_data['time_stamp'] = (
        pd.to_datetime(training_data['ts'], unit='s').dt.strftime('%Y-%m-%d')
    )

    return training_data
35+
36+
37+
def generate_word2vec_embeddings(data):
    """Fit a Word2Vec model on per-session item sequences and return its vectors.

    Args:
        data: DataFrame with ``session`` and ``aid`` columns. Rows are grouped
            by ``session``; each group's ``aid`` values, in the row order
            already present in ``data``, form one training sentence.

    Returns:
        The ``wv`` attribute (keyed vectors) of the trained model.
    """
    per_session_items = data.groupby('session')['aid'].apply(list)
    sentences = per_session_items.tolist()

    # sg=1 selects skip-gram; min_count=1 keeps items that occur only once.
    trained_model = Word2Vec(sentences, min_count=1, sg=1)
    return trained_model.wv
46+
47+
48+
def recommend_items(session_items, word_vectors, nearest_neighbors, popular_items,
                    num_recommendations=20):
    """Recommend items for one session from embedding neighbours, padded with popular items.

    The session is represented by the mean embedding of those of its items
    that have a vector. The nearest vocabulary items to that mean are
    returned as item ids, skipping anything already in the session. If fewer
    than ``num_recommendations`` remain, the list is topped up from
    ``popular_items`` without repeating an item that is already recommended.

    Args:
        session_items: Item ids of the session (hashable, same key type as
            the vocabulary of ``word_vectors``).
        word_vectors: Keyed vectors supporting ``item in word_vectors``,
            ``word_vectors[item]`` and ``word_vectors.index_to_key``.
        nearest_neighbors: Fitted neighbour index exposing
            ``kneighbors(X, n_neighbors=...)``. It must have been fitted on
            ``word_vectors.vectors`` so that neighbour index ``i`` refers to
            ``word_vectors.index_to_key[i]``.
        popular_items: Fallback item ids, most popular first.
        num_recommendations: Maximum length of the returned list.

    Returns:
        A list of at most ``num_recommendations`` item ids.
    """
    known_vectors = [word_vectors[item] for item in session_items if item in word_vectors]
    seen_in_session = set(session_items)
    recommended_items = []

    if known_vectors:
        session_embedding = np.mean(known_vectors, axis=0)
        item_keys = word_vectors.index_to_key
        # Over-fetch by the number of distinct session items so that enough
        # candidates survive the "not already in session" filter; cap at the
        # vocabulary size because kneighbors rejects n_neighbors larger than
        # the number of fitted samples.
        wanted = min(num_recommendations + len(seen_in_session), len(item_keys))
        _, indices = nearest_neighbors.kneighbors([session_embedding], n_neighbors=wanted)
        # Neighbour indices are row positions in the fitted matrix; translate
        # them back to item ids instead of reading the embedding rows.
        for row_index in np.asarray(indices).ravel():
            candidate = item_keys[row_index]
            if candidate not in seen_in_session:
                recommended_items.append(candidate)
        recommended_items = recommended_items[:num_recommendations]

    if len(recommended_items) < num_recommendations:
        already_recommended = set(recommended_items)
        for item in popular_items:
            if len(recommended_items) >= num_recommendations:
                break
            if item not in already_recommended:
                recommended_items.append(item)
                already_recommended.add(item)

    return recommended_items
68+
69+
70+
def load_and_preprocess_test_data():
    """Load the test events and collect each session's item ids.

    Returns:
        dict mapping each ``session`` value to the list of that session's
        ``aid`` values (as strings), ordered by ``type`` and then ``ts``
        within the session.
    """
    test_data = pd.read_parquet('../input/otto-full-optimized-memory-footprint/test.parquet')
    test_data['aid'] = test_data['aid'].astype('int32').astype('str')

    # No formatted date column is built: only the per-session 'aid' lists
    # leave this function, and the sort below uses the raw 'ts' values.
    test_data = test_data.sort_values(["session", "type", "ts"])
    session_to_item_ids = test_data.groupby('session')['aid'].agg(list).to_dict()

    return session_to_item_ids
79+
80+
81+
def generate_recommendations(session_to_item_ids, word_vectors, nearest_neighbors, popular_items):
    """Produce a recommendation list for every session in the mapping.

    Args:
        session_to_item_ids: Mapping of session id to that session's items.
        word_vectors: Passed through unchanged to ``recommend_items``.
        nearest_neighbors: Passed through unchanged to ``recommend_items``.
        popular_items: Passed through unchanged to ``recommend_items``.

    Returns:
        Two parallel lists ``(session_ids, recommended_item_lists)`` in the
        iteration order of ``session_to_item_ids``.
    """
    # Progress is reported through tqdm while sessions are processed one by one.
    results = [
        (sid, recommend_items(items, word_vectors, nearest_neighbors, popular_items))
        for sid, items in tqdm(session_to_item_ids.items())
    ]

    session_ids = [sid for sid, _ in results]
    recommended_item_lists = [recs for _, recs in results]
    return session_ids, recommended_item_lists
91+
92+
93+
def create_submission_file(session_ids, recommended_item_lists, id_to_type_mapping,
                           output_path='submission.csv'):
    """Write the recommendations as a CSV with one row per session and event type.

    Every session's label string is repeated for type ids 0, 1 and 2; the
    ``session_type`` value is ``"<session_id>_<type name>"`` where the type
    name is ``id_to_type_mapping[type_id]``.

    Args:
        session_ids: Session identifiers, parallel to ``recommended_item_lists``.
        recommended_item_lists: One iterable of item ids per session; the
            items are joined with single spaces to form the ``labels`` value.
        id_to_type_mapping: Object indexable by 0, 1 and 2 that yields the
            type names.
        output_path: Destination CSV path; defaults to ``'submission.csv'``
            in the current working directory.

    Returns:
        None. The CSV has columns ``session_type`` and ``labels`` and no
        index column; rows are grouped by type id in the order 0, 1, 2.
    """
    labels = [' '.join(str(item) for item in item_list) for item_list in recommended_item_lists]

    per_type_frames = []
    for type_ in (0, 1, 2):
        type_name = id_to_type_mapping[type_]
        per_type_frames.append(pd.DataFrame({
            'session_type': [f'{session_id}_{type_name}' for session_id in session_ids],
            'labels': labels,
        }))

    submission_df = pd.concat(per_type_frames, axis=0)
    submission_df.to_csv(output_path, index=False)
107+
108+
109+
def main():
    """Run the pipeline: train embeddings, index them, recommend, write the submission."""
    config = {'train_session_num': 12899779}
    training_data, id_to_type_mapping, _ = load_datasets_and_mappings()
    training_data = preprocess_training_data(training_data, config)
    word_vectors = generate_word2vec_embeddings(training_data)

    # The popularity ranking has to be taken before the training frame is
    # released below; reading it afterwards would hit a deleted local.
    popular_items = list(training_data['aid'].value_counts().index)

    # Index one row per vocabulary item: row i of `word_vectors.vectors`
    # belongs to `word_vectors.index_to_key[i]`, which is what lets a
    # neighbour index be translated back into an item id.
    nearest_neighbors = NearestNeighbors(metric='cosine')
    nearest_neighbors.fit(word_vectors.vectors)

    # Free the training frame before the test data is loaded.
    del training_data
    gc.collect()

    session_to_item_ids = load_and_preprocess_test_data()
    session_ids, recommended_item_lists = generate_recommendations(
        session_to_item_ids, word_vectors, nearest_neighbors, popular_items
    )
    create_submission_file(session_ids, recommended_item_lists, id_to_type_mapping)


if __name__ == "__main__":
    main()

Algortihms/popularity-based-rec-sys/otto-popularity-based.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)