Data Science Coding Questions and Answers
1. Load CSV, clean data
import pandas as pd
df = pd.read_csv('data.csv')
df = df.drop_duplicates()
df = df.dropna()
df.columns = df.columns.str.lower().str.replace(' ', '_')
2. Pearson correlation from scratch
def pearson_corr(x, y):
mean_x = sum(x) / len(x)
mean_y = sum(y) / len(y)
num = sum((a - mean_x)*(b - mean_y) for a, b in zip(x, y))
den = (sum((a - mean_x)**2 for a in x) * sum((b - mean_y)**2 for b in y))**0.5
return num / den
3. DataFrame summary function
def summarize_df(df):
return {
'shape': df.shape,
'missing_values': df.isnull().sum().to_dict(),
'top_categoricals': {
col: df[col].value_counts().head(3).to_dict()
for col in df.select_dtypes(include='object')
}
}
4. Text feature engineering
df['num_chars'] = df['text'].str.len()
df['num_words'] = df['text'].str.split().apply(len)
df['avg_word_len'] = df['num_chars'] / df['num_words']
5. Binary column for top 10%
threshold = df['score'].quantile(0.9)
df['is_top_10_percent'] = df['score'] >= threshold
6. Datetime extraction
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['weekday'] = df['timestamp'].dt.day_name()
df['hour'] = df['timestamp'].dt.hour
df['month'] = df['timestamp'].dt.month
Data Science Coding Questions and Answers
7. Logistic regression metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
8. Correlation matrix
df.corr()
9. Flatten nested dictionary
def flatten_dict(d, parent_key='', sep='_'):
items = []
for k, v in d.items():
new_key = f"{parent_key}{sep}{k}" if parent_key else k
if isinstance(v, dict):
items.extend(flatten_dict(v, new_key, sep=sep).items())
else:
items.append((new_key, v))
return dict(items)
10. Timer decorator
import time
def timer(func):
def wrapper(*args, **kwargs):
start = time.time()
result = func(*args, **kwargs)
print(f"{func.__name__} took {time.time() - start:.4f} seconds")
return result
return wrapper
11. Mini-batch generator
def batch_generator(data, batch_size):
for i in range(0, len(data), batch_size):
yield data[i:i+batch_size]
12. SQL: Find top 5 customers by purchase amount
SELECT customer_id, SUM(amount) AS total_spent
Data Science Coding Questions and Answers
FROM transactions
GROUP BY customer_id
ORDER BY total_spent DESC
LIMIT 5;
13. SQL: Count of users per country
SELECT country, COUNT(*) AS user_count
FROM users
GROUP BY country;
14. SQL: Users who made more than 3 purchases in last 30 days
SELECT user_id
FROM purchases
WHERE purchase_date >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY user_id
HAVING COUNT(*) > 3;
15. SQL: Revenue per category
SELECT category, SUM(price * quantity) AS revenue
FROM sales
GROUP BY category;