Advanced SQL Interview Questions with Delta Lake
1. Write a SQL query using Delta tables to compute cumulative revenue per day
SELECT
order_date,
SUM(revenue) OVER (ORDER BY order_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS
cumulative_revenue
FROM delta.`/mnt/delta/sales_data`
2. Retrieve the first and last login per user from a huge event log table
SELECT
user_id,
MIN(event_time) AS first_login,
MAX(event_time) AS last_login
FROM delta.`/mnt/delta/event_logs`
WHERE event_type = 'login'
GROUP BY user_id
3. Find the top 3 customers by spend in each region (use window functions)
SELECT *
FROM (
SELECT
region,
customer_id,
SUM(spend) AS total_spend,
RANK() OVER (PARTITION BY region ORDER BY SUM(spend) DESC) AS rank
FROM delta.`/mnt/delta/sales`
GROUP BY region, customer_id
) t
WHERE rank <= 3
4. Detect duplicate records in a transactional table and delete extras safely
WITH ranked AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY txn_id ORDER BY last_updated DESC) AS rn
FROM delta.`/mnt/delta/transactions`
)
DELETE FROM delta.`/mnt/delta/transactions` WHERE txn_id IN (
SELECT txn_id FROM ranked WHERE rn > 1
)
5. Get users who made purchases in 3 consecutive months
WITH monthly_purchase AS (
SELECT DISTINCT user_id, DATE_TRUNC('month', purchase_date) AS month
FROM delta.`/mnt/delta/purchases`
),
ranked AS (
Advanced SQL Interview Questions with Delta Lake
SELECT user_id, month,
DENSE_RANK() OVER (PARTITION BY user_id ORDER BY month) AS rnk
FROM monthly_purchase
),
streaks AS (
SELECT user_id, COUNT(*) as cnt
FROM (
SELECT user_id, rnk - MONTH(month) AS grp
FROM ranked
) g
GROUP BY user_id, grp
)
SELECT user_id FROM streaks WHERE cnt >= 3
6. Identify skewed joins in SQL and propose fixes using broadcast hints
-- Skewed Join Example
SELECT /*+ BROADCAST(d) */ s.*, d.region
FROM sales s
JOIN /* smaller dimension */ dim_region d ON s.region_id = d.id
-- Use BROADCAST for small dimension tables to avoid skew
7. Compute a 7-day moving average of product sales on Delta Lake
SELECT
product_id,
sales_date,
AVG(sales_qty) OVER (
PARTITION BY product_id
ORDER BY sales_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS moving_avg
FROM delta.`/mnt/delta/product_sales`
8. Pivot daily sales into month-wise columns
SELECT
product_id,
SUM(CASE WHEN MONTH(sales_date) = 1 THEN sales_qty ELSE 0 END) AS Jan,
SUM(CASE WHEN MONTH(sales_date) = 2 THEN sales_qty ELSE 0 END) AS Feb,
...
FROM delta.`/mnt/delta/sales`
GROUP BY product_id
9. Find customers who bought products every month in a year
WITH monthly_activity AS (
SELECT customer_id, MONTH(purchase_date) AS month
FROM delta.`/mnt/delta/purchases`
WHERE YEAR(purchase_date) = 2024
Advanced SQL Interview Questions with Delta Lake
GROUP BY customer_id, month
)
SELECT customer_id
FROM monthly_activity
GROUP BY customer_id
HAVING COUNT(DISTINCT month) = 12
10. Rank products by sales per year, resetting rank each year
SELECT
product_id,
YEAR(sale_date) AS year,
SUM(units_sold) AS total,
RANK() OVER (PARTITION BY YEAR(sale_date) ORDER BY SUM(units_sold) DESC) AS rank
FROM delta.`/mnt/delta/sales`
GROUP BY product_id, YEAR(sale_date)
11. Find employees earning more than their department average
SELECT *
FROM employees e
JOIN (
SELECT department_id, AVG(salary) AS dept_avg
FROM employees
GROUP BY department_id
) d ON e.department_id = d.department_id
WHERE e.salary > d.dept_avg
12. Find the median transaction amount (no built-in median)
WITH ranked AS (
SELECT amount,
ROW_NUMBER() OVER (ORDER BY amount) AS rn,
COUNT(*) OVER () AS total
FROM delta.`/mnt/delta/transactions`
)
SELECT AVG(amount) AS median
FROM ranked
WHERE rn IN (FLOOR((total+1)/2.0), CEIL((total+1)/2.0))
13. Get all users who placed their first order in the last 30 days
WITH first_orders AS (
SELECT user_id, MIN(order_date) AS first_order
FROM delta.`/mnt/delta/orders`
GROUP BY user_id
)
SELECT * FROM first_orders
WHERE first_order >= current_date() - INTERVAL 30 DAYS
Advanced SQL Interview Questions with Delta Lake
14. Compare price change between two dates for each product
SELECT
p1.product_id,
p1.price AS price_start,
p2.price AS price_end,
(p2.price - p1.price) AS price_diff
FROM
delta.`/mnt/delta/prices` p1
JOIN
delta.`/mnt/delta/prices` p2
ON p1.product_id = p2.product_id
WHERE p1.date = '2024-01-01' AND p2.date = '2024-12-31'
15. Identify customers whose first and last transaction is on the same day
WITH txn_dates AS (
SELECT customer_id,
MIN(txn_date) AS first_txn,
MAX(txn_date) AS last_txn
FROM delta.`/mnt/delta/transactions`
GROUP BY customer_id
)
SELECT * FROM txn_dates
WHERE first_txn = last_txn
16. Calculate the percentage of returning users for each month
WITH first_seen AS (
SELECT user_id, MIN(DATE_TRUNC('month', login_date)) AS first_month
FROM delta.`/mnt/delta/logins`
GROUP BY user_id
),
monthly_users AS (
SELECT DISTINCT user_id, DATE_TRUNC('month', login_date) AS login_month
FROM delta.`/mnt/delta/logins`
)
SELECT
login_month,
COUNT(DISTINCT CASE WHEN first_month < login_month THEN user_id END) * 100.0 / COUNT(DISTINCT
user_id) AS returning_pct
FROM monthly_users
LEFT JOIN first_seen USING (user_id)
GROUP BY login_month
17. Retrieve products that have never been sold
SELECT p.product_id
FROM delta.`/mnt/delta/products` p
LEFT JOIN delta.`/mnt/delta/sales` s ON p.product_id = s.product_id
Advanced SQL Interview Questions with Delta Lake
WHERE s.product_id IS NULL
18. Detect schema drift in historical Delta data snapshots
DESCRIBE HISTORY delta.`/mnt/delta/your_table`
-- Look for changes in the 'operationMetrics' and 'operationParameters' columns
19. Find departments where at least 2 employees have identical salaries
SELECT department_id
FROM employees
GROUP BY department_id, salary
HAVING COUNT(*) >= 2
20. Group users by login streaks of 3+ consecutive days
WITH daily_logins AS (
SELECT DISTINCT user_id, login_date FROM delta.`/mnt/delta/logins`
),
ranked AS (
SELECT user_id, login_date,
DATEDIFF(login_date, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY login_date)) AS grp
FROM daily_logins
),
streaks AS (
SELECT user_id, COUNT(*) AS streak_len
FROM ranked
GROUP BY user_id, grp
)
SELECT user_id FROM streaks WHERE streak_len >= 3