0% found this document useful (0 votes)
4 views5 pages

Delta SQL Analytics Queries

The document contains a series of SQL queries for Delta Lake SQL Analytics, covering various analytical tasks such as calculating cumulative revenue, detecting duplicates, and analyzing user behavior. It includes queries for generating moving averages, pivoting sales data, and identifying trends in user purchases and transactions. Additionally, it addresses performance optimizations and schema drift detection in Delta tables.

Uploaded by

adinarayana
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
4 views5 pages

Delta SQL Analytics Queries

The document contains a series of SQL queries for Delta Lake SQL Analytics, covering various analytical tasks such as calculating cumulative revenue, detecting duplicates, and analyzing user behavior. It includes queries for generating moving averages, pivoting sales data, and identifying trends in user purchases and transactions. Additionally, it addresses performance optimizations and schema drift detection in Delta tables.

Uploaded by

adinarayana
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 5

Delta Lake SQL Analytics Queries

-- 1. Cumulative revenue per day


SELECT
order_date,
SUM(revenue) OVER (ORDER BY order_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT
ROW) AS cumulative_revenue
FROM sales_table;

-- 2. First and last login per user


SELECT
user_id,
MIN(login_time) AS first_login,
MAX(login_time) AS last_login
FROM event_logs
GROUP BY user_id;

-- 3. Top 3 customers by spend in each region


SELECT *
FROM (
SELECT *,
RANK() OVER (PARTITION BY region ORDER BY spend DESC) AS rank
FROM customer_spend
) ranked
WHERE rank <= 3;

-- 4. Detect and delete duplicate records


-- Detect duplicates
SELECT *, COUNT(*) OVER (PARTITION BY txn_id, txn_time, amount) AS dup_count
FROM transactions
HAVING dup_count > 1;

-- Delete extras (keep one)


DELETE FROM transactions
WHERE row_id NOT IN (
SELECT MIN(row_id)
FROM transactions
GROUP BY txn_id, txn_time, amount
);

-- 5. Users who made purchases in 3 consecutive months


WITH monthly AS (
SELECT user_id,
DATE_TRUNC('month', purchase_date) AS month
FROM purchases
GROUP BY user_id, DATE_TRUNC('month', purchase_date)
),
consecutive AS (
SELECT user_id, month,
LAG(month, 1) OVER (PARTITION BY user_id ORDER BY month) AS prev1,
LAG(month, 2) OVER (PARTITION BY user_id ORDER BY month) AS prev2
FROM monthly
)
SELECT DISTINCT user_id
FROM consecutive
WHERE DATEDIFF(month, prev1, month) = 1
AND DATEDIFF(month, prev2, prev1) = 1;

-- 6. Skewed joins and broadcast fix


-- Without fix
SELECT * FROM large_fact f
JOIN small_dim d ON f.dim_id = d.id;

-- With broadcast hint


SELECT /*+ BROADCAST(d) */ * FROM large_fact f
JOIN small_dim d ON f.dim_id = d.id;

-- 7. 7-day moving average of product sales


SELECT
product_id,
sale_date,
AVG(sales) OVER (
PARTITION BY product_id
ORDER BY sale_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS moving_avg_7d
FROM product_sales;

-- 8. Pivot daily sales into month-wise columns


SELECT product_id,
SUM(CASE WHEN MONTH(sale_date) = 1 THEN sales ELSE 0 END) AS jan_sales,
SUM(CASE WHEN MONTH(sale_date) = 2 THEN sales ELSE 0 END) AS feb_sales,
SUM(CASE WHEN MONTH(sale_date) = 3 THEN sales ELSE 0 END) AS mar_sales,
SUM(CASE WHEN MONTH(sale_date) = 4 THEN sales ELSE 0 END) AS apr_sales,
SUM(CASE WHEN MONTH(sale_date) = 5 THEN sales ELSE 0 END) AS may_sales,
SUM(CASE WHEN MONTH(sale_date) = 6 THEN sales ELSE 0 END) AS jun_sales,
SUM(CASE WHEN MONTH(sale_date) = 7 THEN sales ELSE 0 END) AS jul_sales,
SUM(CASE WHEN MONTH(sale_date) = 8 THEN sales ELSE 0 END) AS aug_sales,
SUM(CASE WHEN MONTH(sale_date) = 9 THEN sales ELSE 0 END) AS sep_sales,
SUM(CASE WHEN MONTH(sale_date) = 10 THEN sales ELSE 0 END) AS oct_sales,
SUM(CASE WHEN MONTH(sale_date) = 11 THEN sales ELSE 0 END) AS nov_sales,
SUM(CASE WHEN MONTH(sale_date) = 12 THEN sales ELSE 0 END) AS dec_sales
FROM sales
GROUP BY product_id;

-- 9. Customers who bought products every month in a year


WITH monthly_purchase AS (
SELECT customer_id, MONTH(purchase_date) AS month
FROM purchases
WHERE YEAR(purchase_date) = 2024
GROUP BY customer_id, MONTH(purchase_date)
)
SELECT customer_id
FROM monthly_purchase
GROUP BY customer_id
HAVING COUNT(DISTINCT month) = 12;
-- 10. Rank products by sales per year
SELECT *,
RANK() OVER (PARTITION BY YEAR(sale_date) ORDER BY total_sales DESC) AS year_rank
FROM (
SELECT product_id,
YEAR(sale_date) AS year,
SUM(sales) AS total_sales
FROM sales
GROUP BY product_id, YEAR(sale_date)
);

-- 11. Employees earning more than their department average


SELECT *
FROM employees e
JOIN (
SELECT department_id, AVG(salary) AS avg_salary
FROM employees
GROUP BY department_id
) dept_avg
ON e.department_id = dept_avg.department_id
WHERE e.salary > dept_avg.avg_salary;

-- 12. Median transaction amount


WITH ranked AS (
SELECT amount,
ROW_NUMBER() OVER (ORDER BY amount) AS rn,
COUNT(*) OVER () AS cnt
FROM transactions
)
SELECT AVG(amount) AS median
FROM ranked
WHERE rn IN (FLOOR((cnt + 1)/2.0), CEIL((cnt + 1)/2.0));

-- 13. Users who placed first order in last 30 days


WITH first_order AS (
SELECT user_id, MIN(order_date) AS first_order_date
FROM orders
GROUP BY user_id
)
SELECT user_id
FROM first_order
WHERE first_order_date >= DATE_SUB(CURRENT_DATE(), 30);

-- 14. Compare price change between two dates


SELECT product_id,
MAX(CASE WHEN price_date = '2025-01-01' THEN price END) AS price_on_start,
MAX(CASE WHEN price_date = '2025-07-01' THEN price END) AS price_on_end,
MAX(CASE WHEN price_date = '2025-07-01' THEN price END)
- MAX(CASE WHEN price_date = '2025-01-01' THEN price END) AS price_diff
FROM product_prices
WHERE price_date IN ('2025-01-01', '2025-07-01')
GROUP BY product_id;

-- 15. Customers with first and last transaction on same day


WITH user_txns AS (
SELECT user_id,
MIN(txn_date) AS first_txn,
MAX(txn_date) AS last_txn
FROM transactions
GROUP BY user_id
)
SELECT user_id
FROM user_txns
WHERE first_txn = last_txn;

-- 16. Percentage of returning users per month


WITH firsts AS (
SELECT user_id, MIN(DATE_TRUNC('month', order_date)) AS first_month
FROM orders
GROUP BY user_id
),
monthly_users AS (
SELECT DISTINCT user_id, DATE_TRUNC('month', order_date) AS month
FROM orders
)
SELECT month,
COUNT(CASE WHEN first_month <> month THEN user_id END) * 100.0 / COUNT(user_id)
AS returning_user_pct
FROM monthly_users
LEFT JOIN firsts USING (user_id)
GROUP BY month;

-- 17. Products never sold


SELECT p.product_id
FROM products p
LEFT JOIN sales s ON p.product_id = s.product_id
WHERE s.product_id IS NULL;

-- 18. Detect schema drift in Delta


DESCRIBE HISTORY delta.`/path/to/delta_table`;

-- 19. Departments where >=2 employees have identical salaries


SELECT department_id
FROM employees
GROUP BY department_id, salary
HAVING COUNT(*) >= 2;

-- 20. Group users by login streaks of 3+ consecutive days


WITH ranked AS (
SELECT user_id,
login_date,
DATEDIFF(login_date, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY
login_date)) AS grp
FROM logins
),
grouped AS (
SELECT user_id, grp, COUNT(*) AS streak_len
FROM ranked
GROUP BY user_id, grp
)
SELECT user_id, streak_len
FROM grouped
WHERE streak_len >= 3;

You might also like