Pandas Syntax Revision for ML
Data Loading and Initial Exploration
Reading Data
# Basic data loading
df = pd.read_csv('file.csv')
df = pd.read_csv('file.csv', low_memory=False) # Read the whole file before inferring dtypes (avoids mixed-dtype warnings)
df = pd.read_csv('file.csv', index_col=0) # Set first column as index
df = pd.read_csv('file.csv', parse_dates=['date_col']) # Parse dates automatically
# Loading with specific parameters
df = pd.read_csv('file.csv',
                 sep=';',           # Different separator
                 encoding='utf-8',  # Handle special characters
                 nrows=1000)        # Load only first 1000 rows
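A minimal runnable sketch of these options together, using io.StringIO in place of a real file (the file contents and column names are made up):
import io
import pandas as pd
csv_text = "date;city;sales\n2024-01-01;Paris;10\n2024-01-02;Lyon;7\n"
df = pd.read_csv(io.StringIO(csv_text),  # a file path works the same way
                 sep=';',                # semicolon-separated
                 parse_dates=['date'],   # parse the date column
                 nrows=2)                # cap the number of rows read
print(df.dtypes)                         # 'date' becomes datetime64[ns]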
Basic Data Inspection
# Shape and basic info
df.shape # Returns (rows, columns)
df.info() # Data types, memory usage, non-null counts
df.describe() # Statistical summary for numerical columns
df.describe(include='all') # Summary for all columns including categorical
# Column information
df.columns # Column names
df.dtypes # Data types of each column
df.index # Index information
df.head(10) # First 10 rows
df.tail(5) # Last 5 rows
df.sample(3) # Random 3 rows
Data Selection and Indexing
Column Selection
# Single column
df['column_name'] # Returns Series
df[['column_name']] # Returns DataFrame
# Multiple columns
df[['col1', 'col2', 'col3']]
# Column selection by condition
numeric_cols = df.select_dtypes(include=[np.number]).columns
categorical_cols = df.select_dtypes(include=['object']).columns
df[numeric_cols] # Select only numeric columns
Row Selection
# By index position
df.iloc[0] # First row
df.iloc[0:5] # First 5 rows
df.iloc[:, 0:3] # All rows, first 3 columns
df.iloc[0:5, 0:3] # First 5 rows, first 3 columns
# By label
df.loc[0] # Row with index label 0
df.loc[0:4] # Rows with index 0 to 4 (inclusive)
df.loc[:, 'col1':'col3'] # All rows, columns from col1 to col3
df.loc[df['column'] > 5] # Conditional selection
Boolean Indexing
# Single condition
df[df['age'] > 25]
df[df['category'] == 'A']
df[df['name'].str.contains('John', na=False)] # na=False treats missing names as non-matches
# Multiple conditions
df[(df['age'] > 25) & (df['salary'] < 50000)]
df[(df['category'] == 'A') | (df['category'] == 'B')]
df[df['column'].isin(['value1', 'value2', 'value3'])]
# Negation
df[~df['column'].isin(['unwanted_value'])]
df[df['column'] != 'unwanted_value']
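A small self-contained demo of combined conditions and isin on made-up people data:
import pandas as pd
people = pd.DataFrame({'name': ['Ann', 'Bob', 'Cara', 'Dan'],
                       'age': [23, 31, 45, 29],
                       'salary': [40000, 62000, 48000, 39000]})
mask = (people['age'] > 25) & (people['salary'] < 50000)  # parentheses are required around each condition
print(people[mask])                                       # Cara and Dan
print(people[people['name'].isin(['Ann', 'Bob'])])        # rows for Ann and Bob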
Data Cleaning and Preprocessing
Handling Missing Values
# Detecting missing values
df.isnull().sum() # Count of missing values per column
df.isna().sum() # Same as isnull()
df.isnull().any() # Boolean - any missing values per column
df.info() # Shows non-null counts
# Removing missing values
df.dropna() # Drop rows with any missing values
df.dropna(subset=['col1']) # Drop rows where col1 is missing
df.dropna(axis=1) # Drop columns with any missing values
df.dropna(thresh=2) # Keep rows with at least 2 non-null values
# Filling missing values
df.fillna(0) # Fill with 0
df.ffill() # Forward fill (fillna(method='ffill') is deprecated)
df.bfill() # Backward fill
df['column'] = df['column'].fillna(df['column'].mean()) # Fill with mean (assign back; fillna returns a copy)
df['column'] = df['column'].fillna(df['column'].mode()[0]) # Fill with mode (first modal value)
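A quick sketch on toy data; dropna and fillna return new objects, so the result has to be assigned back:
import numpy as np
import pandas as pd
df = pd.DataFrame({'score': [1.0, np.nan, 3.0, np.nan], 'group': ['a', 'a', 'b', None]})
print(df.isna().sum())                                 # missing count per column
df['score'] = df['score'].fillna(df['score'].mean())   # mean of the non-missing scores is 2.0
df = df.dropna(subset=['group'])                       # drops the row with a missing group
print(df)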
Data Type Conversion
# Converting data types
df['column'] = df['column'].astype('int64')
df['column'] = df['column'].astype('float64')
df['column'] = df['column'].astype('category')
df['date_col'] = pd.to_datetime(df['date_col'])
# Converting multiple columns
cols_to_convert = ['col1', 'col2', 'col3']
df[cols_to_convert] = df[cols_to_convert].astype('float64')
Duplicate Handling
# Finding duplicates
df.duplicated().sum() # Count of duplicate rows
df.duplicated(subset=['col1']).sum() # Duplicates based on specific column
# Removing duplicates
df.drop_duplicates() # Remove duplicate rows
df.drop_duplicates(subset=['col1']) # Remove duplicates based on column
df.drop_duplicates(keep='last') # Keep last occurrence
Data Transformation
Creating New Columns
# Simple operations
df['new_col'] = df['col1'] + df['col2']
df['ratio'] = df['numerator'] / df['denominator']
df['log_price'] = np.log(df['price'])
# Conditional column creation
df['category'] = df['score'].apply(lambda x: 'High' if x > 80 else 'Low')
df['grade'] = np.where(df['score'] > 90, 'A',
                       np.where(df['score'] > 80, 'B', 'C'))
# Using map for categorical mapping
mapping = {'small': 1, 'medium': 2, 'large': 3}
df['size_numeric'] = df['size'].map(mapping)
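A compact demo of np.where and map on made-up scores and sizes:
import numpy as np
import pandas as pd
df = pd.DataFrame({'score': [95, 85, 60], 'size': ['small', 'large', 'medium']})
df['grade'] = np.where(df['score'] > 90, 'A',
                       np.where(df['score'] > 80, 'B', 'C'))  # vectorised if/elif/else
df['size_numeric'] = df['size'].map({'small': 1, 'medium': 2, 'large': 3})
print(df)  # grades A, B, C and sizes 1, 3, 2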
String Operations
# Basic string operations
df['column'].str.lower() # Convert to lowercase
df['column'].str.upper() # Convert to uppercase
df['column'].str.strip() # Remove leading/trailing whitespace
df['column'].str.replace('old', 'new') # Replace text
# String splitting and extraction
df['column'].str.split('_') # Split by underscore
df['column'].str.split('_', expand=True) # Split into separate columns
df['column'].str[0] # First character
df['column'].str[:3] # First 3 characters
df['column'].str.extract(r'(\d+)') # Extract numbers using regex (raw string avoids escape warnings)
# String conditions
df['column'].str.contains('pattern')
df['column'].str.startswith('prefix')
df['column'].str.endswith('suffix')
df['column'].str.len() # Length of strings
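A small example of split with expand=True and regex extraction; the 'sku' column and its format are made up:
import pandas as pd
codes = pd.DataFrame({'sku': ['shoe_42_red', 'bag_7_blue']})
parts = codes['sku'].str.split('_', expand=True)                    # DataFrame with columns 0, 1, 2
parts.columns = ['item', 'item_id', 'colour']
codes = codes.join(parts)
codes['digits'] = codes['sku'].str.extract(r'(\d+)', expand=False)  # first run of digits, as strings
print(codes)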
Date and Time Operations
# Converting to datetime
df['date'] = pd.to_datetime(df['date'])
# Extracting date components
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['dayofweek'] = df['date'].dt.dayofweek
df['quarter'] = df['date'].dt.quarter
df['is_weekend'] = df['date'].dt.dayofweek >= 5
# Date arithmetic
df['days_ago'] = (pd.Timestamp.now() - df['date']).dt.days
df['date_plus_30'] = df['date'] + pd.Timedelta(days=30)
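A runnable sketch on a tiny made-up set of dates:
import pandas as pd
df = pd.DataFrame({'date': pd.to_datetime(['2024-01-05', '2024-01-06', '2024-01-08'])})
df['dayofweek'] = df['date'].dt.dayofweek                        # Monday=0 ... Sunday=6
df['is_weekend'] = df['date'].dt.dayofweek >= 5
df['days_since_start'] = (df['date'] - df['date'].min()).dt.days
print(df)  # 2024-01-06 is a Saturday, so is_weekend is True on that row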
Grouping and Aggregation
Basic GroupBy Operations
# Single column grouping
df.groupby('category').mean(numeric_only=True) # Mean of numeric columns per group
df.groupby('category').sum(numeric_only=True)
df.groupby('category').count() # Non-null values per column
df.groupby('category').size() # Rows per group (NaN values included)
# Multiple column grouping
df.groupby(['category', 'subcategory']).mean()
# Specific column aggregation
df.groupby('category')['price'].mean()
df.groupby('category')['price'].agg(['mean', 'std', 'count'])
Advanced Aggregation
# Multiple aggregations
df.groupby('category').agg({
    'price': ['mean', 'std'],
    'quantity': 'sum',
    'date': 'count'
})
# Custom aggregation functions
df.groupby('category')['price'].agg(lambda x: x.max() - x.min())
# Apply custom functions
def custom_function(group):
    return group['price'].mean() / group['quantity'].sum()
df.groupby('category').apply(custom_function)
Transform and Filter
# Transform (returns same shape as original)
df['price_group_mean'] = df.groupby('category')['price'].transform('mean') # Group mean broadcast to every row
df['price_zscore'] = df.groupby('category')['price'].transform(lambda x: (x - x.mean()) / x.std())
# Filter groups
df.groupby('category').filter(lambda x: len(x) > 10) # Groups with >10 records
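A short demo on toy sales data contrasting agg (one row per group) with transform (one value per original row):
import pandas as pd
sales = pd.DataFrame({'category': ['a', 'a', 'b', 'b'],
                      'price': [10, 20, 100, 300]})
print(sales.groupby('category')['price'].agg(['mean', 'count']))            # 2 rows, one per group
sales['group_mean'] = sales.groupby('category')['price'].transform('mean')  # broadcast back to 4 rows
print(sales)  # group_mean is 15, 15, 200, 200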
Merging and Joining
Concatenation
# Vertical concatenation (stacking rows)
combined = pd.concat([df1, df2], axis=0, ignore_index=True)
# Horizontal concatenation (side by side)
combined = pd.concat([df1, df2], axis=1)
# With keys for identification
combined = pd.concat([df1, df2], keys=['train', 'test'])
Merging DataFrames
# Inner join (only matching records)
merged = pd.merge(df1, df2, on='key_column', how='inner')
# Left join (all records from left DataFrame)
merged = pd.merge(df1, df2, on='key_column', how='left')
# Outer join (all records from both DataFrames)
merged = pd.merge(df1, df2, on='key_column', how='outer')
# Multiple key columns
merged = pd.merge(df1, df2, on=['key1', 'key2'])
# Different column names
merged = pd.merge(df1, df2, left_on='col1', right_on='col2')
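A minimal merge example with two made-up tables keyed on customer_id:
import pandas as pd
orders = pd.DataFrame({'customer_id': [1, 2, 3], 'amount': [50, 75, 20]})
customers = pd.DataFrame({'customer_id': [1, 2], 'city': ['Paris', 'Lyon']})
inner = pd.merge(orders, customers, on='customer_id', how='inner')  # 2 rows, only matching ids
left = pd.merge(orders, customers, on='customer_id', how='left')    # 3 rows, NaN city for id 3
print(inner)
print(left)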
Reshaping Data
Pivot Operations
# Pivot table
pivot = df.pivot_table(values='sales',
                       index='date',
                       columns='product',
                       aggfunc='sum')
# Simple pivot
pivot = df.pivot(index='date', columns='product', values='sales') # Requires unique date/product pairs (no aggregation)
# Melt (wide to long format)
melted = df.melt(id_vars=['id', 'date'],
                 value_vars=['sales1', 'sales2'],
                 var_name='product',
                 value_name='sales')
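A tiny round trip on made-up sales data: pivot long to wide, then melt back to long:
import pandas as pd
long_df = pd.DataFrame({'date': ['2024-01', '2024-01', '2024-02'],
                        'product': ['A', 'B', 'A'],
                        'sales': [10, 5, 7]})
wide = long_df.pivot_table(values='sales', index='date', columns='product', aggfunc='sum')
print(wide)  # one row per date, one column per product
back = wide.reset_index().melt(id_vars='date', value_name='sales')
print(back)  # long format again; combinations missing from the data come back as NaN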
Stack and Unstack
# Stack (columns to rows)
stacked = df.stack()
# Unstack (rows to columns)
unstacked = df.unstack()
# Multi-level operations
df.unstack(level=0)
df.unstack(level='level_name') # Unstack a named index level
Statistical Operations
Descriptive Statistics
# Basic statistics
df['column'].mean()
df['column'].median()
df['column'].std()
df['column'].var()
df['column'].min()
df['column'].max()
df['column'].quantile(0.25) # 25th percentile
# Multiple statistics at once
df['column'].describe()
# Correlation
df.corr(numeric_only=True) # Correlation matrix (numeric columns only)
df['col1'].corr(df['col2']) # Correlation between two columns
Value Counts and Frequencies
# Count unique values
df['column'].value_counts()
df['column'].value_counts(normalize=True) # As percentages
df['column'].value_counts(dropna=False) # Include NaN counts
# Unique values
df['column'].unique()
df['column'].nunique() # Number of unique values
Advanced Techniques for Kaggle
Feature Engineering Patterns
# Binning continuous variables
df['age_group'] = pd.cut(df['age'], bins=[0, 25, 50, 75, 100],
                         labels=['Young', 'Adult', 'Middle', 'Senior'])
# Quantile-based binning
df['price_quartile'] = pd.qcut(df['price'], q=4, labels=['Low', 'Med-Low', 'Med-High', 'High'])
# Rolling statistics (time series)
df['rolling_mean'] = df['value'].rolling(window=7).mean()
df['rolling_std'] = df['value'].rolling(window=7).std()
# Lag features
df['value_lag1'] = df['value'].shift(1)
df['value_lag7'] = df['value'].shift(7)
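A small time-series sketch with made-up values; the first rows of the new features are NaN by construction:
import pandas as pd
ts = pd.DataFrame({'value': [1, 2, 4, 8, 16]})
ts['lag1'] = ts['value'].shift(1)                             # previous row's value
ts['rolling_mean_3'] = ts['value'].rolling(window=3).mean()   # NaN until 3 values are available
print(ts)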
Memory Optimization
# Reduce memory usage
def reduce_mem_usage(df):
    """Downcast numeric columns to the smallest dtype that safely holds their values."""
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            elif str(col_type)[:5] == 'float':
                df[col] = df[col].astype(np.float32)  # float64 -> float32
    return df
# Check memory usage
df.memory_usage(deep=True).sum()
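A hedged usage sketch for the reduce_mem_usage helper above on random made-up data, comparing memory before and after:
import numpy as np
import pandas as pd
df = pd.DataFrame({'small_int': np.random.randint(0, 100, size=10_000),  # values fit in int8
                   'big_float': np.random.rand(10_000)})
before = df.memory_usage(deep=True).sum()
df = reduce_mem_usage(df)   # helper defined above
after = df.memory_usage(deep=True).sum()
print(before, after, df.dtypes.to_dict())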
Efficient Data Processing
# Using query for complex filtering (often faster than boolean indexing on large DataFrames)
df.query('age > 25 and salary < 50000')
df.query('category in ["A", "B", "C"]')
# Using eval for complex calculations
df = df.eval('new_col = col1 * col2 + col3') # eval returns a new DataFrame; assign back (or pass inplace=True)
# Categorical data for memory efficiency
df['category'] = df['category'].astype('category')
Common Kaggle Competition Patterns
Train-Test Split Preparation
# Combining train and test for consistent preprocessing
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# Combine for preprocessing
combined = pd.concat([train_df, test_df], axis=0, ignore_index=True)
# After preprocessing, split back
train_processed = combined.iloc[:len(train_df), :].reset_index(drop=True)
test_processed = combined.iloc[len(train_df):, :].reset_index(drop=True)
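A self-contained sketch with small in-memory frames standing in for the CSV files; the target column is absent from test, so it becomes NaN in the combined frame:
import pandas as pd
train_df = pd.DataFrame({'feature': [1, 2, 3], 'target': [0, 1, 0]})
test_df = pd.DataFrame({'feature': [4, 5]})
combined = pd.concat([train_df, test_df], axis=0, ignore_index=True)
# ... shared preprocessing on `combined` goes here ...
train_processed = combined.iloc[:len(train_df)].reset_index(drop=True)
test_processed = combined.iloc[len(train_df):].drop(columns='target').reset_index(drop=True)
print(train_processed.shape, test_processed.shape)  # (3, 2) and (2, 1)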
Target Encoding and Feature Engineering
# Target encoding (mean encoding)
target_mean = train_df.groupby('category')['target'].mean()
train_df['category_encoded'] = train_df['category'].map(target_mean)
# Frequency encoding
freq_encoding = train_df['category'].value_counts()
train_df['category_freq'] = train_df['category'].map(freq_encoding)
# Interaction features
train_df['feature_interaction'] = train_df['feature1'] * train_df['feature2']
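A sketch of applying the same mappings to a test frame; categories unseen in train come back as NaN from map, so filling with the global mean is shown here as one possible (assumed) fallback:
import pandas as pd
train_df = pd.DataFrame({'category': ['a', 'a', 'b'], 'target': [1, 0, 1]})
test_df = pd.DataFrame({'category': ['a', 'c']})   # 'c' never appears in train
target_mean = train_df.groupby('category')['target'].mean()
global_mean = train_df['target'].mean()
test_df['category_encoded'] = test_df['category'].map(target_mean).fillna(global_mean)
print(test_df)  # 'a' -> 0.5, unseen 'c' -> the global mean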
These pandas operations form the backbone of most data science workflows and Kaggle
competitions.