codefp1

The document outlines a process for using the Kaggle API to download and prepare the Flickr8k audio-caption dataset for a multimodal deep learning project. It includes steps for loading audio and image data, defining a custom dataset class, and creating a neural network model that combines image and audio processing. The document also details the training and evaluation procedures, including data splitting, loss calculation, and saving the model.

*1

from google.colab import files

# Upload kaggle.json

files.upload()

# Move kaggle.json to the proper directory

!mkdir -p ~/.kaggle

!mv kaggle.json ~/.kaggle/

!chmod 600 ~/.kaggle/kaggle.json

# Install Kaggle API

!pip install kaggle

*2

# Download dataset

!kaggle datasets download -d warcoder/flickr-8k-audio-caption-corpus

# Unzip dataset

!unzip flickr-8k-audio-caption-corpus.zip -d /content/flickr8k_audio

*3

!kaggle datasets download -d adityajn105/flickr8k

!unzip flickr8k.zip -d /content/flickr8k_images
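The nested directory names used in the later cells (e.g. flickr_audio/flickr_audio/wavs) depend on how the archives unpack; a quick listing such as the following sketch can confirm the actual layout before hard-coding paths (the paths shown are assumptions based on the unzip targets above):

import os

print(os.listdir("/content/flickr8k_audio"))
print(os.listdir("/content/flickr8k_images"))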

*4

import os

from PIL import Image

import librosa

audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
# Example: Load an audio file

audio_path = os.path.join(audio_dir, "1000268201_693b08cb0e_0.wav")  # Replace with an actual filename from the wavs directory

audio, sr = librosa.load(audio_path, sr=None)

print(f"Loaded audio with shape: {audio.shape}, Sample Rate: {sr}")

*5

import os

from PIL import Image

import librosa

from IPython.display import display, Audio

# Define paths

audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"

image_dir = "/content/flickr8k_images/Images"

# Load and display an example image

image_path = os.path.join(image_dir, "1000268201_693b08cb0e.jpg") # Corrected file path

image = Image.open(image_path)

display(image)

# Load and play an example audio file

audio_path = os.path.join(audio_dir, "1000268201_693b08cb0e_0.wav") # Corrected file path

audio, sr = librosa.load(audio_path, sr=None)

print(f"Audio Loaded: Shape={audio.shape}, Sampling Rate={sr}")

# Play the audio in Colab

display(Audio(audio_path)) # Corrected method to play the audio

*6

import pandas as pd
from torch.utils.data import Dataset

class Flickr8kAudioImageDataset(Dataset):
    def __init__(self, mapping_file, image_dir, audio_dir, transform=None):
        self.data = pd.read_csv(mapping_file)
        self.image_dir = image_dir
        self.audio_dir = audio_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_path = os.path.join(self.image_dir, row["image"])
        audio_path = os.path.join(self.audio_dir, row["audio"])
        caption = row["caption"]

        # Load image
        image = Image.open(image_path)
        if self.transform:
            image = self.transform(image)

        # Load audio
        audio, sr = librosa.load(audio_path, sr=None)

        return image, audio, caption
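This class expects a mapping CSV with image, audio, and caption columns, which neither download provides directly. A minimal sketch for building one is shown below; it assumes the image dataset ships a captions.txt with image and caption columns and that each audio file is named <image_base>_<k>.wav (both are assumptions about the downloads, so verify against the extracted files):

import os
import pandas as pd

captions = pd.read_csv("/content/flickr8k_images/captions.txt")  # assumed columns: image, caption
audio_files = os.listdir("/content/flickr8k_audio/flickr_audio/flickr_audio/wavs")

rows = []
for audio in audio_files:
    base, _, idx = os.path.splitext(audio)[0].rpartition("_")  # e.g. "1000268201_693b08cb0e", "0"
    image_name = base + ".jpg"
    matched = captions[captions["image"] == image_name]
    if not matched.empty:
        rows.append({"image": image_name, "audio": audio, "caption": matched.iloc[0]["caption"]})

pd.DataFrame(rows).to_csv("mapping.csv", index=False)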

*7

import os
image_dir = "/content/flickr8k_images/Images"

audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"

image_filenames = os.listdir(image_dir)

audio_filenames = os.listdir(audio_dir)

print(f"Number of images: {len(image_filenames)}")

print(f"Number of audio files: {len(audio_filenames)}")

# Optional: Print the first few files to see which ones exist

print(f"First few image filenames: {image_filenames[:5]}")

print(f"First few audio filenames: {audio_filenames[:5]}")

*8

import os

import torch

from torch.utils.data import Dataset, DataLoader

from PIL import Image

import librosa

from torchvision import transforms

import numpy as np

import matplotlib.pyplot as plt

class Flickr8kAudioImageDataset(Dataset):
    def __init__(self, image_dir, audio_dir, transform=None, audio_length=22050):
        self.image_dir = image_dir
        self.audio_dir = audio_dir
        self.transform = transform
        self.audio_length = audio_length  # Target length for audio (e.g., 22050 samples for 1 second at 22.05 kHz)

        # Get image and audio filenames
        self.image_filenames = os.listdir(image_dir)
        self.audio_filenames = os.listdir(audio_dir)

        # Sort the image filenames by their base name (strip extensions)
        image_base_filenames = sorted([os.path.splitext(f)[0] for f in self.image_filenames])

        # Sort the audio filenames by their base name (strip extensions)
        audio_base_filenames = sorted([os.path.splitext(f)[0] for f in self.audio_filenames])

        # Map each image to its first matching audio file
        # (note: this scan is quadratic in the number of files and can be slow on the full corpus)
        image_to_audio_map = {}
        for image_base in image_base_filenames:
            corresponding_audio = [audio for audio in audio_base_filenames if audio.startswith(image_base)]
            if corresponding_audio:
                image_to_audio_map[image_base] = corresponding_audio[0]  # Keep the first matching audio file

        # Create lists of matching image and audio filenames
        self.image_filenames = [image_base + ".jpg" for image_base in image_to_audio_map.keys()]
        self.audio_filenames = [audio + ".wav" for audio in image_to_audio_map.values()]

        # Ensure there is at least one matching pair
        assert len(self.image_filenames) > 0, "No matching image and audio files found"

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, idx):
        # Get the image and audio file names
        image_filename = self.image_filenames[idx]
        audio_filename = self.audio_filenames[idx]

        # Load the image (convert to RGB so every sample has three channels)
        image_path = os.path.join(self.image_dir, image_filename)
        image = Image.open(image_path).convert("RGB")

        # Apply the supplied transform, or a default resize + tensor conversion
        if self.transform:
            image = self.transform(image)
        else:
            transform = transforms.Compose([transforms.Resize((256, 256)), transforms.ToTensor()])
            image = transform(image)

        # Load the audio
        audio_path = os.path.join(self.audio_dir, audio_filename)
        audio, sr = librosa.load(audio_path, sr=None)

        # Pad or truncate the audio to the target length
        if len(audio) < self.audio_length:
            audio = np.pad(audio, (0, self.audio_length - len(audio)), mode='constant')
        else:
            audio = audio[:self.audio_length]

        # Convert audio to tensor
        audio = torch.tensor(audio, dtype=torch.float32)

        # Return image, audio, and the filename (for potential captions or other info)
        return image, audio, audio_filename

# Define paths

audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"

image_dir = "/content/flickr8k_images/Images"

# Initialize dataset and DataLoader

dataset = Flickr8kAudioImageDataset(
    image_dir=image_dir,
    audio_dir=audio_dir,
    transform=None,       # Add image transformations if needed
    audio_length=22050    # Set audio length (e.g., 1 second at 22.05 kHz)
)

dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Iterate over the DataLoader
for batch in dataloader:
    images, audios, filenames = batch

    print("Images shape:", images[0].size())  # Shape of one image (should be [3, 256, 256])
    print("Audios shape:", audios.shape)      # Audio batch shape (should be [batch_size, 22050])
    print("Filenames:", filenames[:5])        # Show first few filenames

    # Visualize the first image and its corresponding audio waveform
    plt.figure(figsize=(12, 6))

    # Plot image
    plt.subplot(1, 2, 1)
    plt.imshow(images[0].permute(1, 2, 0))  # Convert from [C, H, W] to [H, W, C] for plotting
    plt.title(f"Image: {filenames[0]}")

    # Plot audio waveform
    plt.subplot(1, 2, 2)
    plt.plot(audios[0].numpy())  # Convert tensor to numpy for plotting
    plt.title(f"Audio waveform: {filenames[0]}")

    plt.show()

    break  # Only display the first batch; remove break to loop over all batches
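If image transformations are wanted, a typical torchvision pipeline can be passed through the transform argument instead of relying on the default resize. This is only a sketch; the normalization statistics below are the standard ImageNet values, which is an assumption rather than something tuned for Flickr8k:

from torchvision import transforms

image_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

dataset = Flickr8kAudioImageDataset(
    image_dir=image_dir,
    audio_dir=audio_dir,
    transform=image_transform,
    audio_length=22050,
)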

*9

import torch

import torch.nn as nn

import torch.optim as optim

# Define a simple CNN for image processing

class ImageModel(nn.Module):
    def __init__(self):
        super(ImageModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # Use adaptive pooling to handle varying input sizes
        self.adaptive_pool = nn.AdaptiveAvgPool2d((8, 8))
        self.fc1 = nn.Linear(32 * 8 * 8, 512)
        self.fc2 = nn.Linear(512, 128)

    def forward(self, x):
        x = self.pool(nn.ReLU()(self.conv1(x)))
        x = self.adaptive_pool(x)    # Adaptive pooling to handle different input sizes
        x = x.view(-1, 32 * 8 * 8)   # Flatten the output
        x = nn.ReLU()(self.fc1(x))
        x = self.fc2(x)
        return x

# Define an RNN for audio processing

class AudioModel(nn.Module):
    def __init__(self, input_size=22050, hidden_size=128):
        super(AudioModel, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 128)

    def forward(self, x):
        # The LSTM expects input of shape (batch_size, seq_len, input_size).
        # The dataset yields raw waveforms of shape (batch_size, input_size),
        # so add a sequence dimension of length 1 before the recurrent layer.
        if x.dim() == 2:
            x = x.unsqueeze(1)       # (batch_size, 1, input_size)
        out, _ = self.rnn(x)         # out: (batch_size, seq_len, hidden_size)
        out = out[:, -1, :]          # Take the last timestep
        out = self.fc(out)           # Pass through the fully connected layer
        return out

# Update the MultimodalModel class

class MultimodalModel(nn.Module):
    def __init__(self, num_classes):
        super(MultimodalModel, self).__init__()
        self.image_model = ImageModel()
        self.audio_model = AudioModel()
        self.fc = nn.Linear(128 + 128, num_classes)  # For multiclass classification

    def forward(self, image, audio):
        image_features = self.image_model(image)
        audio_features = self.audio_model(audio)
        combined = torch.cat((image_features, audio_features), dim=1)  # Concatenate image and audio features
        output = self.fc(combined)
        return output

# Loss for multiclass classification: CrossEntropyLoss expects integer class indices as targets
criterion = nn.CrossEntropyLoss()

# Initialize the model (num_classes is a placeholder here; set it to the number of classes in your task)
num_classes = 10
model = MultimodalModel(num_classes)

optimizer = optim.Adam(model.parameters(), lr=0.001)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop (simplified)
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for images, audios, filenames in dataloader:
        images, audios = images.to(device), audios.to(device)  # Send data to GPU if available

        optimizer.zero_grad()

        # Forward pass
        outputs = model(images, audios)

        # Calculate loss: the all-zero labels below are placeholders only;
        # replace them with real class indices derived from your data
        labels = torch.zeros(images.size(0), dtype=torch.long, device=device)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

*10

from torch.utils.data import random_split

# Define the percentage of data to be used for validation (e.g., 20% validation, 80% training)

validation_split = 0.2

dataset_size = len(dataset)

validation_size = int(validation_split * dataset_size)

train_size = dataset_size - validation_size

# Split the dataset

train_dataset, validation_dataset = random_split(dataset, [train_size, validation_size])
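random_split draws a fresh random permutation each run; passing a seeded generator makes the split reproducible (a minor optional tweak, shown here as a sketch):

generator = torch.Generator().manual_seed(42)
train_dataset, validation_dataset = random_split(dataset, [train_size, validation_size], generator=generator)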

*11

# Create DataLoaders

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

validation_dataloader = DataLoader(validation_dataset, batch_size=32, shuffle=False)


*12

def evaluate(model, validation_dataloader, criterion, device):
    model.eval()  # Set model to evaluation mode
    validation_loss = 0.0
    with torch.no_grad():
        for images, audios, filenames in validation_dataloader:
            images, audios = images.to(device), audios.to(device)
            outputs = model(images, audios)
            targets = get_target(filenames)  # Implement target fetching logic based on filenames
            loss = criterion(outputs, targets.to(device))
            validation_loss += loss.item()

    avg_validation_loss = validation_loss / len(validation_dataloader)
    print(f"Validation Loss: {avg_validation_loss:.4f}")

*13

torch.save(model.state_dict(), 'best_model.pth')
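The saved state dict can be restored later by rebuilding the model with the same architecture and loading the weights (standard PyTorch usage; num_classes must match the value used at training time):

model = MultimodalModel(num_classes)
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.to(device)
model.eval()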

*14

def __getitem__(self, idx):
    image = Image.open(self.image_paths[idx]).convert("RGB")  # Load image
    label = self.labels[idx]  # Get the corresponding label
    if self.transform:
        image = self.transform(image)  # Apply transformations
    return image, label  # Ensure only two values are returned

*15

data = next(iter(train_dataloader))

print(len(data)) # Number of elements in the returned tuple

print(type(data))  # Check the type (the default collate typically returns a list)


*16

images, labels, additional_info = next(iter(train_dataloader))

*17

print(labels.shape)

*18

import matplotlib.pyplot as plt

# Get a batch of images and labels (audio)

images, labels, filenames = next(iter(train_dataloader))

# Get the first audio sample (labels[0] is the audio)

audio = labels[0].cpu().numpy() # Convert the tensor to a numpy array

# Plot the audio waveform

plt.figure(figsize=(10, 4))

plt.plot(audio)

plt.title(f"Audio Waveform of Sample 0")

plt.xlabel("Time (samples)")

plt.ylabel("Amplitude")

plt.show()

*19

# Debugging label extraction

for filename in filenames[:5]:
    print(f"Filename: {filename}, Extracted Label: {get_labels_from_filenames([filename])}")

*20

# Check predicted and actual labels


_, predicted = torch.max(outputs, 1)

print(f"Predicted: {predicted}, Actual: {labels}")

*21

# Visualize or print out image and label pairs

for i in range(5):
    print(f"Image {i}: {images[i].shape}, Label: {labels[i]}")

*22

def get_labels_from_filenames(filenames):
    labels = []
    for filename in filenames:
        # Example: assuming the label is the first part of the filename.
        # For instance, if filenames are like 'class1_img_1.jpg', extract 'class1'.
        # You can customize this depending on your dataset.
        label = filename.split('_')[0]  # Take the first part before the underscore

        # Convert the label to an integer (if needed; here assuming class labels are numeric).
        # If labels are categorical, you may want to convert them to a class index.
        label = int(label[5:])  # Assuming labels are numeric after 'class' (e.g., 'class1', 'class2')

        labels.append(label)

    # Convert labels to a tensor (as long type)
    labels = torch.tensor(labels, dtype=torch.long)
    return labels

*23

correct = 0
total = 0

model.eval()  # Evaluation mode: disables dropout and batch-norm updates
with torch.no_grad():
    for images, audios, filenames in validation_dataloader:  # Assuming filenames are part of the batch
        images = images.to(device)
        audios = audios.to(device)

        # Extract labels from filenames (adjust this part based on how your dataset is structured)
        labels = get_labels_from_filenames(filenames)
        labels = labels.to(device)  # Ensure labels are moved to the same device

        # Forward pass
        outputs = model(images, audios)
        _, predicted = torch.max(outputs, 1)

        # Accumulate total and correct predictions
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Calculate and print accuracy
accuracy = 100 * correct / total
print(f'Validation Accuracy: {accuracy}%')
