Description
Bug summary
When using matplotlib to generate spectrogram visualizations of audio files, if the audio file is too long, the spectrogram portion of the plot becomes blank towards the latter half, while the waveform continues to be displayed properly.
Code for reproduction
import torchaudio
import torchaudio.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import scipy.interpolate
import librosa.display
import matplotlib.gridspec as gridspec
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
def compute_spectrogram_per_channel(channel_waveform, sample_rate):
# Create transformer to convert waveform to spectrogram
spectrogram_transform = transforms.Spectrogram(n_fft=2048, hop_length=256)
# Apply the transformer
spectrogram = spectrogram_transform(channel_waveform.unsqueeze(0))
amplitude_spectrogram = np.sqrt(spectrogram)
db_spectrogram = librosa.amplitude_to_db(amplitude_spectrogram[0].numpy(), ref=np.max)
db_spectrogram = np.clip(db_spectrogram, a_min=None, a_max=0) # clip to 0dB
# Set new log scale
num_freqs, num_frames = db_spectrogram.shape
min_freq = 1 # human hearing range in Hz
max_freq = sample_rate / 2 # Nyquist frequency
frequencies = np.linspace(min_freq, max_freq, num=num_freqs)
# Create a new scale
log_scale = np.log10(frequencies)
linear_scale = np.linspace(np.log10(min_freq), np.log10(max_freq), num=num_freqs)
scale_ratio = 0.75 # adjust this parameter to control the ratio of log scale and linear scale
new_scale = scale_ratio * log_scale + (1 - scale_ratio) * linear_scale
new_db_spectrogram = np.empty_like(db_spectrogram)
# Apply interpolation for each frame
for frame in tqdm(range(num_frames)):
interpolator = scipy.interpolate.interp1d(log_scale, db_spectrogram[:, frame])
new_db_spectrogram[:, frame] = interpolator(new_scale)
return channel_waveform.t().numpy(), db_spectrogram, new_db_spectrogram
def plot_spectrogram(waveforms, new_db_spectrograms, audio_duration):
num_channels = len(waveforms)
# Create a plot, set the background to black and adjust the size based on audio duration
plt.figure(figsize=(max(audio_duration * 2, 10), 8), facecolor="black")
# Dynamically create subplots based on the number of channels
gs = gridspec.GridSpec(2 * num_channels, 1, height_ratios=[1] * num_channels + [5] * num_channels)
# Loop through each channel to plot the waveform and spectrogram
for i in range(num_channels):
# Plot the waveform
ax_waveform = plt.subplot(gs[i])
ax_waveform.plot(waveforms[i], color="#4BF2A7")
nonzero_indices = np.where(waveforms[i] != 0)[0] # Find indices of non-zero values
ax_waveform.set_xlim(nonzero_indices[0], nonzero_indices[-1]) # Set x limit to range of non-zero values
ax_waveform.axis("off")
# Plot the spectrogram
ax_spectrogram = plt.subplot(gs[i + num_channels])
ax_spectrogram.imshow(new_db_spectrograms[i], origin="lower", aspect="auto")
ax_spectrogram.axis("off")
plt.subplots_adjust(left=0, right=1, top=1, bottom=0, wspace=0, hspace=0) # Adjust to remove borders and gaps
plt.savefig("spectrogram.jpeg", facecolor="black", bbox_inches="tight", pad_inches=0) # Save the figure
def main(file):
# Load the audio file
waveform, sample_rate = torchaudio.load(file)
num_channels = waveform.shape[0]
audio_duration = waveform.shape[1] / sample_rate # Calculate audio duration
with ProcessPoolExecutor() as executor:
futures = {
executor.submit(compute_spectrogram_per_channel, waveform[ch], sample_rate): ch
for ch in range(num_channels)
}
waveforms = [None] * num_channels
db_spectrograms = [None] * num_channels
new_db_spectrograms = [None] * num_channels
for future in as_completed(futures):
ch = futures[future]
waveforms[ch], db_spectrograms[ch], new_db_spectrograms[ch] = future.result()
# Adjust figure width based on audio duration
plot_spectrogram(waveforms, new_db_spectrograms, audio_duration)
# Run the main function
if __name__ == "__main__":
file = "long.wav"
main(file)
Actual outcome
When visualizing an audio file of significant length, the spectrogram does not render correctly for the entire duration of the audio. The latter part of the spectrogram is blank and contains no information, which doesn't match with the waveform visualization, which continues to display normally.
Expected outcome
The spectrogram should be consistently rendered for the entire duration of the audio file, matching the waveform visualization. Regardless of the length of the audio file, the spectrogram should be complete and not become blank at any point.
Additional information
- Other libraries involved: torchaudio, librosa, numpy, scipy
This issue occurs regardless of the audio file format or the specifics of the audio content. It seems directly related to the length of the audio file.
Operating system
Windows & Ubuntu
Matplotlib Version
3.7.2
Matplotlib Backend
TkAgg
Python version
Python 3.10.10
Jupyter version
No response
Installation
pip