Merge pull request MIT-LCP#397 into main

Benjamin Moody · Benjamin Moody · commit 6318b12f6ec2 · 2022-06-24T15:50:17.000-04:00
_rd_compressed_file fails when the total number of samples to read
(summed across all channels) exceeds 2**24 and the number of channels
isn't a power of two.

This is a bug in libsndfile, and although it should be fixed there and
perhaps could be worked around in python-soundfile, it's most
expedient to work around it here for now.
diff --git a/sample-data/flac_3_constant.dat b/sample-data/flac_3_constant.dat
diff --git a/sample-data/flac_3_constant.hea b/sample-data/flac_3_constant.hea
@@ -0,0 +1,4 @@
+flac_3_constant 3 250 5600000
+flac_3_constant.dat 508 200 8 0 25 15104 0 col 0
+flac_3_constant.dat 508 200 8 0 50 30208 0 col 1
+flac_3_constant.dat 508 200 8 0 75 -20224 0 col 2
diff --git a/tests/test_record.py b/tests/test_record.py
@@ -249,6 +249,28 @@ def test_read_flac(self):
                         f"Mismatch in {name}",
                     )
 
+    def test_read_flac_longduration(self):
+        """
+        Three signals multiplexed in a FLAC file, over 2**24 samples.
+
+        Input file created with:
+            yes 25 50 75 | head -5600000 |
+            wrsamp -O 508 -o flac_3_constant 0 1 2
+
+        The total number of samples across the three channels
+        (3 * 5600000 = 16800000) exceeds 2**24 = 16777216.  A bug in
+        libsndfile breaks reads of more than 2**24 total samples at
+        a time when the number of channels is not a power of two, so
+        check that every frame reads back as [0.125, 0.25, 0.375].
+        """
+        record = wfdb.rdrecord("sample-data/flac_3_constant")
+        sig_target = np.repeat(
+            np.array([[0.125, 0.25, 0.375]], dtype="float64"),
+            5600000,
+            axis=0,
+        )
+        np.testing.assert_array_equal(record.p_signal, sig_target)
+
     # ------------------ 2. Special format records ------------------ #
 
     def test_2a(self):
diff --git a/wfdb/io/_signal.py b/wfdb/io/_signal.py
@@ -1875,7 +1875,23 @@ def _rd_compressed_file(
             start_samp = start_frame * samps_per_frame[0]
             end_samp = end_frame * samps_per_frame[0]
             sf.seek(start_samp + sample_offset)
-            sig_data = sf.read(end_samp - start_samp, dtype=read_dtype)
+
+            # We could do this:
+            #  sig_data = sf.read(end_samp - start_samp, dtype=read_dtype)
+            # However, sf.read fails for huge blocks (over 2**24 total
+            # samples) due to a bug in libsndfile:
+            # https://github.com/libsndfile/libsndfile/issues/431
+            # So read the data in chunks instead.
+            n_samp = end_samp - start_samp
+            sig_data = np.empty((n_samp, n_sig), dtype=read_dtype)
+            CHUNK_SIZE = 1024 * 1024
+            for chunk_start in range(0, n_samp, CHUNK_SIZE):
+                chunk_end = chunk_start + CHUNK_SIZE
+                chunk_data = sf.read(out=sig_data[chunk_start:chunk_end])
+                samples_read = chunk_data.shape[0]
+                if samples_read != CHUNK_SIZE:
+                    sig_data = sig_data[: chunk_start + samples_read]
+                    break
 
             # If we read an 8-bit stream as int16 or a 24-bit stream as
             # int32, soundfile shifts each sample left by 8 bits.  We