Refactor header helper functions and stop parsing comments on non-comment lines (MIT-LCP#393)

cx1111 · web-flow · commit 14248d17e87f · 2022-06-15T18:29:42.000-07:00
diff --git a/wfdb/io/_header.py b/wfdb/io/_header.py
@@ -1,11 +1,11 @@
 import datetime
 import os
 import re
+from typing import List, Tuple
 
 import numpy as np
 import pandas as pd
 
-from wfdb.io import download
 from wfdb.io import _signal
 
 
@@ -872,7 +872,7 @@ def get_sig_name(self):
         return sig_name
 
 
-def wfdb_strptime(time_string):
+def wfdb_strptime(time_string: str) -> datetime.time:
     """
     Given a time string in an acceptable WFDB format, return
     a datetime.time object.
@@ -905,73 +905,45 @@ def wfdb_strptime(time_string):
     return datetime.datetime.strptime(time_string, time_fmt).time()
 
 
-def _read_header_lines(base_record_name, dir_name, pn_dir):
+def parse_header_content(
+    header_content: str,
+) -> Tuple[List[str], List[str]]:
     """
-    Read the lines in a local or remote header file.
+    Parse the text of a header file.
 
     Parameters
     ----------
-    base_record_name : str
-        The base name of the WFDB record to be read, without any file
-        extensions.
-    dir_name : str
-        The local directory location of the header file. This parameter
-        is ignored if `pn_dir` is set.
-    pn_dir : str
-        Option used to stream data from Physionet. The Physionet
-        database directory from which to find the required record files.
-        eg. For record '100' in 'http://physionet.org/content/mitdb'
-        pn_dir='mitdb'.
+    header_content: str
+        The string content of the full header file
 
     Returns
     -------
-    header_lines : list
-        List of strings corresponding to the header lines.
-    comment_lines : list
-        List of strings corresponding to the comment lines.
-
+    header_lines : List[str]
+        A list of all the non-comment lines
+    comment_lines : List[str]
+        A list of all the comment lines
     """
-    file_name = base_record_name + ".hea"
-
-    # Read local file
-    if pn_dir is None:
-        with open(
-            os.path.join(dir_name, file_name), "r", errors="ignore"
-        ) as fp:
-            # Record line followed by signal/segment lines if any
-            header_lines = []
-            # Comment lines
-            comment_lines = []
-            for line in fp:
-                line = line.strip()
-                # Comment line
-                if line.startswith("#"):
-                    comment_lines.append(line)
-                # Non-empty non-comment line = header line.
-                elif line:
-                    # Look for a comment in the line
-                    ci = line.find("#")
-                    if ci > 0:
-                        header_lines.append(line[:ci])
-                        # comment on same line as header line
-                        comment_lines.append(line[ci:])
-                    else:
-                        header_lines.append(line)
-    # Read online header file
-    else:
-        header_lines, comment_lines = download._stream_header(file_name, pn_dir)
+    header_lines, comment_lines = [], []
+    for line in header_content.splitlines():
+        line = line.strip()
+        # Comment line
+        if line.startswith("#"):
+            comment_lines.append(line)
+        # Non-empty non-comment line = header line.
+        elif line:
+            header_lines.append(line)
 
     return header_lines, comment_lines
 
 
-def _parse_record_line(record_line):
+def _parse_record_line(record_line: str) -> dict:
     """
     Extract fields from a record line string into a dictionary.
 
     Parameters
     ----------
     record_line : str
-        The name of the record line that will be used to extact fields.
+        The record line contained in the header file
 
     Returns
     -------
diff --git a/wfdb/io/download.py b/wfdb/io/download.py
@@ -82,9 +82,9 @@ def _remote_file_size(url=None, file_name=None, pn_dir=None):
     return remote_file_size
 
 
-def _stream_header(file_name, pn_dir):
+def _stream_header(file_name: str, pn_dir: str) -> str:
     """
-    Stream the lines of a remote header file.
+    Stream the text of a remote header file.
 
     Parameters
     ----------
@@ -97,10 +97,8 @@ def _stream_header(file_name, pn_dir):
 
     Returns
     -------
-    header_lines : list
-        All of the traditional header lines.
-    comment_lines : list
-        All of the comment header lines.
+    N/A : str
+        The text contained in the header file
 
     """
     # Full url of header location
@@ -110,30 +108,7 @@ def _stream_header(file_name, pn_dir):
     with _url.openurl(url, "rb") as f:
         content = f.read()
 
-    # Get each line as a string
-    filelines = content.decode("iso-8859-1").splitlines()
-
-    # Separate content into header and comment lines
-    header_lines = []
-    comment_lines = []
-
-    for line in filelines:
-        line = str(line.strip())
-        # Comment line
-        if line.startswith("#"):
-            comment_lines.append(line)
-        # Non-empty non-comment line = header line.
-        elif line:
-            # Look for a comment in the line
-            ci = line.find("#")
-            if ci > 0:
-                header_lines.append(line[:ci])
-                # comment on same line as header line
-                comment_lines.append(line[ci:])
-            else:
-                header_lines.append(line)
-
-    return (header_lines, comment_lines)
+    return content.decode("iso-8859-1")
 
 
 def _stream_dat(file_name, pn_dir, byte_count, start_byte, dtype):
diff --git a/wfdb/io/record.py b/wfdb/io/record.py
@@ -1,10 +1,10 @@
 import datetime
 import multiprocessing.dummy
 import posixpath
+import os
 import re
 
 import numpy as np
-import os
 import pandas as pd
 
 from wfdb.io import _header
@@ -639,7 +639,7 @@ def check_read_inputs(
                 "return_res must be one of the following when physical is True: 64, 32, 16"
             )
 
-    def _adjust_datetime(self, sampfrom):
+    def _adjust_datetime(self, sampfrom: int):
         """
         Adjust date and time fields to reflect user input if possible.
 
@@ -1778,16 +1778,28 @@ def rdheader(record_name, pn_dir=None, rd_segments=False):
     dir_name, base_record_name = os.path.split(record_name)
     dir_name = os.path.abspath(dir_name)
 
+    # Construct the download path using the database version
     if (pn_dir is not None) and ("." not in pn_dir):
         dir_list = pn_dir.split("/")
         pn_dir = posixpath.join(
             dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
         )
 
-    # Read the header file. Separate comment and non-comment lines
-    header_lines, comment_lines = _header._read_header_lines(
-        base_record_name, dir_name, pn_dir
-    )
+    # Read the local or remote header file.
+    file_name = f"{base_record_name}.hea"
+    if pn_dir is None:
+        with open(
+            os.path.join(dir_name, file_name),
+            "r",
+            encoding="ascii",
+            errors="ignore",
+        ) as f:
+            header_content = f.read()
+    else:
+        header_content = download._stream_header(file_name, pn_dir)
+
+    # Separate comment and non-comment lines
+    header_lines, comment_lines = _header.parse_header_content(header_content)
 
     # Get fields from record line
     record_fields = _header._parse_record_line(header_lines[0])