update parsing with Brad-Python's patch to strip hyperlinks down to bare text

q210 · q210 · commit 336ed9fed27f · 2017-09-22T15:46:16.000+03:00
diff --git a/docx/oxml/__init__.py b/docx/oxml/__init__.py
@@ -10,6 +10,7 @@
 from lxml import etree
 
 from .ns import NamespacePrefixedTag, nsmap
+from ..compat import is_string
 
 
 # configure XML parser
@@ -18,14 +19,38 @@
 oxml_parser.set_element_class_lookup(element_class_lookup)
 
 
+def remove_hyperlink_tags(xml):
+    """
+    Return *xml* with every ``w:hyperlink`` start, end and empty-element tag
+    removed, so the runs they wrapped (and their text) survive parsing. *xml*
+    may be a byte string or a unicode string; the result has the same kind.
+    Byte strings are matched as bytes rather than decoded, so input that is
+    not UTF-8 is never rejected here. Workaround proposed by @Brad-Python in
+    https://github.com/python-openxml/python-docx/issues/85#issuecomment-62293768
+    """
+    import re
+
+    # The lookahead pins the element name, so an element whose name merely
+    # starts with "w:hyperlink" keeps its tags and the XML stays balanced.
+    pattern = r'</?w:hyperlink(?=[ \t\r\n/>])[^>]*>'
+
+    if isinstance(xml, (bytes, bytearray)):
+        # bytes is str on Python 2, which takes this branch unchanged
+        return re.sub(pattern.encode('ascii'), b'', xml)
+
+    # unicode text: the pattern is plain ASCII, so it applies directly
+    return re.sub(pattern, '', xml)
+
+
 def parse_xml(xml):
     """
     Return root lxml element obtained by parsing XML character string in
     *xml*, which can be either a Python 2.x string or unicode. The custom
     parser is used, so custom element classes are produced for elements in
     *xml* that have them.
     """
-    root_element = etree.fromstring(xml, oxml_parser)
+    # Hyperlink tags are stripped first so the text they wrap is not lost.
+    root_element = etree.fromstring(remove_hyperlink_tags(xml), oxml_parser)
     return root_element