Skip to content

Commit 336ed9f

Browse files
committed
update parsing with Brad-Python's patch to strip hyperlinks down to bare text
1 parent e784a73 commit 336ed9f

File tree

1 file changed

+26
-1
lines changed

1 file changed

+26
-1
lines changed

docx/oxml/__init__.py

Lines changed: 26 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
from lxml import etree
1111

1212
from .ns import NamespacePrefixedTag, nsmap
13+
from ..compat import is_string
1314

1415

1516
# configure XML parser
@@ -18,14 +19,38 @@
1819
oxml_parser.set_element_class_lookup(element_class_lookup)
1920

2021

22+
def remove_hyperlink_tags(xml):
    """
    Return *xml* with every ``w:hyperlink`` start, end and empty-element tag
    removed, so the content the hyperlink wrapped stays in place as plain
    child markup and remains available after parsing.

    *xml* may be a text string or a byte string; the return value has the
    same kind. Byte input is processed directly as bytes (no decode/encode
    round trip), so a payload that is not valid UTF-8 is passed through
    instead of raising ``UnicodeDecodeError``.

    Only tags whose name is exactly ``w:hyperlink`` are removed; an element
    whose name merely starts with that text is left alone. A ``>`` character
    inside a quoted attribute value does not end the tag early.

    This workaround for the problem of missing text encased in hyperlinks
    was proposed by @Brad-Python in
    https://github.com/python-openxml/python-docx/issues/85#issuecomment-62293768
    """
    import re

    # Anything up to the closing '>' of the tag: either a character that
    # cannot end the tag or open a quote, or a complete quoted attribute
    # value (which is allowed to contain '>').
    attr_run = r'''(?:[^>"']|"[^"]*"|'[^']*')*'''
    # The look-ahead requires the tag name to end right after "hyperlink",
    # so names that only share the prefix are not matched. The optional '/'
    # makes the same pattern cover both start and end tags.
    pattern = r'</?w:hyperlink(?=[\s/>])' + attr_run + '>'

    if isinstance(xml, (bytes, bytearray)):
        # Match bytes with a bytes pattern; the pattern is pure ASCII.
        return re.sub(pattern.encode('ascii'), b'', bytes(xml))

    return re.sub(pattern, '', xml)
43+
44+
2145
def parse_xml(xml):
    """
    Parse the XML in *xml* and return its root lxml element. *xml* can be
    either a Python 2.x string or unicode. Hyperlink tags are first removed
    with `remove_hyperlink_tags()`, then the result is parsed with the
    custom parser, so elements in *xml* that have a custom element class
    are produced as instances of that class.
    """
    return etree.fromstring(remove_hyperlink_tags(xml), oxml_parser)
3055

3156

0 commit comments

Comments
 (0)