File tree 1 file changed +26
-1
lines changed
1 file changed +26
-1
lines changed Original file line number Diff line number Diff line change 10
10
from lxml import etree
11
11
12
12
from .ns import NamespacePrefixedTag , nsmap
13
+ from ..compat import is_string
13
14
14
15
15
16
# configure XML parser
18
19
oxml_parser .set_element_class_lookup (element_class_lookup )
19
20
20
21
22
def remove_hyperlink_tags(xml):
    """
    Return *xml* with every `<w:hyperlink>` start-tag and end-tag removed.

    Only the tags themselves are deleted; whatever they enclosed is left in
    place so it is still available after parsing. This workaround for the
    problem of missing text encased in hyperlinks was proposed by
    @Brad-Python in
    https://github.com/python-openxml/python-docx/issues/85#issuecomment-62293768

    *xml* can be a text string or a bytestring and the return value is of the
    matching kind (text for text, bytes otherwise). A bytestring is searched
    as bytes rather than being decoded and re-encoded, so bytes that are not
    valid UTF-8 pass through instead of raising `UnicodeDecodeError`.
    """
    import re

    # -- `type(u"")` is `str` on Python 3 and `unicode` on Python 2, so this
    # -- identifies text on both without needing a compatibility helper
    is_text = isinstance(xml, type(u""))

    # -- `/?` covers both start- and end-tags in a single pass; `\b` keeps the
    # -- pattern from matching a longer tag name that merely begins with
    # -- "w:hyperlink"; `[^>]*` swallows attributes, a self-closing slash, or
    # -- stray whitespace before the closing ">"
    if is_text:
        return re.sub(r"</?w:hyperlink\b[^>]*>", "", xml)
    return re.sub(br"</?w:hyperlink\b[^>]*>", b"", xml)
43
+
44
+
21
45
def parse_xml(xml):
    """
    Parse the XML in *xml* and return the resulting root lxml element.

    *xml* can be either a Python 2.x string or unicode. It is first passed
    through `remove_hyperlink_tags()` and the result is parsed with the
    custom `oxml_parser`, so custom element classes are produced for the
    elements in *xml* that have them.
    """
    return etree.fromstring(remove_hyperlink_tags(xml), oxml_parser)
30
55
31
56
You can’t perform that action at this time.
0 commit comments