@@ -33,7 +33,7 @@ public function parsePageHtml(string $filename, string $spaceName): array
33
33
];
34
34
}
35
35
36
- public function htmlFile2Markdown (string $ filename )
36
+ public function htmlFile2Markdown (string $ filename ): string
37
37
{
38
38
libxml_use_internal_errors (true );
39
39
$ this ->document ->loadHTMLFile ($ filename );
@@ -42,6 +42,39 @@ public function htmlFile2Markdown(string $filename)
42
42
return $ this ->htmlConverter ->convert ($ html );
43
43
}
44
44
45
/**
 * Parse the attachments listed in an HTML page.
 *
 * Loads the file into $this->document, looks under the element with id
 * "content" for the div.pageSection whose first <h2> has id "attachments",
 * and collects every link inside that section. If markdown is not empty,
 * attachments already embedded in it as images are ignored.
 *
 * @param string $htmlFilename    path of the HTML file to load
 * @param string $markdownContent markdown of the same page; '' keeps every attachment
 * @return array map of link href => link text; [] when the page has no attachments section
 */
public function parseAttachments($htmlFilename, $markdownContent = ''): array
{
    libxml_use_internal_errors(true);
    $this->document->loadHTMLFile($htmlFilename);

    // getElementById() returns null when the page has no #content element.
    $contentElement = $this->document->getElementById('content');
    if ($contentElement === null) {
        return [];
    }

    // Track the match in its own variable: the loop variable still holds the
    // last div after the loop, so it cannot signal "section not found".
    $attachmentsSection = null;
    foreach ($contentElement->getElementsByTagName('div') as $divElement) {
        if ($divElement->getAttribute('class') != 'pageSection') {
            continue;
        }
        $h2Element = $divElement->getElementsByTagName('h2')->item(0);
        // getAttribute() works on every PHP version, unlike the ->id property.
        if ($h2Element !== null && $h2Element->getAttribute('id') == 'attachments') {
            $attachmentsSection = $divElement;
            break;
        }
    }
    if ($attachmentsSection === null) {
        return [];
    }

    $attachments = [];
    foreach ($attachmentsSection->getElementsByTagName('a') as $aElement) {
        $filePath = $aElement->getAttribute('href');
        $filename = $aElement->nodeValue;
        // NOTE(review): the original needle is unreadable in this view; an
        // empty-alt markdown image reference to the file is assumed - confirm.
        $imageReference = '![](' . $filePath . ')';
        if (!str_contains($markdownContent, $imageReference)) {
            $attachments[$filePath] = $filename;
        }
    }
    return $attachments;
}
77
+
45
78
/**
46
79
* @return array ['tree' => "array", 'titles' => "array"]
47
80
*/
0 commit comments