Skip to content

Commit 476cf8c

Browse files
committed
feat: #30 parse attachments
1 parent 1e30945 commit 476cf8c

File tree

2 files changed

+65
-1
lines changed

2 files changed

+65
-1
lines changed

app/Confluence.php

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ public function parsePageHtml(string $filename, string $spaceName): array
3333
];
3434
}
3535

36-
public function htmlFile2Markdown(string $filename)
36+
public function htmlFile2Markdown(string $filename): string
3737
{
3838
libxml_use_internal_errors(true);
3939
$this->document->loadHTMLFile($filename);
@@ -42,6 +42,39 @@ public function htmlFile2Markdown(string $filename)
4242
return $this->htmlConverter->convert($html);
4343
}
4444

45+
/**
 * Collect the files listed in the "attachments" section of an exported HTML page.
 *
 * The section is the first <div> inside #content whose class list contains
 * "pageSection" and whose first <h2> has id="attachments". Every link inside
 * that section is returned, except links whose href already appears in
 * $markdownContent as an embedded image ("![](<href>").
 *
 * @param string $htmlFilename    path of the HTML file to inspect
 * @param string $markdownContent markdown to check for embedded images; with the
 *                                default empty string no link is filtered out
 * @return array<string, string> map of link href => link text; empty when the page
 *                               has no #content element or no attachments section
 */
public function parseAttachments($htmlFilename, $markdownContent = ''): array
{
    libxml_use_internal_errors(true);
    $this->document->loadHTMLFile($htmlFilename);

    // A page without a #content element cannot carry an attachments section.
    $contentElement = $this->document->getElementById('content');
    if ($contentElement === null) {
        return [];
    }

    // Track the match in its own variable: the loop variable still holds the
    // last <div> visited when nothing matches, so it cannot signal "not found".
    $attachmentSection = null;
    foreach ($contentElement->getElementsByTagName('div') as $divElement) {
        // Compare class tokens so that multi-class values such as
        // "pageSection group" are recognised as well as a bare "pageSection".
        $classNames = preg_split('/\s+/', $divElement->getAttribute('class'), -1, PREG_SPLIT_NO_EMPTY) ?: [];
        if (!in_array('pageSection', $classNames, true)) {
            continue;
        }
        // getAttribute('id') works on every PHP version; the DOMElement::$id
        // property is only declared from PHP 8.3 onwards.
        $h2Element = $divElement->getElementsByTagName('h2')->item(0);
        if ($h2Element !== null && $h2Element->getAttribute('id') === 'attachments') {
            $attachmentSection = $divElement;
            break;
        }
    }
    if ($attachmentSection === null) {
        return [];
    }

    $attachments = [];
    foreach ($attachmentSection->getElementsByTagName('a') as $aElement) {
        $filePath = $aElement->getAttribute('href');
        // Skip files that the markdown already embeds as an image.
        if (!str_contains($markdownContent, "![]({$filePath}")) {
            $attachments[$filePath] = $aElement->nodeValue;
        }
    }
    return $attachments;
}
77+
4578
/**
4679
* @return array ['tree' => "array", 'titles' => "array"]
4780
*/

tests/Unit/ConfluenceTest.php

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,4 +67,35 @@ public function testParsePagesTree()
6767
]
6868
], $tree);
6969
}
70+
71+
/**
 * Passing the page's own converted markdown to parseAttachments() is expected
 * to yield an empty attachment list for the image-demo fixture.
 */
public function testParseAttachmentsIgnoreImages()
{
    $pagePath = $this->dataDir . 'confluence/space1/image-demo_65619.html';
    $parser = new Confluence();
    $pageMarkdown = $parser->htmlFile2Markdown($pagePath);

    $this->assertEquals([], $parser->parseAttachments($pagePath, $pageMarkdown));
}
79+
80+
/**
 * Without markdown content, parseAttachments() is expected to return both
 * image files of the image-demo fixture, keyed by their href.
 */
public function testParseAttachmentsNoIgnoreImages()
{
    $expected = [
        'attachments/65619/65623.png' => 'github-ubuntu-16.04.png',
        'attachments/65619/65624.png' => 'coding-logo.png',
    ];
    $parser = new Confluence();
    $pagePath = $this->dataDir . 'confluence/space1/image-demo_65619.html';

    $this->assertEquals($expected, $parser->parseAttachments($pagePath));
}
90+
91+
/**
 * For the attachment-demo fixture, parseAttachments() is expected to return
 * the single text file even when the page's markdown is supplied.
 */
public function testParseAttachmentsSuccess()
{
    $pagePath = $this->dataDir . 'confluence/space1/attachment-demo_65615.html';
    $parser = new Confluence();
    $pageMarkdown = $parser->htmlFile2Markdown($pagePath);
    $expected = [
        'attachments/65615/65616.txt' => 'Lorem Ipsum 2021-06-08T10_55_27+0800.txt'
    ];

    $this->assertEquals($expected, $parser->parseAttachments($pagePath, $pageMarkdown));
}
70101
}

0 commit comments

Comments
 (0)