Skip to content

Commit e2f6b44

Browse files
Add content filtering package
Co-authored-by: SamMorrowDrums <4811358+SamMorrowDrums@users.noreply.github.com>
1 parent 015b8b6 commit e2f6b44

File tree

2 files changed

+306
-0
lines changed

2 files changed

+306
-0
lines changed

pkg/filtering/content_filter.go

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
package filtering
2+
3+
import (
4+
"regexp"
5+
"strings"
6+
)
7+
8+
var (
	// invisibleCharsRegex matches invisible Unicode characters: zero-width
	// spaces, joiners and directional marks (U+200B-U+200F), line/paragraph
	// separators and bidirectional embedding/override controls
	// (U+2028-U+202E), the word joiner and invisible operators
	// (U+2060-U+2064), the bidirectional isolate controls LRI/RLI/FSI/PDI
	// (U+2066-U+2069), and the zero-width no-break space / BOM (U+FEFF).
	invisibleCharsRegex = regexp.MustCompile(`[\x{200B}-\x{200F}\x{2028}-\x{202E}\x{2060}-\x{2064}\x{2066}-\x{2069}\x{FEFF}]`)

	// htmlCommentsRegex matches an HTML comment, including multi-line ones.
	htmlCommentsRegex = regexp.MustCompile(`<!--[\s\S]*?-->`)

	// HTML elements that could contain hidden content.
	// Go's regexp doesn't support backreferences, so each tag is listed
	// explicitly. HTML tag names are case-insensitive, so these patterns use
	// the (?i) flag; otherwise <SCRIPT>...</SCRIPT> would slip through.
	htmlScriptRegex = regexp.MustCompile(`(?i)<script[^>]*>[\s\S]*?</script>`)
	htmlStyleRegex  = regexp.MustCompile(`(?i)<style[^>]*>[\s\S]*?</style>`)
	htmlIframeRegex = regexp.MustCompile(`(?i)<iframe[^>]*>[\s\S]*?</iframe>`)
	htmlObjectRegex = regexp.MustCompile(`(?i)<object[^>]*>[\s\S]*?</object>`)
	htmlEmbedRegex  = regexp.MustCompile(`(?i)<embed[^>]*>[\s\S]*?</embed>`)
	htmlSvgRegex    = regexp.MustCompile(`(?i)<svg[^>]*>[\s\S]*?</svg>`)
	htmlMathRegex   = regexp.MustCompile(`(?i)<math[^>]*>[\s\S]*?</math>`)
	htmlLinkRegex   = regexp.MustCompile(`(?i)<link[^>]*>[\s\S]*?</link>`)

	// htmlAttributesRegex matches a whole tag that carries at least one
	// double-quoted style, data-*, hidden, or class attribute. It is kept
	// case-sensitive to agree with cleanHTMLAttributes, which strips the
	// lowercase attribute names only.
	htmlAttributesRegex = regexp.MustCompile(`<[^>]*(?:style|data-[\w-]+|hidden|class)="[^"]*"[^>]*>`)

	// collapsedSectionsRegex matches a <details>...</details> section up to the
	// first closing tag. It is kept case-sensitive and attribute-free to agree
	// with makeCollapsedSectionVisible, which trims the literal lowercase tags.
	collapsedSectionsRegex = regexp.MustCompile(`<details>[\s\S]*?</details>`)

	// smallTextRegex matches an element whose inline style sets font-size to
	// 0, 0.x, or a single digit 0-3 in px, pt, em, or %.
	// NOTE(review): this also matches 1em-3em, which is not small text —
	// confirm whether em should be treated differently from px/pt.
	smallTextRegex = regexp.MustCompile(`<[^>]*style="[^"]*font-size:\s*(?:0|0\.\d+|[0-3])(?:px|pt|em|%)[^"]*"[^>]*>[\s\S]*?</[^>]+>`)

	// excessiveWhitespaceRegex matches runs of four or more consecutive
	// newlines (i.e. more than three).
	excessiveWhitespaceRegex = regexp.MustCompile(`\n{4,}`)
)
41+
42+
// Config carries the options that control content filtering.
type Config struct {
	// DisableContentFiltering switches filtering off entirely: when it is
	// true, FilterContent returns its input unchanged.
	DisableContentFiltering bool
}
47+
48+
// DefaultConfig returns the default content filtering configuration
49+
func DefaultConfig() *Config {
50+
return &Config{
51+
DisableContentFiltering: false,
52+
}
53+
}
54+
55+
// FilterContent filters potentially hidden content from the input text
56+
// This includes invisible Unicode characters, HTML comments, and other methods of hiding content
57+
func FilterContent(input string, cfg *Config) string {
58+
if cfg != nil && cfg.DisableContentFiltering {
59+
return input
60+
}
61+
62+
if input == "" {
63+
return input
64+
}
65+
66+
// Process the input text through each filter
67+
result := input
68+
69+
// Remove invisible characters
70+
result = invisibleCharsRegex.ReplaceAllString(result, "")
71+
72+
// Replace HTML comments with a marker
73+
result = htmlCommentsRegex.ReplaceAllString(result, "[HTML_COMMENT]")
74+
75+
// Replace potentially dangerous HTML elements
76+
result = htmlScriptRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
77+
result = htmlStyleRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
78+
result = htmlIframeRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
79+
result = htmlObjectRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
80+
result = htmlEmbedRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
81+
result = htmlSvgRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
82+
result = htmlMathRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
83+
result = htmlLinkRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
84+
85+
// Replace HTML attributes that might be used for hiding
86+
result = htmlAttributesRegex.ReplaceAllStringFunc(result, cleanHTMLAttributes)
87+
88+
// Replace collapsed sections with visible indicator
89+
result = collapsedSectionsRegex.ReplaceAllStringFunc(result, makeCollapsedSectionVisible)
90+
91+
// Replace very small text with visible indicator
92+
result = smallTextRegex.ReplaceAllString(result, "[SMALL_TEXT]")
93+
94+
// Normalize excessive whitespace
95+
result = excessiveWhitespaceRegex.ReplaceAllString(result, "\n\n\n")
96+
97+
return result
98+
}
99+
100+
// hidingAttributesRegex matches one style, data-*, hidden, or class attribute
// with a double-quoted value, together with the whitespace that precedes it.
// It is compiled once at package scope because cleanHTMLAttributes runs once
// per matched tag.
var hidingAttributesRegex = regexp.MustCompile(`\s+(?:style|data-[\w-]+|hidden|class)="[^"]*"`)

// cleanHTMLAttributes removes the style, data-*, hidden, and class attributes
// from a single HTML tag and returns the tag with every other attribute left
// in place.
//
// This is a simple pattern-based implementation: only attributes written with
// a double-quoted value and preceded by whitespace are removed. A more
// sophisticated implementation would parse the HTML and selectively remove
// attributes.
func cleanHTMLAttributes(tag string) string {
	return hidingAttributesRegex.ReplaceAllString(tag, "")
}
107+
108+
// summaryTagRegex captures the text of the first <summary>...</summary>
// element. The (?s) flag lets "." match newlines so that multi-line summaries
// are captured as well. It is compiled once at package scope because
// makeCollapsedSectionVisible runs once per matched section.
var summaryTagRegex = regexp.MustCompile(`(?s)<summary>(.*?)</summary>`)

// makeCollapsedSectionVisible transforms a <details> section into plainly
// visible text of the form "\n\n**<summary>:**\n<content>\n\n".
//
// The summary is the text of the first <summary> element, or
// "Collapsed section" when there is none. The content is everything else
// inside the outer <details>...</details> pair; nested <details> sections are
// left as they are. No text is discarded: anything that is not the extracted
// summary element stays in the content.
func makeCollapsedSectionVisible(detailsSection string) string {
	// Drop the outer wrapper tags, if present.
	body := strings.TrimPrefix(detailsSection, "<details>")
	body = strings.TrimSuffix(body, "</details>")

	summary := "Collapsed section"
	if loc := summaryTagRegex.FindStringSubmatchIndex(body); loc != nil {
		// loc[0]:loc[1] spans the whole <summary> element, loc[2]:loc[3] its
		// inner text. Cut the element out and keep the text on both sides.
		summary = body[loc[2]:loc[3]]
		body = body[:loc[0]] + body[loc[1]:]
	}

	// Format as a visible section.
	return "\n\n**" + summary + ":**\n" + body + "\n\n"
}

pkg/filtering/content_filter_test.go

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
package filtering
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestFilterContent(t *testing.T) {
8+
tests := []struct {
9+
name string
10+
input string
11+
expected string
12+
cfg *Config
13+
}{
14+
{
15+
name: "Empty string",
16+
input: "",
17+
expected: "",
18+
cfg: DefaultConfig(),
19+
},
20+
{
21+
name: "Normal text without hidden content",
22+
input: "This is normal text without any hidden content.",
23+
expected: "This is normal text without any hidden content.",
24+
cfg: DefaultConfig(),
25+
},
26+
{
27+
name: "Text with invisible characters",
28+
input: "Hidden\u200Bcharacters\u200Bin\u200Bthis\u200Btext",
29+
expected: "Hiddencharactersinthistext",
30+
cfg: DefaultConfig(),
31+
},
32+
{
33+
name: "Text with HTML comments",
34+
input: "This has a <!-- hidden comment --> in it.",
35+
expected: "This has a [HTML_COMMENT] in it.",
36+
cfg: DefaultConfig(),
37+
},
38+
{
39+
name: "Text with HTML elements",
40+
input: "This has <script>alert('hidden')</script> scripts.",
41+
expected: "This has [HTML_ELEMENT] scripts.",
42+
cfg: DefaultConfig(),
43+
},
44+
{
45+
name: "Text with details/summary",
46+
input: "Collapsed content: <details><summary>Click me</summary>Hidden content</details>",
47+
expected: "Collapsed content: \n\n**Click me:**\nHidden content\n\n",
48+
cfg: DefaultConfig(),
49+
},
50+
{
51+
name: "Text with small font",
52+
input: "This has <span style=\"font-size:1px\">hidden tiny text</span> in it.",
53+
expected: "This has <span>hidden tiny text</span> in it.",
54+
cfg: DefaultConfig(),
55+
},
56+
{
57+
name: "Text with excessive whitespace",
58+
input: "Line 1\n\n\n\n\n\nLine 2",
59+
expected: "Line 1\n\n\nLine 2",
60+
cfg: DefaultConfig(),
61+
},
62+
{
63+
name: "Text with HTML attributes",
64+
input: "<p data-hidden=\"true\" style=\"display:none\">Hidden paragraph</p>",
65+
expected: "<p>Hidden paragraph</p>",
66+
cfg: DefaultConfig(),
67+
},
68+
{
69+
name: "Filtering disabled",
70+
input: "Hidden\u200Bcharacters and <!-- comments -->",
71+
expected: "Hidden\u200Bcharacters and <!-- comments -->",
72+
cfg: &Config{DisableContentFiltering: true},
73+
},
74+
{
75+
name: "Nil config uses default (filtering enabled)",
76+
input: "Hidden\u200Bcharacters",
77+
expected: "Hiddencharacters",
78+
cfg: nil,
79+
},
80+
{
81+
name: "Normal markdown with code blocks",
82+
input: "# Title\n\n```go\nfunc main() {\n fmt.Println(\"Hello, world!\")\n}\n```",
83+
expected: "# Title\n\n```go\nfunc main() {\n fmt.Println(\"Hello, world!\")\n}\n```",
84+
cfg: DefaultConfig(),
85+
},
86+
{
87+
name: "GitHub flavored markdown with tables",
88+
input: "| Header 1 | Header 2 |\n| -------- | -------- |\n| Cell 1 | Cell 2 |",
89+
expected: "| Header 1 | Header 2 |\n| -------- | -------- |\n| Cell 1 | Cell 2 |",
90+
cfg: DefaultConfig(),
91+
},
92+
}
93+
94+
for _, tc := range tests {
95+
t.Run(tc.name, func(t *testing.T) {
96+
result := FilterContent(tc.input, tc.cfg)
97+
if result != tc.expected {
98+
t.Errorf("FilterContent() = %q, want %q", result, tc.expected)
99+
}
100+
})
101+
}
102+
}
103+
104+
func TestMakeCollapsedSectionVisible(t *testing.T) {
105+
tests := []struct {
106+
name string
107+
input string
108+
expected string
109+
}{
110+
{
111+
name: "Simple details/summary",
112+
input: "<details><summary>Click me</summary>Hidden content</details>",
113+
expected: "\n\n**Click me:**\nHidden content\n\n",
114+
},
115+
{
116+
name: "Details without summary",
117+
input: "<details>Hidden content</details>",
118+
expected: "\n\n**Collapsed section:**\nHidden content\n\n",
119+
},
120+
{
121+
name: "Nested content",
122+
input: "<details><summary>Outer</summary>Content<details><summary>Inner</summary>Nested</details></details>",
123+
expected: "\n\n**Outer:**\nContent<details><summary>Inner</summary>Nested</details>\n\n",
124+
},
125+
}
126+
127+
for _, tc := range tests {
128+
t.Run(tc.name, func(t *testing.T) {
129+
result := makeCollapsedSectionVisible(tc.input)
130+
if result != tc.expected {
131+
t.Errorf("makeCollapsedSectionVisible() = %q, want %q", result, tc.expected)
132+
}
133+
})
134+
}
135+
}
136+
137+
func TestCleanHTMLAttributes(t *testing.T) {
138+
tests := []struct {
139+
name string
140+
input string
141+
expected string
142+
}{
143+
{
144+
name: "Tag with style attribute",
145+
input: "<p style=\"display:none\">Hidden</p>",
146+
expected: "<p>Hidden</p>",
147+
},
148+
{
149+
name: "Tag with data attribute",
150+
input: "<p data-hidden=\"true\">Hidden</p>",
151+
expected: "<p>Hidden</p>",
152+
},
153+
{
154+
name: "Tag with multiple attributes",
155+
input: "<p id=\"para\" style=\"display:none\" data-test=\"value\">Hidden</p>",
156+
expected: "<p id=\"para\">Hidden</p>",
157+
},
158+
{
159+
name: "Tag with allowed attributes",
160+
input: "<a href=\"https://example.com\" target=\"_blank\">Link</a>",
161+
expected: "<a href=\"https://example.com\" target=\"_blank\">Link</a>",
162+
},
163+
}
164+
165+
for _, tc := range tests {
166+
t.Run(tc.name, func(t *testing.T) {
167+
result := cleanHTMLAttributes(tc.input)
168+
if result != tc.expected {
169+
t.Errorf("cleanHTMLAttributes() = %q, want %q", result, tc.expected)
170+
}
171+
})
172+
}
173+
}

0 commit comments

Comments
 (0)