|
1 | 1 | import { Readability } from '@mozilla/readability';
|
2 | 2 | import { JSDOM } from 'jsdom';
|
3 | 3 | import { Page } from 'playwright';
|
| 4 | +import { ToolContext } from '../../../core/types.js'; |
4 | 5 |
|
5 | 6 | const OUTPUT_LIMIT = 11 * 1024; // 11 KiB limit (11,264 characters)
|
6 | 7 |
|
7 | 8 | /**
|
8 | 9 | * Returns the raw HTML content of the page without any processing
|
9 | 10 | */
|
10 |
| -async function getNoneProcessedDOM(page: Page): Promise<string> { |
11 |
| - return await page.content(); |
| 11 | +async function getRawDOM(page: Page): Promise<string> { |
| 12 | + const content = await page.content(); |
| 13 | + return content; |
12 | 14 | }
|
13 | 15 |
|
14 | 16 | /**
|
15 |
| - * Processes the page using Mozilla's Readability to extract the main content |
16 |
| - * Falls back to simple processing if Readability fails |
| 17 | + * Uses an LLM to extract the main content from a page and format it as markdown |
17 | 18 | */
|
18 |
| -async function getReadabilityProcessedDOM(page: Page): Promise<string> { |
| 19 | +async function getSmartMarkdownContent(page: Page, context: ToolContext): Promise<string> { |
19 | 20 | try {
|
20 | 21 | const html = await page.content();
|
21 | 22 | const url = page.url();
|
22 |
| - const dom = new JSDOM(html, { url }); |
23 |
| - const reader = new Readability(dom.window.document); |
24 |
| - const article = reader.parse(); |
| 23 | + |
| 24 | + // Create a system prompt for the LLM |
| 25 | + const systemPrompt = `You are an expert at extracting the main content from web pages. |
| 26 | +Given the HTML content of a webpage, extract only the main informative content. |
| 27 | +Format the extracted content as clean, well-structured markdown. |
| 28 | +Ignore headers, footers, navigation, sidebars, ads, and other non-content elements. |
| 29 | +Preserve the important headings, paragraphs, lists, and other content structures. |
| 30 | +Do not include any explanations or descriptions about what you're doing. |
| 31 | +Just return the extracted content as markdown.`; |
25 | 32 |
|
26 |
| - if (!article) { |
27 |
| - console.warn( |
28 |
| - 'Readability could not parse the page, falling back to simple mode', |
29 |
| - ); |
30 |
| - return getSimpleProcessedDOM(page); |
| 33 | + // Use the configured LLM to extract the content |
| 34 | + const { provider, model, apiKey, baseUrl } = context; |
| 35 | + |
| 36 | + if (!provider || !model) { |
| 37 | + context.logger.warn('LLM provider or model not available, falling back to raw DOM'); |
| 38 | + return getRawDOM(page); |
31 | 39 | }
|
32 | 40 |
|
33 |
| - // Return a formatted version of the article |
34 |
| - return JSON.stringify( |
35 |
| - { |
36 |
| - url: url, |
37 |
| - title: article.title || '', |
38 |
| - content: article.content || '', |
39 |
| - textContent: article.textContent || '', |
40 |
| - excerpt: article.excerpt || '', |
41 |
| - byline: article.byline || '', |
42 |
| - dir: article.dir || '', |
43 |
| - siteName: article.siteName || '', |
44 |
| - length: article.length || 0, |
45 |
| - }, |
46 |
| - null, |
47 |
| - 2, |
48 |
| - ); |
| 41 | + try { |
| 42 | + // Import the createProvider function from the provider module |
| 43 | + const { createProvider } = await import('../../../core/llm/provider.js'); |
| 44 | + |
| 45 | + // Create a provider instance using the provider abstraction |
| 46 | + const llmProvider = createProvider(provider, model, { |
| 47 | + apiKey, |
| 48 | + baseUrl |
| 49 | + }); |
| 50 | + |
| 51 | + // Generate text using the provider |
| 52 | + const response = await llmProvider.generateText({ |
| 53 | + messages: [ |
| 54 | + { |
| 55 | + role: 'system', |
| 56 | + content: systemPrompt |
| 57 | + }, |
| 58 | + { |
| 59 | + role: 'user', |
| 60 | + content: `URL: ${url}\n\nHTML content:\n${html}` |
| 61 | + } |
| 62 | + ], |
| 63 | + temperature: 0.3, |
| 64 | + maxTokens: 4000 |
| 65 | + }); |
| 66 | + |
| 67 | + // Extract the markdown content from the response |
| 68 | + const markdown = response.text; |
| 69 | + |
| 70 | + if (!markdown) { |
| 71 | + context.logger.warn('LLM returned empty content, falling back to raw DOM'); |
| 72 | + return getRawDOM(page); |
| 73 | + } |
| 74 | + |
| 75 | + // Log token usage for monitoring |
| 76 | + context.logger.debug(`Token usage for content extraction: ${JSON.stringify(response.tokenUsage)}`); |
| 77 | + |
| 78 | + return markdown; |
| 79 | + } catch (llmError) { |
| 80 | + context.logger.error('Error using LLM provider for content extraction:', llmError); |
| 81 | + return getRawDOM(page); |
| 82 | + } |
49 | 83 | } catch (error) {
|
50 |
| - console.error('Error using Readability:', error); |
51 |
| - // Fallback to simple mode if Readability fails |
52 |
| - return getSimpleProcessedDOM(page); |
| 84 | + context.logger.error('Error using LLM for content extraction:', error); |
| 85 | + // Fallback to raw mode if LLM processing fails |
| 86 | + return getRawDOM(page); |
53 | 87 | }
|
54 | 88 | }
|
55 | 89 |
|
56 |
| -/** |
57 |
| - * Processes the page by removing invisible elements and non-visual tags |
58 |
| - */ |
59 |
| -async function getSimpleProcessedDOM(page: Page): Promise<string> { |
60 |
| - const domContent = await page.evaluate(() => { |
61 |
| - const clone = document.documentElement; |
62 |
| - |
63 |
| - const elements = clone.querySelectorAll('*'); |
64 |
| - |
65 |
| - const elementsToRemove: Element[] = []; |
66 |
| - elements.forEach((element) => { |
67 |
| - const computedStyle = window.getComputedStyle(element); |
68 |
| - const isVisible = |
69 |
| - computedStyle.display !== 'none' && |
70 |
| - computedStyle.visibility !== 'hidden' && |
71 |
| - computedStyle.opacity !== '0'; |
72 |
| - |
73 |
| - if (!isVisible) { |
74 |
| - elementsToRemove.push(element); |
75 |
| - } |
76 |
| - }); |
77 |
| - |
78 |
| - const nonVisualTags = clone.querySelectorAll( |
79 |
| - 'noscript, iframe, link[rel="stylesheet"], meta, svg, img, symbol, path, style, script', |
80 |
| - ); |
81 |
| - nonVisualTags.forEach((element) => elementsToRemove.push(element)); |
82 |
| - |
83 |
| - elementsToRemove.forEach((element) => element.remove()); |
84 |
| - |
85 |
| - return clone.outerHTML; |
86 |
| - }); |
87 |
| - |
88 |
| - return domContent.replace(/\n/g, '').replace(/\s+/g, ' '); |
89 |
| -} |
90 |
| - |
91 | 90 | /**
|
92 | 91 | * Gets the rendered DOM of a page using the specified processing method
|
93 | 92 | */
|
94 | 93 | export async function filterPageContent(
|
95 | 94 | page: Page,
|
96 |
| - pageFilter: 'simple' | 'none' | 'readability', |
| 95 | + pageFilter: 'raw' | 'smartMarkdown', |
| 96 | + context?: ToolContext |
97 | 97 | ): Promise<string> {
|
98 | 98 | let result: string = '';
|
| 99 | + |
99 | 100 | switch (pageFilter) {
|
100 |
| - case 'none': |
101 |
| - result = await getNoneProcessedDOM(page); |
102 |
| - break; |
103 |
| - case 'readability': |
104 |
| - result = await getReadabilityProcessedDOM(page); |
| 101 | + case 'smartMarkdown': |
| 102 | + if (!context) { |
| 103 | + console.warn('ToolContext required for smartMarkdown filter but not provided, falling back to raw mode'); |
| 104 | + result = await getRawDOM(page); |
| 105 | + } else { |
| 106 | + result = await getSmartMarkdownContent(page, context); |
| 107 | + } |
105 | 108 | break;
|
106 |
| - case 'simple': |
| 109 | + case 'raw': |
107 | 110 | default:
|
108 |
| - result = await getSimpleProcessedDOM(page); |
| 111 | + result = await getRawDOM(page); |
109 | 112 | break;
|
110 | 113 | }
|
111 | 114 |
|
112 |
| - if (result.length > OUTPUT_LIMIT) { |
113 |
| - return result.slice(0, OUTPUT_LIMIT) + '...(truncated)'; |
| 115 | + // Ensure result is a string before checking length |
| 116 | + const resultString = result || ''; |
| 117 | + if (resultString.length > OUTPUT_LIMIT) { |
| 118 | + return resultString.slice(0, OUTPUT_LIMIT) + '...(truncated)'; |
114 | 119 | }
|
115 |
| - return result; |
| 120 | + return resultString; |
116 | 121 | }
|
0 commit comments