Skip to content

Commit be061b5

Browse files
committed
fix(session): use LLM provider abstraction for content extraction
1 parent cb5434b commit be061b5

File tree

5 files changed

+249
-104
lines changed

5 files changed

+249
-104
lines changed

packages/agent/src/core/types.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import { ModelProvider } from './toolAgent/config.js';
1111

1212
export type TokenLevel = 'debug' | 'info' | 'log' | 'warn' | 'error';
1313

14-
export type pageFilter = 'simple' | 'none' | 'readability';
14+
export type pageFilter = 'raw' | 'smartMarkdown';
1515

1616
export type ToolContext = {
1717
logger: Logger;
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
2+
import { Page } from 'playwright';
3+
import { filterPageContent } from './filterPageContent';
4+
import { ToolContext } from '../../../core/types';
5+
6+
// HTML content to use in tests
7+
const HTML_CONTENT = '<html><body><h1>Test Content</h1></body></html>';
8+
const MARKDOWN_CONTENT = '# Test Content\n\nThis is the extracted content from the page.';
9+
10+
// Mock the Page object
11+
const mockPage = {
12+
content: vi.fn().mockResolvedValue(HTML_CONTENT),
13+
url: vi.fn().mockReturnValue('https://example.com'),
14+
evaluate: vi.fn(),
15+
} as unknown as Page;
16+
17+
// Mock fetch for LLM calls
18+
global.fetch = vi.fn();
19+
20+
describe('filterPageContent', () => {
21+
let mockContext: ToolContext;
22+
23+
beforeEach(() => {
24+
mockContext = {
25+
logger: {
26+
debug: vi.fn(),
27+
log: vi.fn(),
28+
warn: vi.fn(),
29+
error: vi.fn(),
30+
info: vi.fn(),
31+
},
32+
provider: 'openai',
33+
model: 'gpt-4',
34+
apiKey: 'test-api-key',
35+
baseUrl: 'https://api.openai.com/v1/chat/completions',
36+
maxTokens: 4000,
37+
temperature: 0.3,
38+
} as unknown as ToolContext;
39+
40+
// Reset mocks
41+
vi.resetAllMocks();
42+
43+
// Mock the content method to return the HTML_CONTENT
44+
mockPage.content.mockResolvedValue(HTML_CONTENT);
45+
46+
// Mock fetch to return a successful response
47+
(global.fetch as any).mockResolvedValue({
48+
ok: true,
49+
json: async () => ({
50+
choices: [
51+
{
52+
message: {
53+
content: MARKDOWN_CONTENT,
54+
},
55+
},
56+
],
57+
}),
58+
});
59+
});
60+
61+
afterEach(() => {
62+
vi.clearAllMocks();
63+
});
64+
65+
it('should return raw DOM content with raw filter', async () => {
66+
const result = await filterPageContent(mockPage, 'raw', mockContext);
67+
68+
expect(mockPage.content).toHaveBeenCalled();
69+
expect(result).toEqual(HTML_CONTENT);
70+
});
71+
72+
it('should use LLM to extract content with smartMarkdown filter', async () => {
73+
const result = await filterPageContent(mockPage, 'smartMarkdown', mockContext);
74+
75+
expect(mockPage.content).toHaveBeenCalled();
76+
expect(global.fetch).toHaveBeenCalledWith(
77+
'https://api.openai.com/v1/chat/completions',
78+
expect.objectContaining({
79+
method: 'POST',
80+
headers: expect.objectContaining({
81+
'Authorization': 'Bearer test-api-key',
82+
}),
83+
body: expect.any(String),
84+
})
85+
);
86+
87+
// Verify the result is the markdown content from the LLM
88+
expect(result).toEqual(MARKDOWN_CONTENT);
89+
});
90+
91+
it('should fall back to raw DOM if LLM call fails', async () => {
92+
// Mock fetch to return an error
93+
(global.fetch as any).mockResolvedValue({
94+
ok: false,
95+
text: async () => 'API Error',
96+
});
97+
98+
const result = await filterPageContent(mockPage, 'smartMarkdown', mockContext);
99+
100+
expect(mockPage.content).toHaveBeenCalled();
101+
expect(mockContext.logger.error).toHaveBeenCalled();
102+
expect(result).toEqual(HTML_CONTENT);
103+
});
104+
105+
it('should fall back to raw DOM if context is not provided for smartMarkdown', async () => {
106+
// Context with only a logger (no provider or model), which triggers the raw-DOM fallback
107+
const minimalContext = {
108+
logger: {
109+
debug: vi.fn(),
110+
log: vi.fn(),
111+
warn: vi.fn(),
112+
error: vi.fn(),
113+
info: vi.fn(),
114+
}
115+
} as unknown as ToolContext;
116+
117+
const result = await filterPageContent(mockPage, 'smartMarkdown', minimalContext);
118+
119+
expect(mockPage.content).toHaveBeenCalled();
120+
expect(minimalContext.logger.warn).toHaveBeenCalled();
121+
expect(result).toEqual(HTML_CONTENT);
122+
});
123+
});
Lines changed: 83 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,116 +1,121 @@
11
import { Readability } from '@mozilla/readability';
22
import { JSDOM } from 'jsdom';
33
import { Page } from 'playwright';
4+
import { ToolContext } from '../../../core/types.js';
45

56
const OUTPUT_LIMIT = 11 * 1024; // 11 * 1024 = 11,264 characters; longer results are truncated
67

78
/**
89
* Returns the raw HTML content of the page without any processing
910
*/
10-
async function getNoneProcessedDOM(page: Page): Promise<string> {
11-
return await page.content();
11+
async function getRawDOM(page: Page): Promise<string> {
12+
const content = await page.content();
13+
return content;
1214
}
1315

1416
/**
15-
* Processes the page using Mozilla's Readability to extract the main content
16-
* Falls back to simple processing if Readability fails
17+
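* Uses an LLM to extract the main content from a page and format it as markdown; falls back to the raw DOM if the provider or model is missing, the LLM call fails, or it returns empty content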
1718
*/
18-
async function getReadabilityProcessedDOM(page: Page): Promise<string> {
19+
async function getSmartMarkdownContent(page: Page, context: ToolContext): Promise<string> {
1920
try {
2021
const html = await page.content();
2122
const url = page.url();
22-
const dom = new JSDOM(html, { url });
23-
const reader = new Readability(dom.window.document);
24-
const article = reader.parse();
23+
24+
// Create a system prompt for the LLM
25+
const systemPrompt = `You are an expert at extracting the main content from web pages.
26+
Given the HTML content of a webpage, extract only the main informative content.
27+
Format the extracted content as clean, well-structured markdown.
28+
Ignore headers, footers, navigation, sidebars, ads, and other non-content elements.
29+
Preserve the important headings, paragraphs, lists, and other content structures.
30+
Do not include any explanations or descriptions about what you're doing.
31+
Just return the extracted content as markdown.`;
2532

26-
if (!article) {
27-
console.warn(
28-
'Readability could not parse the page, falling back to simple mode',
29-
);
30-
return getSimpleProcessedDOM(page);
33+
// Use the configured LLM to extract the content
34+
const { provider, model, apiKey, baseUrl } = context;
35+
36+
if (!provider || !model) {
37+
context.logger.warn('LLM provider or model not available, falling back to raw DOM');
38+
return getRawDOM(page);
3139
}
3240

33-
// Return a formatted version of the article
34-
return JSON.stringify(
35-
{
36-
url: url,
37-
title: article.title || '',
38-
content: article.content || '',
39-
textContent: article.textContent || '',
40-
excerpt: article.excerpt || '',
41-
byline: article.byline || '',
42-
dir: article.dir || '',
43-
siteName: article.siteName || '',
44-
length: article.length || 0,
45-
},
46-
null,
47-
2,
48-
);
41+
try {
42+
// Import the createProvider function from the provider module
43+
const { createProvider } = await import('../../../core/llm/provider.js');
44+
45+
// Create a provider instance using the provider abstraction
46+
const llmProvider = createProvider(provider, model, {
47+
apiKey,
48+
baseUrl
49+
});
50+
51+
// Generate text using the provider
52+
const response = await llmProvider.generateText({
53+
messages: [
54+
{
55+
role: 'system',
56+
content: systemPrompt
57+
},
58+
{
59+
role: 'user',
60+
content: `URL: ${url}\n\nHTML content:\n${html}`
61+
}
62+
],
63+
temperature: 0.3,
64+
maxTokens: 4000
65+
});
66+
67+
// Extract the markdown content from the response
68+
const markdown = response.text;
69+
70+
if (!markdown) {
71+
context.logger.warn('LLM returned empty content, falling back to raw DOM');
72+
return getRawDOM(page);
73+
}
74+
75+
// Log token usage for monitoring
76+
context.logger.debug(`Token usage for content extraction: ${JSON.stringify(response.tokenUsage)}`);
77+
78+
return markdown;
79+
} catch (llmError) {
80+
context.logger.error('Error using LLM provider for content extraction:', llmError);
81+
return getRawDOM(page);
82+
}
4983
} catch (error) {
50-
console.error('Error using Readability:', error);
51-
// Fallback to simple mode if Readability fails
52-
return getSimpleProcessedDOM(page);
84+
context.logger.error('Error using LLM for content extraction:', error);
85+
// Fallback to raw mode if LLM processing fails
86+
return getRawDOM(page);
5387
}
5488
}
5589

56-
/**
57-
* Processes the page by removing invisible elements and non-visual tags
58-
*/
59-
async function getSimpleProcessedDOM(page: Page): Promise<string> {
60-
const domContent = await page.evaluate(() => {
61-
const clone = document.documentElement;
62-
63-
const elements = clone.querySelectorAll('*');
64-
65-
const elementsToRemove: Element[] = [];
66-
elements.forEach((element) => {
67-
const computedStyle = window.getComputedStyle(element);
68-
const isVisible =
69-
computedStyle.display !== 'none' &&
70-
computedStyle.visibility !== 'hidden' &&
71-
computedStyle.opacity !== '0';
72-
73-
if (!isVisible) {
74-
elementsToRemove.push(element);
75-
}
76-
});
77-
78-
const nonVisualTags = clone.querySelectorAll(
79-
'noscript, iframe, link[rel="stylesheet"], meta, svg, img, symbol, path, style, script',
80-
);
81-
nonVisualTags.forEach((element) => elementsToRemove.push(element));
82-
83-
elementsToRemove.forEach((element) => element.remove());
84-
85-
return clone.outerHTML;
86-
});
87-
88-
return domContent.replace(/\n/g, '').replace(/\s+/g, ' ');
89-
}
90-
9190
/**
9291
* Gets the content of a page using the specified filter, truncating results longer than OUTPUT_LIMIT characters
9392
*/
9493
export async function filterPageContent(
9594
page: Page,
96-
pageFilter: 'simple' | 'none' | 'readability',
95+
pageFilter: 'raw' | 'smartMarkdown',
96+
context?: ToolContext
9797
): Promise<string> {
9898
let result: string = '';
99+
99100
switch (pageFilter) {
100-
case 'none':
101-
result = await getNoneProcessedDOM(page);
102-
break;
103-
case 'readability':
104-
result = await getReadabilityProcessedDOM(page);
101+
case 'smartMarkdown':
102+
if (!context) {
103+
console.warn('ToolContext required for smartMarkdown filter but not provided, falling back to raw mode');
104+
result = await getRawDOM(page);
105+
} else {
106+
result = await getSmartMarkdownContent(page, context);
107+
}
105108
break;
106-
case 'simple':
109+
case 'raw':
107110
default:
108-
result = await getSimpleProcessedDOM(page);
111+
result = await getRawDOM(page);
109112
break;
110113
}
111114

112-
if (result.length > OUTPUT_LIMIT) {
113-
return result.slice(0, OUTPUT_LIMIT) + '...(truncated)';
115+
// Ensure result is a string before checking length
116+
const resultString = result || '';
117+
if (resultString.length > OUTPUT_LIMIT) {
118+
return resultString.slice(0, OUTPUT_LIMIT) + '...(truncated)';
114119
}
115-
return result;
120+
return resultString;
116121
}

0 commit comments

Comments
 (0)