From 92a2c0b6c082d93dbfb14e6d243505946ed6a56c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 21 Oct 2024 15:55:33 +0000 Subject: [PATCH 1/3] Update changelog (#856) Co-authored-by: alenakhineika <16307679+alenakhineika@users.noreply.github.com> --- CHANGELOG.md | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a2ee7379..b778c4f6c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Change Log +## [v1.9.2](https://github.com/mongodb-js/vscode/releases/tag/v1.9.2) - 2024-10-21 + +## What's Changed +* fix(chat): show empty docs msg, schema set msg content correctly VSCODE-628 by @Anemy in https://github.com/mongodb-js/vscode/pull/851 +* chore(participant): move docs references after content VSCODE-629 by @Anemy in https://github.com/mongodb-js/vscode/pull/852 +* fix: use new connection form by @paula-stacho in https://github.com/mongodb-js/vscode/pull/815 +* chore(chat): update docs chatbot request headers VSCODE-634 by @Anemy in https://github.com/mongodb-js/vscode/pull/853 +* chore: use the latest vsce that adds the chat-participant tag VSCODE-638 by @alenakhineika in https://github.com/mongodb-js/vscode/pull/855 + + +**Full Changelog**: https://github.com/mongodb-js/vscode/compare/v1.9.1...v1.9.2 + + ## [v1.9.1](https://github.com/mongodb-js/vscode/releases/tag/v1.9.1) - 2024-09-30 ## What's Changed @@ -401,14 +414,3 @@ To dig deeper please feel free to follow the links mentioned below: - Fix launching mongodb shell with ssl in bash (VSCODE-227, #270) -## [v0.4.2](https://github.com/mongodb-js/vscode/releases/tag/v0.4.2) - 2021-02-17 - -### Added - -- Add icon to refresh collection documents list (#264) - -### Changed - -- Updated the Atlas link to have https (#259) - - From cace4df75f213c427109f70bab4d0e3280422631 Mon Sep 17 00:00:00 2001 From: Nikola Irinchev Date: Thu, 24 Oct 2024 11:39:28 +0200 Subject: [PATCH 2/3] VSCODE-640: Adapt message content access to latest vscode API (#857) * Adapt message content access to latest vscode API * Address CR comments --- .vscode/launch.json | 14 ++ src/participant/participant.ts | 21 +-- src/participant/prompts/index.ts | 5 +- src/participant/prompts/promptBase.ts | 48 +++++- .../suite/participant/participant.test.ts | 138 ++++++++++++------ 5 files changed, 169 insertions(+), 57 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index e7f6ca212..9d596e5fd 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -40,6 +40,20 @@ "outFiles": [ "${workspaceFolder}/dist/**/*.js" ] + }, + { + "name": "Run Tests", + "type": "extensionHost", + "request": "launch", + "runtimeExecutable": "${execPath}", + "args": [ + "${workspaceFolder}/out/test/suite", // TODO: VSCODE-641 - remove suite + "--disable-extensions", + "--extensionDevelopmentPath=${workspaceFolder}", + "--extensionTestsPath=${workspaceFolder}/out/test/suite" + ], + "outFiles": ["${workspaceFolder}/out/**/*.js"], + "preLaunchTask": "npm: compile:extension", } ], "compounds": [ diff --git a/src/participant/participant.ts b/src/participant/participant.ts index 07c725ca0..5dbf97b31 100644 --- a/src/participant/participant.ts +++ b/src/participant/participant.ts @@ -10,7 +10,7 @@ import type { LoadedConnection } from '../storage/connectionStorage'; import EXTENSION_COMMANDS from '../commands'; import type { StorageController } from '../storage'; import { StorageVariables } from '../storage'; -import { Prompts } from './prompts'; +import { getContentLength, Prompts } from './prompts'; import type { ChatResult } from './constants'; import { askToConnectChatResult, @@ -189,7 +189,7 @@ export default class ParticipantController { (message: vscode.LanguageModelChatMessage) => util.inspect({ role: message.role, - contentLength: message.content.length, + contentLength: getContentLength(message), }) ), }); @@ -790,15 +790,18 @@ export default class ParticipantController { // it currently errors (not on insiders, only main VSCode). // Here we're defaulting to have some content as a workaround. // TODO: Remove this when the issue is fixed. - messagesWithNamespace.messages[ - messagesWithNamespace.messages.length - 1 - // eslint-disable-next-line new-cap - ] = vscode.LanguageModelChatMessage.User( + if ( + !Prompts.doMessagesContainUserInput([ + messagesWithNamespace.messages[ + messagesWithNamespace.messages.length - 1 + ], + ]) + ) { messagesWithNamespace.messages[ messagesWithNamespace.messages.length - 1 - ].content.trim() || 'see previous messages' - ); - + // eslint-disable-next-line new-cap + ] = vscode.LanguageModelChatMessage.User('see previous messages'); + } const responseContentWithNamespace = await this.getChatResponseContent({ modelInput: messagesWithNamespace, token, diff --git a/src/participant/prompts/index.ts b/src/participant/prompts/index.ts index 18e4150af..5324f2847 100644 --- a/src/participant/prompts/index.ts +++ b/src/participant/prompts/index.ts @@ -5,6 +5,9 @@ import { IntentPrompt } from './intent'; import { NamespacePrompt } from './namespace'; import { QueryPrompt } from './query'; import { SchemaPrompt } from './schema'; +import { isContentEmpty } from './promptBase'; + +export { getContentLength } from './promptBase'; export class Prompts { public static generic = new GenericPrompt(); @@ -26,7 +29,7 @@ export class Prompts { for (const message of messages) { if ( message.role === vscode.LanguageModelChatMessageRole.User && - message.content.trim().length > 0 + !isContentEmpty(message) ) { return true; } diff --git a/src/participant/prompts/promptBase.ts b/src/participant/prompts/promptBase.ts index 949b4f3d0..38a8ae672 100644 --- a/src/participant/prompts/promptBase.ts +++ b/src/participant/prompts/promptBase.ts @@ -24,6 +24,52 @@ export interface ModelInput { stats: ParticipantPromptProperties; } +export function getContentLength( + message: vscode.LanguageModelChatMessage +): number { + const content = message.content as any; + if (typeof content === 'string') { + return content.trim().length; + } + + // TODO: https://github.com/microsoft/vscode/pull/231788 made it so message.content is no longer a string, + // but an array of things that a message can contain. This will eventually be reflected in the type definitions + // but until then, we're manually checking the array contents to ensure we don't break when this PR gets released + // in the stable channel. + if (Array.isArray(content)) { + return content.reduce((acc: number, element) => { + const value = element?.value ?? element?.content?.value; + if (typeof value === 'string') { + return acc + value.length; + } + + return acc; + }, 0); + } + + return 0; +} + +export function isContentEmpty( + message: vscode.LanguageModelChatMessage +): boolean { + const content = message.content as any; + if (typeof content === 'string') { + return content.trim().length === 0; + } + + if (Array.isArray(content)) { + for (const element of content) { + const value = element?.value ?? element?.content?.value; + if (typeof value === 'string' && value.trim().length > 0) { + return false; + } + } + } + + return true; +} + export abstract class PromptBase { protected abstract getAssistantPrompt(args: TArgs): string; @@ -92,7 +138,7 @@ export abstract class PromptBase { ): ParticipantPromptProperties { return { total_message_length: messages.reduce( - (acc, message) => acc + message.content.length, + (acc, message) => acc + getContentLength(message), 0 ), user_input_length: request.prompt.length, diff --git a/src/test/suite/participant/participant.test.ts b/src/test/suite/participant/participant.test.ts index 38962a84b..094ff3681 100644 --- a/src/test/suite/participant/participant.test.ts +++ b/src/test/suite/participant/participant.test.ts @@ -31,6 +31,7 @@ import { ChatMetadataStore } from '../../../participant/chatMetadata'; import { Prompts } from '../../../participant/prompts'; import { createMarkdownLink } from '../../../participant/markdown'; import EXTENSION_COMMANDS from '../../../commands'; +import { getContentLength } from '../../../participant/prompts/promptBase'; // The Copilot's model in not available in tests, // therefore we need to mock its methods and returning values. @@ -50,6 +51,28 @@ const encodeStringify = (obj: Record): string => { return encodeURIComponent(JSON.stringify(obj)); }; +const getMessageContent = ( + message: vscode.LanguageModelChatMessage +): string => { + const content = message.content as any; + if (typeof content === 'string') { + return content; + } + + if (Array.isArray(content)) { + return content.reduce((agg: string, element) => { + const value = element?.value ?? element?.content?.value; + if (typeof value === 'string') { + return agg + value; + } + + return agg; + }, ''); + } + + return ''; +}; + suite('Participant Controller Test Suite', function () { const extensionContextStub = new ExtensionContextStub(); @@ -514,20 +537,22 @@ suite('Participant Controller Test Suite', function () { const res = await invokeChatHandler(chatRequestMock); expect(sendRequestStub).to.have.been.calledTwice; - const intentRequest = sendRequestStub.firstCall.args[0]; + const intentRequest = sendRequestStub.firstCall + .args[0] as vscode.LanguageModelChatMessage[]; expect(intentRequest).to.have.length(2); - expect(intentRequest[0].content).to.include( + expect(getMessageContent(intentRequest[0])).to.include( 'Your task is to help guide a conversation with a user to the correct handler.' ); - expect(intentRequest[1].content).to.equal( + expect(getMessageContent(intentRequest[1])).to.equal( 'what is the shape of the documents in the pineapple collection?' ); - const genericRequest = sendRequestStub.secondCall.args[0]; + const genericRequest = sendRequestStub.secondCall + .args[0] as vscode.LanguageModelChatMessage[]; expect(genericRequest).to.have.length(2); - expect(genericRequest[0].content).to.include( + expect(getMessageContent(genericRequest[0])).to.include( 'Parse all user messages to find a database name and a collection name.' ); - expect(genericRequest[1].content).to.equal( + expect(getMessageContent(genericRequest[1])).to.equal( 'what is the shape of the documents in the pineapple collection?' ); @@ -544,20 +569,22 @@ suite('Participant Controller Test Suite', function () { const res = await invokeChatHandler(chatRequestMock); expect(sendRequestStub).to.have.been.calledTwice; - const intentRequest = sendRequestStub.firstCall.args[0]; + const intentRequest = sendRequestStub.firstCall + .args[0] as vscode.LanguageModelChatMessage[]; expect(intentRequest).to.have.length(2); - expect(intentRequest[0].content).to.include( + expect(getMessageContent(intentRequest[0])).to.include( 'Your task is to help guide a conversation with a user to the correct handler.' ); - expect(intentRequest[1].content).to.equal( + expect(getMessageContent(intentRequest[1])).to.equal( 'how to find documents in my collection?' ); - const genericRequest = sendRequestStub.secondCall.args[0]; + const genericRequest = sendRequestStub.secondCall + .args[0] as vscode.LanguageModelChatMessage[]; expect(genericRequest).to.have.length(2); - expect(genericRequest[0].content).to.include( + expect(getMessageContent(genericRequest[0])).to.include( 'Your task is to help the user with MongoDB related questions.' ); - expect(genericRequest[1].content).to.equal( + expect(getMessageContent(genericRequest[1])).to.equal( 'how to find documents in my collection?' ); @@ -648,8 +675,9 @@ suite('Participant Controller Test Suite', function () { references: [], }; await invokeChatHandler(chatRequestMock); - const messages = sendRequestStub.secondCall.args[0]; - expect(messages[1].content).to.include( + const messages = sendRequestStub.secondCall + .args[0] as vscode.LanguageModelChatMessage[]; + expect(getMessageContent(messages[1])).to.include( 'Collection schema: _id: ObjectId\n' + 'field.stringField: String\n' + 'field.arrayField: Array\n' @@ -712,8 +740,9 @@ suite('Participant Controller Test Suite', function () { references: [], }; await invokeChatHandler(chatRequestMock); - const messages = sendRequestStub.secondCall.args[0]; - expect(messages[1].content).to.include( + const messages = sendRequestStub.secondCall + .args[0] as vscode.LanguageModelChatMessage[]; + expect(getMessageContent(messages[1])).to.include( 'Sample documents: [\n' + ' {\n' + " _id: ObjectId('63ed1d522d8573fa5c203661'),\n" + @@ -781,8 +810,9 @@ suite('Participant Controller Test Suite', function () { references: [], }; await invokeChatHandler(chatRequestMock); - const messages = sendRequestStub.secondCall.args[0]; - expect(messages[1].content).to.include( + const messages = sendRequestStub.secondCall + .args[0] as vscode.LanguageModelChatMessage[]; + expect(getMessageContent(messages[1])).to.include( 'Sample document: {\n' + " _id: ObjectId('63ed1d522d8573fa5c203660'),\n" + ' field: {\n' + @@ -844,8 +874,9 @@ suite('Participant Controller Test Suite', function () { references: [], }; await invokeChatHandler(chatRequestMock); - const messages = sendRequestStub.secondCall.args[0]; - expect(messages[1].content).to.include( + const messages = sendRequestStub.secondCall + .args[0] as vscode.LanguageModelChatMessage[]; + expect(getMessageContent(messages[1])).to.include( 'Sample document: {\n' + " _id: ObjectId('63ed1d522d8573fa5c203661'),\n" + ' field: {\n' + @@ -904,8 +935,11 @@ suite('Participant Controller Test Suite', function () { references: [], }; await invokeChatHandler(chatRequestMock); - const messages = sendRequestStub.secondCall.args[0]; - expect(messages[1].content).to.not.include('Sample documents'); + const messages = sendRequestStub.secondCall + .args[0] as vscode.LanguageModelChatMessage[]; + expect(getMessageContent(messages[1])).to.not.include( + 'Sample documents' + ); assertCommandTelemetry('query', chatRequestMock, { callIndex: 0, @@ -932,8 +966,11 @@ suite('Participant Controller Test Suite', function () { references: [], }; await invokeChatHandler(chatRequestMock); - const messages = sendRequestStub.secondCall.args[0]; - expect(messages[1].content).to.not.include('Sample documents'); + const messages = sendRequestStub.secondCall + .args[0] as vscode.LanguageModelChatMessage[]; + expect(getMessageContent(messages[1])).to.not.include( + 'Sample documents' + ); assertCommandTelemetry('query', chatRequestMock, { callIndex: 0, @@ -1374,7 +1411,9 @@ suite('Participant Controller Test Suite', function () { await invokeChatHandler(chatRequestMock); expect(sendRequestStub.calledOnce).to.be.true; - expect(sendRequestStub.firstCall.args[0][0].content).to.include( + const messages = sendRequestStub.firstCall + .args[0] as vscode.LanguageModelChatMessage[]; + expect(getMessageContent(messages[0])).to.include( 'Parse all user messages to find a database name and a collection name.' ); @@ -1422,10 +1461,13 @@ suite('Participant Controller Test Suite', function () { await invokeChatHandler(chatRequestMock); expect(sendRequestStub.calledOnce).to.be.true; - expect(sendRequestStub.firstCall.args[0][0].content).to.include( + + const messages = sendRequestStub.firstCall + .args[0] as vscode.LanguageModelChatMessage[]; + expect(getMessageContent(messages[0])).to.include( 'Parse all user messages to find a database name and a collection name.' ); - expect(sendRequestStub.firstCall.args[0][3].content).to.include( + expect(getMessageContent(messages[3])).to.include( 'see previous messages' ); }); @@ -1528,11 +1570,12 @@ suite('Participant Controller Test Suite', function () { references: [], }; await invokeChatHandler(chatRequestMock); - const messages = sendRequestStub.secondCall.args[0]; - expect(messages[0].content).to.include( + const messages = sendRequestStub.secondCall + .args[0] as vscode.LanguageModelChatMessage[]; + expect(getMessageContent(messages[0])).to.include( 'Amount of documents sampled: 2' ); - expect(messages[1].content).to.include( + expect(getMessageContent(messages[1])).to.include( `Database name: dbOne Collection name: collOne Schema: @@ -1540,7 +1583,8 @@ Schema: "count": 2, "fields": [` ); - expect(messages[1].content).to.include(`"name": "arrayField", + expect(getMessageContent(messages[1])).to + .include(`"name": "arrayField", "path": [ "field", "arrayField" @@ -1703,7 +1747,7 @@ Schema: expect(stats.has_sample_documents).to.be.false; expect(stats.user_input_length).to.equal(chatRequestMock.prompt.length); expect(stats.total_message_length).to.equal( - messages[0].content.length + messages[1].content.length + getContentLength(messages[0]) + getContentLength(messages[1]) ); }); @@ -1759,7 +1803,7 @@ Schema: expect(messages[1].role).to.equal( vscode.LanguageModelChatMessageRole.User ); - expect(messages[1].content).to.equal( + expect(getMessageContent(messages[1])).to.equal( 'give me the count of all people in the prod database' ); @@ -1772,15 +1816,15 @@ Schema: expect(stats.has_sample_documents).to.be.true; expect(stats.user_input_length).to.equal(chatRequestMock.prompt.length); expect(stats.total_message_length).to.equal( - messages[0].content.length + - messages[1].content.length + - messages[2].content.length + getContentLength(messages[0]) + + getContentLength(messages[1]) + + getContentLength(messages[2]) ); // The length of the user prompt length should be taken from the prompt supplied // by the user, even if we enhance it with sample docs and schema. expect(stats.user_input_length).to.be.lessThan( - messages[2].content.length + getContentLength(messages[2]) ); }); @@ -1812,20 +1856,22 @@ Schema: expect(messages[0].role).to.equal( vscode.LanguageModelChatMessageRole.Assistant ); - expect(messages[0].content).to.include('Amount of documents sampled: 3'); + expect(getMessageContent(messages[0])).to.include( + 'Amount of documents sampled: 3' + ); expect(messages[1].role).to.equal( vscode.LanguageModelChatMessageRole.User ); - expect(messages[1].content).to.include(databaseName); - expect(messages[1].content).to.include(collectionName); - expect(messages[1].content).to.include(schema); + expect(getMessageContent(messages[1])).to.include(databaseName); + expect(getMessageContent(messages[1])).to.include(collectionName); + expect(getMessageContent(messages[1])).to.include(schema); expect(stats.command).to.equal('schema'); expect(stats.has_sample_documents).to.be.false; expect(stats.user_input_length).to.equal(chatRequestMock.prompt.length); expect(stats.total_message_length).to.equal( - messages[0].content.length + messages[1].content.length + getContentLength(messages[0]) + getContentLength(messages[1]) ); }); @@ -1852,7 +1898,7 @@ Schema: expect(stats.has_sample_documents).to.be.false; expect(stats.user_input_length).to.equal(chatRequestMock.prompt.length); expect(stats.total_message_length).to.equal( - messages[0].content.length + messages[1].content.length + getContentLength(messages[0]) + getContentLength(messages[1]) ); }); @@ -1927,18 +1973,18 @@ Schema: expect(messages[1].role).to.equal( vscode.LanguageModelChatMessageRole.User ); - expect(messages[1].content).to.contain(expectedPrompt); + expect(getMessageContent(messages[1])).to.contain(expectedPrompt); expect(stats.command).to.equal('query'); expect(stats.has_sample_documents).to.be.false; expect(stats.user_input_length).to.equal(expectedPrompt.length); expect(stats.total_message_length).to.equal( - messages[0].content.length + messages[1].content.length + getContentLength(messages[0]) + getContentLength(messages[1]) ); // The prompt builder may add extra info, but we're only reporting the actual user input expect(stats.user_input_length).to.be.lessThan( - messages[1].content.length + getContentLength(messages[1]) ); }); }); From 10be1c6049a4fe7644905fa2f83fb00c8ce7fcaf Mon Sep 17 00:00:00 2001 From: Nikola Irinchev Date: Thu, 24 Oct 2024 11:52:30 +0200 Subject: [PATCH 3/3] VSCODE-639: Replace regex fragment matching with streaming KMP (#837) * Replace regex fragment matching with streaming KMP * Simplify content capturing * Add an explanation for FragmentMatcher * Handle multiple code blocks per fragment --- src/participant/streamParsing.ts | 240 +++++++++++++----- .../suite/participant/streamParsing.test.ts | 76 ++++++ 2 files changed, 246 insertions(+), 70 deletions(-) diff --git a/src/participant/streamParsing.ts b/src/participant/streamParsing.ts index 93bb5dad9..d8131a759 100644 --- a/src/participant/streamParsing.ts +++ b/src/participant/streamParsing.ts @@ -1,5 +1,166 @@ -function escapeRegex(str: string): string { - return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +// This is a stateful streaming implementation of the Knuth-Morris-Pratt algorithm +// for substring search. It supports being invoked with multiple fragments of the +// haystack and is capable of finding matches spanning multiple fragments. +class StreamingKMP { + public needle: string; + private _lookupVector: number[]; + + // In cases where we are fed a string that has a suffix that matches a prefix + // of the needle, we're storing the index in the needle which we last matched. + // Then when we get a new haystack, we start matching from that needle. + private _lastMatchingIndex = 0; + + constructor(needle: string) { + this.needle = needle; + this._lookupVector = this._createLookupVector(); + } + + private _createLookupVector(): number[] { + const vector = new Array(this.needle.length); + let j = 0; + vector[0] = 0; + + for (let i = 1; i < this.needle.length; i++) { + while (j > 0 && this.needle[i] !== this.needle[j]) { + j = vector[j - 1]; + } + + if (this.needle[i] === this.needle[j]) { + j++; + } + + vector[i] = j; + } + + return vector; + } + + // Returns the index in the haystackFragment **after** the needle. + // This is done because the match may have occurred over multiple fragments, + // so the index of the needle start would be negative. + public match(haystackFragment: string): number { + let j = this._lastMatchingIndex; // index in needle + let i = 0; // index in haystack + + while (i < haystackFragment.length) { + if (haystackFragment[i] === this.needle[j]) { + i++; + j++; + } + + if (j === this.needle.length) { + this._lastMatchingIndex = 0; + return i; + } + + if ( + i < haystackFragment.length && + haystackFragment[i] !== this.needle[j] + ) { + if (j !== 0) { + j = this._lookupVector[j - 1]; + } else { + i++; + } + } + } + + this._lastMatchingIndex = j; + return -1; + } + + public reset(): void { + this._lastMatchingIndex = 0; + } +} + +// This class is essentially a state machine that processes a stream of text fragments +// and emitting a callback with the content between each start and end identifier. The +// two states we have are: +// 1. "waiting for start identifier" - `_matchedContent === undefined` +// 2. "waiting for end identifier" - `_matchedContent !== undefined` +// with the state transitioning from one to the other when the corresponding identifier +// is matched in the fragment stream. +class FragmentMatcher { + private _startMatcher: StreamingKMP; + private _endMatcher: StreamingKMP; + private _matchedContent?: string; + private _onContentMatched: (content: string) => void; + private _onFragmentProcessed: (content: string) => void; + + constructor({ + identifier, + onContentMatched, + onFragmentProcessed, + }: { + identifier: { + start: string; + end: string; + }; + onContentMatched: (content: string) => void; + onFragmentProcessed: (content: string) => void; + }) { + this._startMatcher = new StreamingKMP(identifier.start); + this._endMatcher = new StreamingKMP(identifier.end); + this._onContentMatched = onContentMatched; + this._onFragmentProcessed = onFragmentProcessed; + } + + private _contentMatched(): void { + const content = this._matchedContent; + if (content !== undefined) { + // Strip the trailing end identifier from the matched content + this._onContentMatched( + content.slice(0, content.length - this._endMatcher.needle.length) + ); + } + + this._matchedContent = undefined; + this._startMatcher.reset(); + this._endMatcher.reset(); + } + + // This needs to be invoked every time before we call `process` recursively or when `process` + // completes processing the fragment. It will emit a notification to subscribers with the partial + // fragment we've processed, regardless of whether there's a match or not. + private _partialFragmentProcessed( + fragment: string, + index: number | undefined = undefined + ): void { + this._onFragmentProcessed( + index === undefined ? fragment : fragment.slice(0, index) + ); + } + + public process(fragment: string): void { + if (this._matchedContent === undefined) { + // We haven't matched the start identifier yet, so try and do that + const startIndex = this._startMatcher.match(fragment); + if (startIndex !== -1) { + // We found a match for the start identifier - update `_matchedContent` to an empty string + // and recursively call `process` with the remainder of the fragment. + this._matchedContent = ''; + this._partialFragmentProcessed(fragment, startIndex); + this.process(fragment.slice(startIndex)); + } else { + this._partialFragmentProcessed(fragment); + } + } else { + const endIndex = this._endMatcher.match(fragment); + if (endIndex !== -1) { + // We've matched the end - emit the matched content and continue processing the partial fragment + this._matchedContent += fragment.slice(0, endIndex); + this._partialFragmentProcessed(fragment, endIndex); + this._contentMatched(); + this.process(fragment.slice(endIndex)); + } else { + // We haven't matched the end yet - append the fragment to the matched content and wait + // for a future fragment to contain the end identifier. + this._matchedContent += fragment; + this._partialFragmentProcessed(fragment); + } + } + } } /** @@ -22,74 +183,13 @@ export async function processStreamWithIdentifiers({ end: string; }; }): Promise { - const escapedIdentifierStart = escapeRegex(identifier.start); - const escapedIdentifierEnd = escapeRegex(identifier.end); - const regex = new RegExp( - `${escapedIdentifierStart}([\\s\\S]*?)${escapedIdentifierEnd}`, - 'g' - ); - - let contentSinceLastIdentifier = ''; - for await (const fragment of inputIterable) { - contentSinceLastIdentifier += fragment; + const fragmentMatcher = new FragmentMatcher({ + identifier, + onContentMatched: onStreamIdentifier, + onFragmentProcessed: processStreamFragment, + }); - let lastIndex = 0; - let match: RegExpExecArray | null; - while ((match = regex.exec(contentSinceLastIdentifier)) !== null) { - const endIndex = regex.lastIndex; - - // Stream content up to the end of the identifier. - const contentToStream = contentSinceLastIdentifier.slice( - lastIndex, - endIndex - ); - processStreamFragment(contentToStream); - - const identifierContent = match[1]; - onStreamIdentifier(identifierContent); - - lastIndex = endIndex; - } - - if (lastIndex > 0) { - // Remove all of the processed content. - contentSinceLastIdentifier = contentSinceLastIdentifier.slice(lastIndex); - // Reset the regex. - regex.lastIndex = 0; - } else { - // Clear as much of the content as we can safely. - const maxUnprocessedLength = identifier.start.length - 1; - if (contentSinceLastIdentifier.length > maxUnprocessedLength) { - const identifierIndex = contentSinceLastIdentifier.indexOf( - identifier.start - ); - if (identifierIndex > -1) { - // We have an identifier, so clear up until the identifier. - const contentToStream = contentSinceLastIdentifier.slice( - 0, - identifierIndex - ); - processStreamFragment(contentToStream); - contentSinceLastIdentifier = - contentSinceLastIdentifier.slice(identifierIndex); - } else { - // No identifier, so clear up until the last maxUnprocessedLength. - const processUpTo = - contentSinceLastIdentifier.length - maxUnprocessedLength; - const contentToStream = contentSinceLastIdentifier.slice( - 0, - processUpTo - ); - processStreamFragment(contentToStream); - contentSinceLastIdentifier = - contentSinceLastIdentifier.slice(processUpTo); - } - } - } - } - - // Finish up anything not streamed yet. - if (contentSinceLastIdentifier.length > 0) { - processStreamFragment(contentSinceLastIdentifier); + for await (const fragment of inputIterable) { + fragmentMatcher.process(fragment); } } diff --git a/src/test/suite/participant/streamParsing.test.ts b/src/test/suite/participant/streamParsing.test.ts index 66208ecdd..96d274d00 100644 --- a/src/test/suite/participant/streamParsing.test.ts +++ b/src/test/suite/participant/streamParsing.test.ts @@ -216,4 +216,80 @@ suite('processStreamWithIdentifiers', () => { expect(fragmentsProcessed.join('')).to.deep.equal(inputFragments.join('')); expect(identifiersStreamed).to.deep.equal(['\ncode1\n', '\ncode2\n']); }); + + test('one fragment containing multiple code blocks emits event in correct order', async () => { + // In case we have one fragment containing multiple code blocks, we want to make sure that + // fragment notifications and identifier notifications arrive in the right order so that we're + // adding code actions after the correct subfragment. + // For example: + // 'Text before code.\n```js\ncode1\n```\nText between code.\n```js\ncode2\n```\nText after code.' + // + // should emit: + // + // processStreamFragment: 'Text before code.\n```js\ncode1\n```' + // onStreamIdentifier: '\ncode1\n' + // processStreamFragment: '\nText between code.\n```js\ncode2\n```' + // onStreamIdentifier: '\ncode2\n' + // processStreamFragment: '\nText after code.' + // + // in that order to ensure we add each code action immediately after the code block + // rather than add both at the end. + + const inputFragments = [ + 'Text before code.\n```js\ncode1\n```\nText between code.\n```js\ncode2\n```\nText after code.', + ]; + + const inputIterable = asyncIterableFromArray(inputFragments); + const identifier = { start: '```js', end: '```' }; + + const fragmentsEmitted: { + source: 'processStreamFragment' | 'onStreamIdentifier'; + content: string; + }[] = []; + + const getFragmentHandler = ( + source: 'processStreamFragment' | 'onStreamIdentifier' + ): ((fragment: string) => void) => { + return (fragment: string): void => { + // It's an implementation detail, but the way the code is structured today, we're splitting the emitted fragments + // whenever we find either a start or end identifier. This is irrelevant as long as we're emitting the entirety of + // the text until the end of the code block in `processStreamFragment` and then the code block itself in `onStreamIdentifier`. + // With the code below, we're combining all subfragments with the same source to make the test verify the desired + // behavior rather than the actual implementation. + const lastFragment = fragmentsEmitted[fragmentsEmitted.length - 1]; + if (lastFragment?.source === source) { + lastFragment.content += fragment; + } else { + fragmentsEmitted.push({ source, content: fragment }); + } + }; + }; + + await processStreamWithIdentifiers({ + processStreamFragment: getFragmentHandler('processStreamFragment'), + onStreamIdentifier: getFragmentHandler('onStreamIdentifier'), + inputIterable, + identifier, + }); + + expect(fragmentsEmitted).to.have.length(5); + expect(fragmentsEmitted[0].source).to.equal('processStreamFragment'); + expect(fragmentsEmitted[0].content).to.equal( + 'Text before code.\n```js\ncode1\n```' + ); + + expect(fragmentsEmitted[1].source).to.equal('onStreamIdentifier'); + expect(fragmentsEmitted[1].content).to.equal('\ncode1\n'); + + expect(fragmentsEmitted[2].source).to.equal('processStreamFragment'); + expect(fragmentsEmitted[2].content).to.equal( + '\nText between code.\n```js\ncode2\n```' + ); + + expect(fragmentsEmitted[3].source).to.equal('onStreamIdentifier'); + expect(fragmentsEmitted[3].content).to.equal('\ncode2\n'); + + expect(fragmentsEmitted[4].source).to.equal('processStreamFragment'); + expect(fragmentsEmitted[4].content).to.equal('\nText after code.'); + }); });