Skip to content

Commit 55dab9c

Browse files
authored
FIX: stabilize diff algorithm for streaming (#1358)
Previous to this fix we would diff the entire body of text, this could lead to situations where a diff presented to a user was wildly off matching areas in the text that should not have been tested. New algorithm only checks a portion of the string, this ensures that during streaming there is no chance for wild mistakes
1 parent c29183f commit 55dab9c

File tree

2 files changed

+117
-1
lines changed

2 files changed

+117
-1
lines changed

assets/javascripts/discourse/lib/diff-streamer.gjs

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ import loadJSDiff from "discourse/lib/load-js-diff";
44
import { parseAsync } from "discourse/lib/text";
55

66
const DEFAULT_CHAR_TYPING_DELAY = 30;
7+
const STREAMING_DIFF_TRUNCATE_THRESHOLD = 0.1;
8+
const STREAMING_DIFF_TRUNCATE_BUFFER = 10;
79

810
export default class DiffStreamer {
911
@tracked isStreaming = false;
@@ -122,6 +124,53 @@ export default class DiffStreamer {
122124
return maybeUnfinishedImage || maybeUnfinishedLink;
123125
}
124126

127+
// this is public to make testing easier
128+
// is makes it easier to do a "streaming diff" where we want to ensure diff
129+
// is focused on the beginning of the text instead of taking the entire body
130+
// into account.
131+
// This ensures that we do not make mistakes and present wildly different diffs
132+
// to what we would stablize on at the end of the stream.
133+
streamingDiff(original, suggestion) {
134+
const maxDiffLength = Math.floor(
135+
suggestion.length +
136+
suggestion.length * STREAMING_DIFF_TRUNCATE_THRESHOLD +
137+
STREAMING_DIFF_TRUNCATE_BUFFER
138+
);
139+
const head = original.slice(0, maxDiffLength);
140+
const tail = original.slice(maxDiffLength);
141+
142+
const diffArray = this.jsDiff.diffWordsWithSpace(head, suggestion);
143+
144+
if (tail.length > 0) {
145+
// if last in the array is added, and previous is removed then flip them
146+
let last = diffArray[diffArray.length - 1];
147+
let secondLast = diffArray[diffArray.length - 2];
148+
149+
if (last.added && secondLast.removed) {
150+
diffArray.pop();
151+
diffArray.pop();
152+
diffArray.push(last);
153+
diffArray.push(secondLast);
154+
155+
last = secondLast;
156+
secondLast = diffArray[diffArray.length - 2];
157+
}
158+
159+
if (!last.removed) {
160+
last = {
161+
added: false,
162+
removed: true,
163+
value: "",
164+
};
165+
diffArray.push(last);
166+
}
167+
168+
last.value = last.value + tail;
169+
}
170+
171+
return diffArray;
172+
}
173+
125174
async #streamNextChar() {
126175
if (this.currentWordIndex < this.words.length) {
127176
const currentToken = this.words[this.currentWordIndex];
@@ -134,7 +183,7 @@ export default class DiffStreamer {
134183
this.currentWordIndex++;
135184
this.currentCharIndex = 0;
136185

137-
const originalDiff = this.jsDiff.diffWordsWithSpace(
186+
const originalDiff = this.streamingDiff(
138187
this.selectedText,
139188
this.suggestion
140189
);
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import { module, test } from "qunit";
2+
import DiffStreamer from "discourse/plugins/discourse-ai/discourse/lib/diff-streamer";
3+
4+
module("Unit | Lib | diff-streamer", function () {
5+
test("streamingDiff correctly handles trivial cases", async function (assert) {
6+
const originalText = "helo world";
7+
const targetText = "hello world";
8+
9+
const diffStreamer = new DiffStreamer(this.originalTextContent);
10+
await diffStreamer.loadJSDiff();
11+
12+
const diffResult = diffStreamer.streamingDiff(originalText, targetText);
13+
14+
const expectedDiff = [
15+
{ count: 1, added: false, removed: true, value: "helo" },
16+
{ count: 1, added: true, removed: false, value: "hello" },
17+
{ count: 2, added: false, removed: false, value: " world" },
18+
];
19+
20+
assert.deepEqual(
21+
diffResult,
22+
expectedDiff,
23+
"Diff result should match the expected structure"
24+
);
25+
});
26+
27+
test("streamingDiff correctly consolidates and handles diff drift", async function (assert) {
28+
const originalText =
29+
"This is todone, but I want to can why.\n\nWe\n\nSEO Tags supports a `canonical_url` override. I tried a few possibilities there. The one I wanted to work, ex: `https://www.discourse.org/de`, appended an extra `/de` on the URL, only in the deploy b";
30+
const targetText = "This is to-done";
31+
32+
const diffStreamer = new DiffStreamer(this.originalTextContent);
33+
await diffStreamer.loadJSDiff();
34+
35+
const diffResult = diffStreamer.streamingDiff(originalText, targetText);
36+
37+
// Verify the diff result is an array with the expected structure
38+
assert.true(Array.isArray(diffResult), "Diff result should be an array");
39+
assert.strictEqual(
40+
diffResult.length,
41+
3,
42+
"Expecting exactly three parts in the diff result"
43+
);
44+
45+
assert.strictEqual(
46+
diffResult[0].value,
47+
"This is ",
48+
"First part should be unchanged"
49+
);
50+
51+
assert.strictEqual(
52+
diffResult[1].value,
53+
"to-done",
54+
"Second part should be an insertion"
55+
);
56+
57+
assert.true(diffResult[1].added, "Second part should be an insertion");
58+
59+
assert.strictEqual(
60+
diffResult[2].value.length,
61+
235,
62+
"Third part should include all text"
63+
);
64+
65+
assert.true(diffResult[2].removed, "Third part should be a removal");
66+
});
67+
});

0 commit comments

Comments
 (0)