Skip to content

FIX: stabilize diff algorithm for streaming #1358

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 50 additions & 1 deletion assets/javascripts/discourse/lib/diff-streamer.gjs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import loadJSDiff from "discourse/lib/load-js-diff";
import { parseAsync } from "discourse/lib/text";

const DEFAULT_CHAR_TYPING_DELAY = 30;
const STREAMING_DIFF_TRUNCATE_THRESHOLD = 0.1;
const STREAMING_DIFF_TRUNCATE_BUFFER = 10;

export default class DiffStreamer {
@tracked isStreaming = false;
Expand Down Expand Up @@ -122,6 +124,53 @@ export default class DiffStreamer {
return maybeUnfinishedImage || maybeUnfinishedLink;
}

// this is public to make testing easier
// is makes it easier to do a "streaming diff" where we want to ensure diff
// is focused on the beginning of the text instead of taking the entire body
// into account.
// This ensures that we do not make mistakes and present wildly different diffs
// to what we would stablize on at the end of the stream.
streamingDiff(original, suggestion) {
const maxDiffLength = Math.floor(
suggestion.length +
suggestion.length * STREAMING_DIFF_TRUNCATE_THRESHOLD +
STREAMING_DIFF_TRUNCATE_BUFFER
);
const head = original.slice(0, maxDiffLength);
const tail = original.slice(maxDiffLength);

const diffArray = this.jsDiff.diffWordsWithSpace(head, suggestion);

if (tail.length > 0) {
// if last in the array is added, and previous is removed then flip them
let last = diffArray[diffArray.length - 1];
let secondLast = diffArray[diffArray.length - 2];

if (last.added && secondLast.removed) {
diffArray.pop();
diffArray.pop();
diffArray.push(last);
diffArray.push(secondLast);

last = secondLast;
secondLast = diffArray[diffArray.length - 2];
}

if (!last.removed) {
last = {
added: false,
removed: true,
value: "",
};
diffArray.push(last);
}

last.value = last.value + tail;
}

return diffArray;
}

async #streamNextChar() {
if (this.currentWordIndex < this.words.length) {
const currentToken = this.words[this.currentWordIndex];
Expand All @@ -134,7 +183,7 @@ export default class DiffStreamer {
this.currentWordIndex++;
this.currentCharIndex = 0;

const originalDiff = this.jsDiff.diffWordsWithSpace(
const originalDiff = this.streamingDiff(
this.selectedText,
this.suggestion
);
Expand Down
67 changes: 67 additions & 0 deletions test/javascripts/unit/lib/diff-streamer-test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import { module, test } from "qunit";
import DiffStreamer from "discourse/plugins/discourse-ai/discourse/lib/diff-streamer";

module("Unit | Lib | diff-streamer", function () {
test("streamingDiff correctly handles trivial cases", async function (assert) {
const originalText = "helo world";
const targetText = "hello world";

const diffStreamer = new DiffStreamer(this.originalTextContent);
await diffStreamer.loadJSDiff();

const diffResult = diffStreamer.streamingDiff(originalText, targetText);

const expectedDiff = [
{ count: 1, added: false, removed: true, value: "helo" },
{ count: 1, added: true, removed: false, value: "hello" },
{ count: 2, added: false, removed: false, value: " world" },
];

assert.deepEqual(
diffResult,
expectedDiff,
"Diff result should match the expected structure"
);
});

test("streamingDiff correctly consolidates and handles diff drift", async function (assert) {
const originalText =
"This is todone, but I want to can why.\n\nWe\n\nSEO Tags supports a `canonical_url` override. I tried a few possibilities there. The one I wanted to work, ex: `https://www.discourse.org/de`, appended an extra `/de` on the URL, only in the deploy b";
const targetText = "This is to-done";

const diffStreamer = new DiffStreamer(this.originalTextContent);
await diffStreamer.loadJSDiff();

const diffResult = diffStreamer.streamingDiff(originalText, targetText);

// Verify the diff result is an array with the expected structure
assert.true(Array.isArray(diffResult), "Diff result should be an array");
assert.strictEqual(
diffResult.length,
3,
"Expecting exactly three parts in the diff result"
);

assert.strictEqual(
diffResult[0].value,
"This is ",
"First part should be unchanged"
);

assert.strictEqual(
diffResult[1].value,
"to-done",
"Second part should be an insertion"
);

assert.true(diffResult[1].added, "Second part should be an insertion");

assert.strictEqual(
diffResult[2].value.length,
235,
"Third part should include all text"
);

assert.true(diffResult[2].removed, "Third part should be a removal");
});
});
Loading