mongodb-js
diff --git a/‎package-lock.json
Lines changed: 506 additions & 4 deletions b/‎package-lock.json
Lines changed: 506 additions & 4 deletions
diff --git a/‎package.json
Lines changed: 6 additions & 0 deletions b/‎package.json
Lines changed: 6 additions & 0 deletions
diff --git a/‎tests/accuracy/list-databases.test.ts
Lines changed: 26 additions & 0 deletions b/‎tests/accuracy/list-databases.test.ts
Lines changed: 26 additions & 0 deletions
diff --git a/‎tests/accuracy/sdk/accuracy-scorers.ts
Lines changed: 125 additions & 0 deletions b/‎tests/accuracy/sdk/accuracy-scorers.ts
Lines changed: 125 additions & 0 deletions
diff --git a/‎tests/accuracy/sdk/describe-accuracy-tests.ts
Lines changed: 51 additions & 0 deletions b/‎tests/accuracy/sdk/describe-accuracy-tests.ts
Lines changed: 51 additions & 0 deletions
diff --git a/‎tests/accuracy/sdk/models.ts
Lines changed: 62 additions & 0 deletions b/‎tests/accuracy/sdk/models.ts
Lines changed: 62 additions & 0 deletions
@@ -35,6 +35,10 @@
   "devDependencies": {
     "@eslint/js": "^9.24.0",
     "@jest/globals": "^30.0.0",
+    "@langchain/core": "^0.3.61",
+    "@langchain/google-genai": "^0.2.14",
+    "@langchain/ollama": "^0.2.3",
+    "@langchain/openai": "^0.5.16",
     "@modelcontextprotocol/inspector": "^0.14.0",
     "@redocly/cli": "^1.34.2",
     "@types/jest": "^29.5.14",
@@ -49,6 +53,7 @@
     "jest": "^29.7.0",
     "jest-environment-node": "^29.7.0",
     "jest-extended": "^6.0.0",
+    "langchain": "^0.3.29",
     "mongodb-runner": "^5.8.2",
     "openapi-types": "^12.1.3",
     "openapi-typescript": "^7.6.1",
@@ -57,6 +62,7 @@
     "tsx": "^4.19.3",
     "typescript": "^5.8.2",
     "typescript-eslint": "^8.29.1",
+    "uuid": "^11.1.0",
     "yaml": "^2.7.1"
   },
   "dependencies": {
 
@@ -0,0 +1,26 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+
// Registers the accuracy scenarios of this file for every model reported as available.
// NOTE(review): the suite is labelled "list-databases", yet its only scenario expects a
// `list-collections` call — confirm whether the label or the scenario is the intended one.
describeAccuracyTests("list-databases", getAvailableModels(), [
    {
        prompt: "Assume that you're already connected. How many collections are there in sample_mflix database",
        // The scenario passes when the model asks for the collections of the database named in the prompt.
        expectedToolCalls: [{ toolName: "list-collections", parameters: { database: "sample_mflix" } }],
        // Result generators keyed by tool name; presumably served in place of the real tool — verify in test-tools.
        mockedTools: {
            "list-collections": function listCollections() {
                return { content: [{ type: "text", text: "Name: coll1" }] };
            },
        },
    },
]);
@@ -0,0 +1,125 @@
/** A single tool invocation as recorded during a test run. */
export type ToolCall = {
    /** Identifier of this particular invocation. */
    toolCallId: string;
    /** Name of the tool that was invoked. */
    toolName: string;
    /** Arguments the tool was invoked with; the shape depends on the tool, hence `unknown`. */
    parameters: unknown;
};

/** A tool invocation a scenario expects: a `ToolCall` without the per-invocation identifier. */
export type ExpectedToolCall = Pick<ToolCall, "toolName" | "parameters">;
+
/**
 * Scores whether the model invoked every expected tool, looking at tool names only.
 *
 * Each actual call can satisfy at most one expected call. Consumed calls are tracked by
 * their position in `actualToolCalls` rather than by `toolCallId`, so calls that share an
 * id (for example an empty or duplicated one) are still counted individually.
 *
 * @param expectedToolCalls Tool calls the scenario requires.
 * @param actualToolCalls Tool calls recorded during the run.
 * @returns `1` when every expected call was made and no additional call happened,
 *          `0.75` when every expected call was made alongside additional calls,
 *          `0` when at least one expected call is missing.
 */
export function toolCallingAccuracyScorer(expectedToolCalls: ExpectedToolCall[], actualToolCalls: ToolCall[]): number {
    // Fewer actual calls than expected ones can never cover all expectations.
    if (actualToolCalls.length < expectedToolCalls.length) {
        return 0;
    }

    const consumedIndexes = new Set<number>();
    for (const expectedToolCall of expectedToolCalls) {
        const matchIndex = actualToolCalls.findIndex(
            (actualToolCall, index) =>
                !consumedIndexes.has(index) && actualToolCall.toolName === expectedToolCall.toolName
        );

        if (matchIndex === -1) {
            return 0;
        }

        consumedIndexes.add(matchIndex);
    }

    // Additional, unexpected calls lower the score without failing it outright.
    return actualToolCalls.length > expectedToolCalls.length ? 0.75 : 1;
}
+
/**
 * Scores how closely the parameters of the actual tool calls match the expected ones.
 *
 * For every expected call, the not-yet-consumed actual call with the same tool name and the
 * highest `compareParams` score is chosen (the earliest one wins ties). Picking the best
 * candidate instead of the first one keeps a correct call from being scored against an
 * earlier, unrelated call to the same tool. Consumed calls are tracked by array position so
 * the pairing does not depend on `toolCallId` values being unique.
 *
 * @param expectedToolCalls Tool calls the scenario requires.
 * @param actualToolCalls Tool calls recorded during the run.
 * @returns The mean per-expected-call score in `[0, 1]`; `1` when nothing is expected.
 *          An expected call without any same-named actual call contributes `0`.
 */
export function parameterMatchingAccuracyScorer(
    expectedToolCalls: ExpectedToolCall[],
    actualToolCalls: ToolCall[]
): number {
    if (expectedToolCalls.length === 0) {
        return 1;
    }

    const consumedIndexes = new Set<number>();
    let totalScore = 0;

    for (const expectedToolCall of expectedToolCalls) {
        let bestIndex = -1;
        let bestScore = -1;

        actualToolCalls.forEach((actualToolCall, index) => {
            if (consumedIndexes.has(index) || actualToolCall.toolName !== expectedToolCall.toolName) {
                return;
            }
            const score = compareParams(expectedToolCall.parameters, actualToolCall.parameters);
            // Strict comparison keeps the earliest candidate when scores tie.
            if (score > bestScore) {
                bestScore = score;
                bestIndex = index;
            }
        });

        if (bestIndex === -1) {
            // No candidate at all: this expectation adds nothing to the total.
            continue;
        }

        consumedIndexes.add(bestIndex);
        totalScore += bestScore;
    }

    return totalScore / expectedToolCalls.length;
}
+
/**
 * Recursively compares expected and actual parameters and returns a score.
 * - 1: Perfect match.
 * - 0.75: All expected parameters are present and match, but there are extra actual
 *   parameters (extra object keys or extra trailing array items).
 * - 0: Missing parameters or mismatched values.
 *
 * Nested values are scored recursively and the lowest nested score wins. `null` and
 * `undefined` are treated as interchangeable. Primitives are compared loosely so that
 * `10` and `"10"` match, but only against other primitives: an array or object is never
 * accepted where a primitive is expected, which plain `==` would otherwise allow through
 * coercion (e.g. `"a" == ["a"]`).
 *
 * @param expected The parameter value the scenario expects.
 * @param actual The parameter value the model actually supplied.
 * @returns `1`, `0.75` or `0` as described above.
 */
function compareParams(expected: unknown, actual: unknown): number {
    if (expected === null || expected === undefined) {
        return actual === null || actual === undefined ? 1 : 0;
    }
    if (actual === null || actual === undefined) {
        return 0;
    }

    if (Array.isArray(expected)) {
        if (!Array.isArray(actual) || actual.length < expected.length) {
            return 0;
        }
        // Items are compared position by position; the weakest item decides the score.
        let minScore = 1;
        for (let i = 0; i < expected.length; i++) {
            minScore = Math.min(minScore, compareParams(expected[i], actual[i]));
            if (minScore === 0) {
                return 0;
            }
        }
        return actual.length > expected.length ? Math.min(minScore, 0.75) : minScore;
    }

    if (typeof expected === "object") {
        if (typeof actual !== "object" || Array.isArray(actual)) {
            return 0;
        }
        const expectedRecord = expected as Record<string, unknown>;
        const actualRecord = actual as Record<string, unknown>;
        const expectedKeys = Object.keys(expectedRecord);

        let minScore = 1;
        for (const key of expectedKeys) {
            if (!Object.prototype.hasOwnProperty.call(actualRecord, key)) {
                return 0;
            }
            minScore = Math.min(minScore, compareParams(expectedRecord[key], actualRecord[key]));
            if (minScore === 0) {
                return 0;
            }
        }

        // Every expected key is present at this point, so a larger key count means extras.
        return Object.keys(actualRecord).length > expectedKeys.length ? Math.min(minScore, 0.75) : minScore;
    }

    // `expected` is a primitive here; loose equality would coerce arrays and objects to
    // strings or numbers and report a false match, so reject non-primitives first.
    if (typeof actual === "object" || typeof actual === "function") {
        return 0;
    }

    // eslint-disable-next-line eqeqeq
    return expected == actual ? 1 : 0;
}
@@ -0,0 +1,51 @@
+import { AgentExecutor } from "langchain/agents";
+import { Tool } from "@modelcontextprotocol/sdk/types.js";
+import { discoverMongoDBTools, TestTools, ToolResultGenerators } from "./test-tools.js";
+import { TestableModels } from "./models.js";
+import { getToolCallingAgent } from "./tool-calling-agent.js";
+import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from "./accuracy-scorers.js";
+
/** One accuracy scenario: what to ask the model and which tool calls should follow. */
interface AccuracyTestConfig {
    /** Text passed to the agent as its input; also interpolated into the test title. */
    prompt: string;
    /** Result generators handed to the test tools before the agent is invoked. */
    mockedTools: ToolResultGenerators;
    /** Tool calls the recorded calls are scored against. */
    expectedToolCalls: ExpectedToolCall[];
}
+
/**
 * Registers one Jest `describe` block per model and, inside each, one test per scenario.
 *
 * Every test installs the scenario's mocked tools, invokes the agent with the prompt, and
 * scores the recorded tool calls. A test passes when the tool-calling score is non-zero and
 * the parameter-matching score is at least 0.5.
 *
 * When there are no models or no scenarios, a skipped placeholder suite is registered
 * instead, because Jest's `.each` reports an error for an empty table.
 *
 * @param suiteName Label appended to the model name in each `describe` title.
 * @param models Models to run the scenarios against.
 * @param accuracyTestConfigs Scenarios to run for every model.
 */
export function describeAccuracyTests(
    suiteName: string,
    models: TestableModels,
    accuracyTestConfigs: AccuracyTestConfig[]
): void {
    if (models.length === 0 || accuracyTestConfigs.length === 0) {
        describe.skip(`${suiteName} (no available models or no test cases)`, () => {
            it("has nothing to run", () => {
                // Intentionally empty: keeps the file from being reported as having no tests.
            });
        });
        return;
    }

    const eachModel = describe.each(models);
    const eachTest = it.each(accuracyTestConfigs);

    eachModel(`$modelName - ${suiteName}`, function (model) {
        let mcpTools: Tool[];
        let testTools: TestTools;
        let agent: AgentExecutor;

        // Tool discovery happens once per model suite.
        beforeAll(async () => {
            mcpTools = await discoverMongoDBTools();
        });

        // Fresh test tools and a fresh agent are created for every test.
        beforeEach(() => {
            testTools = new TestTools(mcpTools);
            const transformToolResult = model.transformToolResult.bind(model);
            agent = getToolCallingAgent(model, testTools.langChainTools(transformToolResult));
        });

        eachTest("$prompt", async function (testConfig) {
            testTools.mockTools(testConfig.mockedTools);
            const conversation = await agent.invoke({ input: testConfig.prompt });
            const toolCalls = testTools.getToolCalls();
            const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls);
            const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls);

            try {
                expect(toolCallingAccuracy).not.toEqual(0);
                expect(parameterMatchingAccuracy).toBeGreaterThanOrEqual(0.5);
            } catch (error) {
                // Print the diagnostics only when an expectation fails, then let the failure through.
                console.error("Accuracy expectations not met", {
                    prompt: testConfig.prompt,
                    expectedToolCalls: testConfig.expectedToolCalls,
                    actualToolCalls: toolCalls,
                    toolCallingAccuracy,
                    parameterMatchingAccuracy,
                    conversation,
                });
                throw error;
            }
        });
    });
}
@@ -0,0 +1,62 @@
+import { BaseChatModel } from "@langchain/core/language_models/chat_models";
+import { ChatGoogleGenerativeAI } from "@langchain/google-genai";
+import { ChatOllama } from "@langchain/ollama";
+import { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
+
/** Tool results are handed to Ollama models as a plain string. */
type ToolResultForOllama = string;

/** Every shape a tool result may take once a model has transformed it. */
export type AcceptableToolResponse = CallToolResult | ToolResultForOllama;

/**
 * Contract implemented by every model the accuracy tests can run against.
 *
 * `M` is the LangChain chat model type produced by the implementation;
 * `T` is the shape tool results are converted to for that model.
 */
export interface Model<M extends BaseChatModel = BaseChatModel, T extends AcceptableToolResponse = CallToolResult> {
    /** Builds the LangChain chat model for this entry. */
    getLangChainModel(): M;

    /** Converts an MCP tool result into the shape `T` used for this model. */
    transformToolResult(callToolResult: CallToolResult): T;

    /** Reports whether this model can be used in the current environment. */
    isAvailable(): boolean;
}
+
/** Gemini chat model configured through the `MDB_GEMINI_API_KEY` environment variable. */
export class GeminiModel implements Model<ChatGoogleGenerativeAI> {
    /** Model identifier passed to the Gemini client. */
    readonly modelName: string;

    constructor(modelName: string) {
        this.modelName = modelName;
    }

    /** The model is usable only when a non-empty API key is present in the environment. */
    isAvailable(): boolean {
        const apiKey = process.env.MDB_GEMINI_API_KEY;
        return Boolean(apiKey);
    }

    /** Creates a new Gemini chat client for `modelName` using the API key from the environment. */
    getLangChainModel(): ChatGoogleGenerativeAI {
        const apiKey = process.env.MDB_GEMINI_API_KEY;
        return new ChatGoogleGenerativeAI({ model: this.modelName, apiKey });
    }

    /** Tool results are returned unchanged. */
    transformToolResult(callToolResult: CallToolResult): CallToolResult {
        return callToolResult;
    }
}
+
/** Ollama chat model; the client is created from the model name alone, without any API key. */
export class OllamaModel implements Model<ChatOllama, ToolResultForOllama> {
    constructor(readonly modelName: string) {}

    /**
     * Always reports the model as available.
     *
     * The Ollama client built below takes no API key, so availability must not hinge on
     * `MDB_GEMINI_API_KEY`, which belongs to the Gemini model.
     * NOTE(review): this does not check that an Ollama server is actually reachable.
     */
    isAvailable(): boolean {
        return true;
    }

    /** Creates a new Ollama chat client for `modelName`. */
    getLangChainModel(): ChatOllama {
        return new ChatOllama({
            model: this.modelName,
        });
    }

    /** Serializes the tool result to a JSON string, the shape used for Ollama models. */
    transformToolResult(callToolResult: CallToolResult): ToolResultForOllama {
        return JSON.stringify(callToolResult);
    }
}
+
/** Candidate models for the accuracy tests; the Gemini entries are currently disabled. */
const ALL_TESTABLE_MODELS = [
    // new GeminiModel("gemini-1.5-flash"),
    // new GeminiModel("gemini-2.0-flash"),
    new OllamaModel("qwen3:latest"),
];

/** The list type produced by `getAvailableModels`. */
export type TestableModels = ReturnType<typeof getAvailableModels>;

/** Returns a new list holding the candidate models that report themselves as available. */
export function getAvailableModels() {
    const usableModels: typeof ALL_TESTABLE_MODELS = [];
    for (const candidate of ALL_TESTABLE_MODELS) {
        if (candidate.isAvailable()) {
            usableModels.push(candidate);
        }
    }
    return usableModels;
}